r - パフォーマンスの向上

Question

遅延/転送に基づいて関数アプリケーションを取得しようとしています。私は広範囲に使用してdata.tableおり、機能するコードもありますが、その能力を知ってdata.tableいると、おそらくパフォーマンスを向上させて同じことを達成するためのより簡単な方法が必要だと思います(関数内で変数をたくさん作成します)。以下は関数の作業コードです ( https://gist.github.com/tomaskrehlik/5262087#file-gistfile1-rで入手可能)

# Lag-function lags the given variable by the date_variable

lag_variable <- function(data, variable, lags, date_variable = c("Date")) {
    if (lags == 0) {
      return(data)
    }
    if (lags>0) {
      name <- "lag"
    } else {
      name <- "forward"
    }
    require(data.table)
    setkeyv(data, date_variable)
    if (lags>0) {
      data[,index:=seq(1:.N)]  
    } else {
      data[,index:=rev(seq(1:.N))]
    }
    setkeyv(data, "index")
    lags <- abs(lags)
    position <- which(names(data)==variable)
    for ( j in 1:lags ) {
      lagname <- paste(variable,"_",name,j,sep="")
      lag <- paste("data[, ",lagname,":=data[list(index-",j,"), ",variable,", roll=TRUE][[",position,"L]]]", sep = "")
      eval(parse( text = lag ))
    }
    setkeyv(data, date_variable)
    data[,index:=NULL]
}

# window_func applies the function to the lagged or forwarded variables created by lag_variable
window_func <- function(data, func.name, variable, direction = "window", steps, date_variable = c("Date"), clean = TRUE) {
    require(data.table)
    require(stringr)
    transform <- match.fun(func.name)
    l <- length(names(data))
    if (direction == "forward") {
      lag_variable(data, variable, -steps, date_variable)
      cols <- which((!(is.na(str_match(names(a), paste(variable,"_forward(",paste(1:steps,collapse="|"),")",sep=""))[,1])))*1==1)
    } else {
      if (direction == "backward") {
        lag_variable(data, variable, steps, date_variable)
        cols <- which((!(is.na(str_match(names(a), paste(variable,"_lag(",paste(1:steps,collapse="|"),")",sep=""))[,1])))*1==1)
      } else {
        if (direction == "window") {
          lag_variable(data, variable, -steps, date_variable)
          lag_variable(data, variable, steps, date_variable)
          cols <- which((!(is.na(str_match(names(a), paste(variable,"_lag(",paste(1:steps,collapse="|"),")",sep=""))[,1])))*1==1)
          cols <- c(cols,which((!(is.na(str_match(names(a), paste(variable,"_forward(",paste(1:steps,collapse="|"),")",sep=""))[,1])))*1==1))
        } else {
          stop("The direction must be either backward, forward or window.")
        }
      }
    }
    data[,transf := apply(data[,cols, with=FALSE], 1, transform)]
    if (clean) {
      data[,cols:=NULL,with=FALSE]
    }
    return(data)
}

# Typical use:
# I have a data.table DT with variables Date (class IDate), value1, value2
# I want to get cumulative sum of next five days
# window_func(DT, "sum", "value1", direction = "forward", steps = 5)

編集:サンプルデータは次の方法で作成できます。

a <- data.table(Date = 1:1000, value = rnorm(1000))

各 Date (ここでは例として整数ですが、あまり重要ではありません) について、次の 10 個の観測値の合計を作成したいと思います。コードを実行して出力を取得するには、次のようにします。

window_func(data = a, func.name = "sum", variable = "value", 
      direction = "forward", steps = 10, date_variable = "Date", clean = TRUE)

この関数は最初に変数を取得し、( function を使用して) 10 個の遅延変数を作成しlag_variable、次に関数を列ごとに適用して、それ自体をクリーンアップします。コードが肥大化するのは、ウィンドウと呼ばれるラグ観測、フォワード観測、および両方で関数を使用する必要がある場合があるためです。

これをより適切に実装する方法について何か提案はありますか? 私のコードはどういうわけか大きすぎるようです。

score 5 · Accepted Answer

関数の残りの部分についてはわかりませんが、次のように遅れた合計をかなり効率的に取得できます。

a[ , lagSum := 
       a[, list(sum=sum(value)), by=list(grp=(Date+lag-i) %/% lag)] [grp!=0, sum]
   , by=list(i=Date %% lag)]

例えば：

set.seed(1)
a[ , lagSum := 
       a[, list(sum=sum(value)), by=list(grp=(Date+lag-i) %/% lag)] [grp!=0, sum]
   , by=list(i=Date %% lag)]

> a
      Date      value      lagSum
   1:    1 -0.6264538  1.32202781
   2:    2  0.1836433  3.46026279
   3:    3 -0.8356286  3.66646270
   4:    4  1.5952808  3.88085074
   5:    5  0.3295078  0.07087005
  ---                            
 996:  996 -0.3132929 -3.79332038
 997:  997 -0.8806707 -3.48002750
 998:  998 -0.4192869 -2.59935677
 999:  999 -1.4827517 -2.18006988
1000: 1000 -0.6973182 -1.88854602

正しい値の確認:

# first n values
n <- 5
for (i in seq(n))
  a[seq(i, length.out=10), print(sum(value))]

#  [1] 1.322028
#  [1] 3.460263
#  [1] 3.666463
#  [1] 3.880851
#  [1] 0.07087005

ベンチマーク (対 for ループのため、あまり公平ではありません)

set.seed(1)
a <- data.table(Date = 1:1000, value = rnorm(1000))

system.time({    a[ , lagSum := 
           a[, list(sum=sum(value)), by=list(grp=(Date+lag-i) %/% lag)] [grp!=0, sum]
       , by=list(i=Date %% lag)]
})

#  user  system elapsed 
# 0.049   0.001   0.056 



set.seed(1)
a <- data.table(Date = 1:1000, value = rnorm(1000))

system.time({    for (i in seq(nrow(a)-lag+1))
      a[seq(i, length.out=10), lagSum := sum(value)]})

#  user  system elapsed 
# 1.526   0.019   2.220

r - パフォーマンスの向上

1 に答える 1

ベンチマーク (対 for ループのため、あまり公平ではありません)

Related

Reference