r - 大規模なデータセットでの行単位の操作

Question

以下の操作を実現するためのより高速な方法を探しています。データセットには 100 万行を超える行が含まれていますが、タスクを説明するために簡単な例を提供しました --

To create the data table --

dt <- data.table(name=c("john","jill"), a1=c(1,4), a2=c(2,5), a3=c(3,6), 
      b1=c(10,40), b2=c(20,50), b3=c(30,60))

colGroups <- c("a","b")   # Columns starting in "a", and in "b"

Original Dataset
-----------------------------------
name    a1   a2   a3   b1   b2   b3
john    1    2    3    10   20   30
jill    4    5    6    40   50   60

上記のデータセットは、一意の名前ごとに 2 つの新しい行が追加され、各行で値が列のグループごとに個別に左にシフトされるように変換されます (この例では、a 列と b 列を使用しましたが、さらに多くの列があります)。

Transformed Dataset
-----------------------------------
name    a1   a2   a3   b1   b2   b3
john    1    2    3    10   20   30  # First Row for John
john    2    3    0    20   30    0  # "a" values left shifted, "b" values left shifted
john    3    0    0    30   0     0  # Same as above, left-shifted again

jill    4    5    6    40   50   60  # Repeated for Jill
jill    5    6    0    50   60    0 
jill    6    0    0    60    0    0

等々。私のデータセットは非常に大きいため、これを実装する効率的な方法があるかどうかを確認しようとしています。

前もって感謝します。

score 5 · Accepted Answer

更新: (はるかに) 高速な解決策は、次のようにインデックスを使用することです (1e6*7 で約 4 秒かかります)。

ll <- vector("list", 3)
ll[[1]] <- copy(dt[, -1])
d_idx <- seq(2, ncol(dt), by=3)
for (j in 1:2) {
    tmp <- vector("list", 2)
    for (i in seq_along(colGroups)) {
        idx <- ((i-1)*3+2):((i*3)+1)
        cols <- setdiff(idx, d_idx[i]:(d_idx[i]+j-1))
        # ..cols means "look up one level"
        tmp[[i]] <- cbind(dt[, ..cols], data.table(matrix(0, ncol=j)))
    }
    ll[[j+1]] <- do.call(cbind, tmp)
}
ans <- cbind(data.table(name=dt$name), rbindlist(ll))
setkey(ans, name)

最初の試み (旧): 非常に興味深い問題です。melt.data.table次のように (1.8.11 から) andを使用してアプローチしdcast.data.tableます。

require(data.table)
require(reshape2)
# melt is S3 generic, calls melt.data.table, returns a data.table (very fast)
ans <- melt(dt, id=1, measure=2:7, variable.factor=FALSE)[, 
                    grp := rep(colGroups, each=nrow(dt)*3)]
setkey(ans, name, grp)
ans <- ans[, list(variable=c(variable, variable[1:(.N-1)], 
          variable[1:(.N-2)]), value=c(value, value[-1],
     value[-(1:2)]), id2=rep.int(1:3, 3:1)), list(name, grp)]
# dcast in reshape2 is not yet a S3 generic, have to call by full name
ans <- dcast.data.table(ans, name+id2~variable, fill=0L)[, id2 := NULL]

同じ数の列を持つ 1e6 行のベンチマーク:

require(data.table)
require(reshape2)
set.seed(45)
N <- 1e6
dt <- cbind(data.table(name=paste("x", 1:N, sep="")), 
               matrix(sample(10, 6*N, TRUE), nrow=N))
setnames(dt, c("name", "a1", "a2", "a3", "b1", "b2", "b3"))
colGroups = c("a", "b")

system.time({
ans <- melt(dt, id=1, measure=2:7, variable.factor=FALSE)[, 
                    grp := rep(colGroups, each=nrow(dt)*3)]
setkey(ans, name, grp)
ans <- ans[, list(variable=c(variable, variable[1:(.N-1)], 
          variable[1:(.N-2)]), value=c(value, value[-1],
     value[-(1:2)]), id2=rep.int(1:3, 3:1)), list(name, grp)]
ans <- dcast.data.table(ans, name+id2~variable, fill=0L)[, id2 := NULL]

})

#   user  system elapsed 
# 45.627   2.197  52.051

score 1 · Accepted Answer

選択した回答の @Arun (s) コードを編集するだけです。コメント欄に書き込めないのでこちらで。

#Parameterized version of @Arun (author) code (in the selected answer)

#Shifting Columns in R
#--------------------------------------------
N = 5  # SET - Number of unique names
set.seed(5)
colGroups <- c("a","b") # ... (i) # SET colGroups
totalColsPerGroup <- 10 # SET Cols Per Group
numColsToLeftShift <- 8 # SET Cols to Shift

lenColGroups <- length(colGroups) # ... (ii)

# From (i) and (ii)
totalCols = lenColGroups * totalColsPerGroup


dt <- cbind(data.table(name=paste("x", 1:N, sep="")), 
            matrix(sample(5, totalCols*N, TRUE), nrow=N)) # Change 5 if needed

ll <- vector("list", numColsToLeftShift)
ll[[1]] <- copy(dt[, -1, with=FALSE])
d_idx <- seq(2, ncol(dt), by=totalColsPerGroup)
for (j in 1:(numColsToLeftShift)) {
  tmp <- vector("list", 2)
  for (i in seq_along(colGroups)) {
    idx <- ((i-1)*totalColsPerGroup+2):((i*totalColsPerGroup)+1) #OK
    tmp[[i]] <- cbind(dt[, setdiff(idx, d_idx[i]:(d_idx[i]+j-1)), 
                         with=FALSE], data.table(matrix(0, ncol=j)))

  }      
  ll[[j+1]] <- do.call(cbind, tmp)

}
ans <- cbind(data.table(name=dt$name), rbindlist(ll))
setkey(ans, name)

--

r - 大規模なデータセットでの行単位の操作

3 に答える 3

Related

Reference