r - 複雑な形状変更

Question

データフレームの形状をロングフォーマットからワイドフォーマットに変更したいのですが、保持したいデータを失います。次の例の場合：

df <- data.frame(Par1 = unlist(strsplit("AABBCCC","")),
                 Par2 = unlist(strsplit("DDEEFFF","")),
                 ParD = unlist(strsplit("foo,bar,baz,qux,bla,xyz,meh",",")),
                 Type = unlist(strsplit("pre,post,pre,post,pre,post,post",",")),
                 Val = c(10,20,30,40,50,60,70))

   #     Par1 Par2 ParD Type Val
   #   1    A    D  foo  pre  10
   #   2    A    D  bar post  20
   #   3    B    E  baz  pre  30
   #   4    B    E  qux post  40
   #   5    C    F  bla  pre  50
   #   6    C    F  xyz post  60
   #   7    C    F  meh post  70

dfw <- dcast(df,
             formula = Par1 + Par2 ~ Type,
             value.var = "Val",
             fun.aggregate = mean)

 #     Par1 Par2 post pre
 #   1    A    D   20  10
 #   2    B    E   40  30
 #   3    C    F   65  50

これはほとんど私が必要としているものですが、私は持っていたいです

フィールドからのデータを保持するいくつかのフィールドParD（たとえば、単一のマージされた文字列として）、
集計に使用された観測値の数。

つまり、結果のdata.frameを次のようにします。

    #     Par1 Par2 post pre Num.pre Num.post ParD
    #   1    A    D   20  10      1      1    foo_bar 
    #   2    B    E   40  30      1      1    baz_qux
    #   3    C    F   65  50      1      2    bla_xyz_meh

どんなアイデアでもありがたいです。たとえば、dcastで次のように記述して2番目のタスクを解決しようとしましたfun.aggregate=function(x) c(Val=mean(x),Num=length(x))が、これによりエラーが発生します。

score 14 · Accepted Answer

パーティーに遅れましたが、次を使用した別の方法がありますdata.table:

require(data.table)
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]), 
          post=mean(Val[Type == "post"]), 
          pre.num=length(Val[Type == "pre"]), 
          post.num=length(Val[Type == "post"]), 
          ParD = paste(ParD, collapse="_")), 
by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

[マシューから] +1 同じの繰り返しを保存し==、内部のローカル変数を示すためのいくつかのマイナーな改善j。

dt[, list(pre=mean(Val[.pre <- Type=="pre"]),     # save .pre
          post=mean(Val[.post <- Type=="post"]),  # save .post
          pre.num=sum(.pre),                      # reuse .pre
          post.num=sum(.post),                    # reuse .post
          ParD = paste(ParD, collapse="_")), 
by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

dt[, { .pre <- Type=="pre"                  # or save .pre and .post up front 
       .post <- Type=="post"
       list(pre=mean(Val[.pre]), 
            post=mean(Val[.post]),
            pre.num=sum(.pre),
            post.num=sum(.post), 
            ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

そして、list列が a ではなくOKの場合paste、これはより高速になるはずです:

dt[, { .pre <- Type=="pre"
       .post <- Type=="post"
       list(pre=mean(Val[.pre]), 
            post=mean(Val[.post]),
            pre.num=sum(.pre),
            post.num=sum(.post), 
            ParD = list(ParD)) }     # list() faster than paste()
, by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo,bar
# 2:    B    E  30   40       1        1     baz,qux
# 3:    C    F  50   65       1        2 bla,xyz,meh

score 13 · Accepted Answer

を使用して2つのステップで解決しますddply （私は満足していませんが、結果は得られます）

dat <- ddply(df,.(Par1,Par2),function(x){
  data.frame(ParD=paste(paste(x$ParD),collapse='_'),
             Num.pre =length(x$Type[x$Type =='pre']),
             Num.post = length(x$Type[x$Type =='post']))
})

merge(dfw,dat)
 Par1 Par2 post pre        ParD Num.pre Num.post
1    A    D  2.0   1     foo_bar       1        1
2    B    E  4.0   3     baz_qux       1        1
3    C    F  6.5   5 bla_xyz_meh       1        2

score 6 · Accepted Answer

私は投稿しますが、agstudyは私を恥ずかしく思います：

step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
    piece1 <- tapply(x$Val, x$Type, mean)
    piece2 <- tapply(x$Type, x$Type, length)
    names(piece2) <- paste0("Num.", names(piece2))
    out <- x[1, 1:2]
    out[, 3:6] <- c(piece1, piece2)
    names(out)[3:6] <-  names(c(piece1, piece2))
    out$ParD <- paste(unique(x$ParD), collapse="_")
    out
})
data.frame(do.call(rbind, step3), row.names=NULL)

降伏：

  Par1 Par2 post pre Num.post Num.pre        ParD
1    A    D  2.0   1        1       1     foo_bar
2    B    E  4.0   3        1       1     baz_qux
3    C    F  6.5   5        2       1 bla_xyz_meh

score 6 · Accepted Answer

2 つの dcast と 1 つの集約のマージを行うことができます。ここでは、すべてが 1 つの大きな式にラップされ、後で中間オブジェクトがぶらぶらするのを避けることができます。

Reduce(merge, list(
    dcast(df, formula = Par1+Par2~Type, value.var="Val",
        fun.aggregate=mean),
    setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
        fun.aggregate=length), c("Par1", "Par2", "Num.post",
        "Num.pre")),
    aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
    ))

score 6 · Accepted Answer

ベンチマークする絶好の機会です！以下は、異なるサンプルサイズ (N = 900、2700、10800) を使用したメソッド ( plyr@Arun が提案) と比較したメソッド (@agstudy が提案) のいくつかの実行です。data.table

要約:
このdata.table方法は、plyr方法よりも 7.5 倍性能が優れています。

#-------------------#
#   M E T H O D S   #
#-------------------#

  # additional methods below, in the updates

  # Method 1  -- suggested by @agstudy
  plyrMethod <- quote({
                  dfw<-dcast(df,
                         formula = Par1+Par2~Type,
                         value.var="Val",
                         fun.aggregate=mean)
                  dat <- ddply(df,.(Par1,Par2),function(x){
                    data.frame(ParD=paste(paste(x$ParD),collapse='_'),
                               Num.pre =length(x$Type[x$Type =='pre']),
                               Num.post = length(x$Type[x$Type =='post']))
                  })
                  merge(dfw,dat)
                })

  # Method 2 -- suggested by @Arun
  dtMethod <- quote(
                dt[, list(pre=mean(Val[Type == "pre"]), 
                          post=mean(Val[Type == "post"]), 
                          Num.pre=length(Val[Type == "pre"]), 
                          Num.post=length(Val[Type == "post"]), 
                          ParD = paste(ParD, collapse="_")), 
                by=list(Par1, Par2)]
              ) 

 # Method 3 -- suggested by @regetz
 reduceMethod <- quote(
                  Reduce(merge, list(
                      dcast(df, formula = Par1+Par2~Type, value.var="Val",
                          fun.aggregate=mean),
                      setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
                          fun.aggregate=length), c("Par1", "Par2", "Num.post",
                          "Num.pre")),
                      aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
                      ))
                  )

 # Method 4 -- suggested by @Ramnath
 castddplyMethod <- quote(
                      reshape::cast(Par1 + Par2 + ParD ~ Type, 
                           data = ddply(df, .(Par1, Par2), transform, 
                           ParD = paste(ParD, collapse = "_")), 
                           fun  = c(mean, length)
                          )
                      )



# SAMPLE DATA #
#-------------#

library(data.table)
library(plyr)
library(reshape2)
library(rbenchmark)


  # for Par1, ParD
  LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
  lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")

  # max size is 17568 with current sample data setup, ie: floor(length(LLL) / 18) * 18
  size <- 17568
  size <- 10800
  size <- 900  

  set.seed(1)
  df<-data.frame(Par1=rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
                 , Par2=rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
                 , ParD=sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE)
                 , Type=rep(c("pre","post"), size/2)
                 , Val =sample(seq(10,100,10), size, TRUE)
                 )

  dt <- data.table(df, key=c("Par1", "Par2"))


# Confirming Same Results # 
#-------------------------#
  # Evaluate
  DF1 <- eval(plyrMethod)
  DF2 <- eval(dtMethod)

  # Convert to DF and sort columns and sort ParD levels, for use in identical
  colOrder <- sort(names(DF1))
  DF1 <- DF1[, colOrder]
  DF2 <- as.data.frame(DF2)[, colOrder]
  DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
  identical((DF1), (DF2))
  # [1] TRUE
#-------------------------#

結果

#--------------------#
#     BENCHMARK      #
#--------------------#
benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod), reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
          replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), 
          order="relative")


# SAMPLE SIZE = 900
  relative      test elapsed user.self sys.self replications
     1.000    reduce   0.392     0.375    0.018            5
     1.003        dt   0.393     0.377    0.016            5
     7.064      plyr   2.769     2.721    0.047            5
     8.003 castddply   3.137     3.030    0.106            5

# SAMPLE SIZE = 2,700
  relative   test elapsed user.self sys.self replications
     1.000     dt   1.371     1.327    0.090            5
     2.205 reduce   3.023     2.927    0.102            5
     7.291   plyr   9.996     9.644    0.377            5

# SAMPLE SIZE = 10,800
  relative      test elapsed user.self sys.self replications
     1.000        dt   8.678     7.168    1.507            5
     2.769    reduce  24.029    23.231    0.786            5
     6.946      plyr  60.277    52.298    7.947            5
    13.796 castddply 119.719   113.333   10.816            5

# SAMPLE SIZE = 17,568
  relative   test elapsed user.self sys.self replications
     1.000     dt  27.421    13.042   14.470            5
     4.030 reduce 110.498    75.853   34.922            5
     5.414   plyr 148.452   105.776   43.156            5

更新: baseMethod1 の結果を追加

# Used only sample size of 90, as it was taking long
relative  test elapsed user.self sys.self replications
   1.000    dt   0.044     0.043    0.001            5
   7.773  plyr   0.342     0.339    0.003            5
  65.614 base1   2.887     2.866    0.028            5

Where
   baseMethod1 <- quote({
                  step1 <- with(df, split(df, list(Par1, Par2)))
                  step2 <- step1[sapply(step1, nrow) > 0]
                  step3 <- lapply(step2, function(x) {
                      piece1 <- tapply(x$Val, x$Type, mean)
                      piece2 <- tapply(x$Type, x$Type, length)
                      names(piece2) <- paste0("Num.", names(piece2))
                      out <- x[1, 1:2]
                      out[, 3:6] <- c(piece1, piece2)
                      names(out)[3:6] <-  names(c(piece1, piece2))
                      out$ParD <- paste(unique(x$ParD), collapse="_")
                      out
                  })
                  data.frame(do.call(rbind, step3), row.names=NULL)
                })

更新 2: メトリックの一部として DT のキーイングを追加

@MatthewDowle のコメントに従って、公平性のベンチマークにインデックス作成ステップを追加します。
ただし、おそらく、data.table が使用されている場合、data.frame の代わりに使用されるため、この手順だけではなく、インデックス作成が 1 回行われます。

   dtMethod.withkey <- quote({
                       dt <- data.table(df, key=c("Par1", "Par2"))       
                       dt[, list(pre=mean(Val[Type == "pre"]), 
                                 post=mean(Val[Type == "post"]), 
                                 Num.pre=length(Val[Type == "pre"]), 
                                 Num.post=length(Val[Type == "post"]), 
                                 ParD = paste(ParD, collapse="_")), 
                       by=list(Par1, Par2)]
                     }) 

# SAMPLE SIZE = 10,800
  relative       test elapsed user.self sys.self replications
     1.000         dt   9.155     7.055    2.137            5
     1.043 dt.withkey   9.553     7.245    2.353            5
     3.567     reduce  32.659    31.196    1.586            5
     6.703       plyr  61.364    54.080    7.600            5

更新 3: @Arun の元の回答に対する @MD の編集のベンチマーク

dtMethod.MD1 <- quote(
                  dt[, list(pre=mean(Val[.pre <- Type=="pre"]),     # save .pre
                            post=mean(Val[.post <- Type=="post"]),  # save .post
                            pre.num=sum(.pre),                      # reuse .pre
                            post.num=sum(.post),                    # reuse .post
                            ParD = paste(ParD, collapse="_")), 
                     by=list(Par1, Par2)]
                  )

dtMethod.MD2 <- quote(
                  dt[, { .pre <- Type=="pre"                  # or save .pre and .post up front 
                         .post <- Type=="post"
                         list(pre=mean(Val[.pre]), 
                              post=mean(Val[.post]),
                              pre.num=sum(.pre),
                              post.num=sum(.post), 
                              ParD = paste(ParD, collapse="_")) }
                  , by=list(Par1, Par2)]
                  )

dtMethod.MD3 <- quote(
                dt[, { .pre <- Type=="pre"
                       .post <- Type=="post"
                       list(pre=mean(Val[.pre]), 
                            post=mean(Val[.post]),
                            pre.num=sum(.pre),
                            post.num=sum(.post), 
                            ParD = list(ParD)) }     # list() faster than paste()
                , by=list(Par1, Par2)]
                )

benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2), dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
      replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), 
      order="relative")

#--------------------#

Comparing the different data.table methods amongst themselves


# SAMPLE SIZE = 900
  relative  test elapsed user.self sys.self replications
     1.000 dt.M3   0.198     0.197    0.001            5  <~~~ "list()" Method
     1.242 dt.M1   0.246     0.243    0.004            5
     1.253 dt.M2   0.248     0.242    0.007            5
     1.884    dt   0.373     0.367    0.007            5

# SAMPLE SIZE = 17,568
  relative  test elapsed user.self sys.self replications
     1.000 dt.M3  33.492    24.487    9.122            5   <~~~ "list()" Method
     1.086 dt.M1  36.388    11.442   25.086            5
     1.086 dt.M2  36.388    10.845   25.660            5
     1.126    dt  37.701    13.256   24.535            5

Comparing MD3 ("list" method) with MD1 (best of DT non-list methods)
Using a clean session  (ie, removing string cache)
_Note: Ran the following twice, fresh session each time, with practically identical results
       Then re-ran in the *same* session, with reps=5. Results very different._


benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3), replications=1, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), order="relative")
# SAMPLE SIZE=17,568;  CLEAN SESSION
  relative  test elapsed user.self sys.self replications
     1.000 dt.M1   8.885     4.260    4.617            1
     1.633 dt.M3  14.506    12.821    1.677            1

# SAMPLE SIZE=17,568;  *SAME* SESSION
  relative  test elapsed user.self sys.self replications
     1.000 dt.M1  33.443    10.200   23.226            5
     1.048 dt.M3  35.060    26.127    8.915            5

#--------------------#

New benchmarks against previous methods
_Note: Not using the "list method" as results are not the same as other methods_

# SAMPLE SIZE = 900
  relative   test elapsed user.self sys.self replications
     1.000  dt.M1   0.254     0.247    0.008            5
     1.705 reduce   0.433     0.425    0.010            5
    11.280   plyr   2.865     2.842    0.031            5

# SAMPLE SIZE = 17,568
  relative   test elapsed user.self sys.self replications
     1.000  dt.M1  24.826    10.427   14.458            5
     4.348 reduce 107.935    70.107   38.314            5
     5.942   plyr 147.508   106.958   41.083            5

score 2 · Accepted Answer

このベース R ソリューションは、@Arun のデータテーブルソリューションに匹敵すると思います。(これは私がそれを好むと言っているわけではありません; そのコードははるかに簡単です!)

baseMethod2 <- quote({
    is <- unname(split(1:nrow(df), with(df, paste(Par1, Par2, sep="\b"))))
    i1 <- sapply(is, `[`, 1)
    out <- with(df, data.frame(Par1=Par1[i1], Par2=Par2[i1]))
    js <- lapply(is, function(i) split(i, df$Type[i]))
    out$post <- sapply(js, function(j) mean(df$Val[j$post]))
    out$pre <- sapply(js, function(j) mean(df$Val[j$pre]))
    out$Num.pre <- sapply(js, function(j) length(j$pre))
    out$Num.post <- sapply(js, function(j) length(j$post))
    out$ParD <- sapply(is, function(x) paste(df$ParD[x], collapse="_"))
    out
})

@RicardoSaporta のタイミングコードをそれぞれ 900、2700、10,800 で使用:

> relative        test elapsed user.self sys.self replications
3    1.000 baseMethod2   0.230     0.229        0            5
1    1.130          dt   0.260     0.257        0            5
2    8.752        plyr   2.013     2.006        0            5

> relative        test elapsed user.self sys.self replications
3    1.000 baseMethod2   0.877     0.872        0            5
1    1.068          dt   0.937     0.934        0            5
2    8.060        plyr   7.069     7.043        0            5

> relative        test elapsed user.self sys.self replications
1    1.000          dt   6.232     6.178    0.031            5
3    1.085 baseMethod2   6.763     6.683    0.054            5
2    7.263        plyr  45.261    44.983    0.104            5

score 2 · Accepted Answer

reshape::castと組み合わせたワンステップソリューションplyr::ddply

cast(Par1 + Par2 + ParD ~ Type, data = ddply(df, .(Par1, Par2), transform, 
  ParD = paste(ParD, collapse = "_")), fun  = c(mean, length))

関数 inでは複数の集計関数を渡すことができませんが、dcast関数 inでは許可されていることに注意してください。reshape2castreshape

score 0 · Accepted Answer

さまざまな集計式を自己完結型の関数にラップしようとしています（式はアトミック値を生成する必要があります）...

multi.by <- function(X, INDEX,...) {
    expressions <- substitute(...())
    duplicates <- duplicated(INDEX)
    res <- do.call(rbind,sapply(split(X,cumsum(!duplicates),drop=T), function(part) 
        sapply(expressions,eval,part,simplify=F),simplify=F))
    if (is.data.frame(INDEX)) res <- cbind(INDEX[!duplicates,],res)
    else rownames(res) <- INDEX[!duplicates]
    res
}

multi.by(df,df[,1:2],
    pre=mean(Val[Type=="pre"]), 
    post=mean(Val[Type=="post"]),
    Num.pre=sum(Type=="pre"),
    Num.post=sum(Type=="post"),
    ParD=paste(ParD, collapse="_"))

r - 複雑な形状変更

8 に答える 8

結果

更新: baseMethod1 の結果を追加

更新 2: メトリックの一部として DT のキーイングを追加

更新 3: @Arun の元の回答に対する @MD の編集のベンチマーク

Related

Reference