r - dplyr mutate 内の forloop

Question

mutate を使用して変換したいテーブルに 200 を超える列があるため、よりエレガントな方法で mutate を使用していくつかの列操作を実行したいと考えています。

ここに例があります

サンプルデータ：

df <- data.frame(treatment=rep(letters[1:2],10),
c1_x=rnorm(20),c2_y=rnorm(20),c3_z=rnorm(20),
c4_x=rnorm(20),c5_y=rnorm(20),c6_z=rnorm(20),
c7_x=rnorm(20),c8_y=rnorm(20),c9_z=rnorm(20),
c10_x=rnorm(20),c11_y=rnorm(20),c12_z=rnorm(20),
c_n=rnorm(20))

サンプルコード:

dfm<-df %>%
mutate(cx=(c1_x*c4_x/c_n+c7_x*c10_x/c_n),
cy=(c2_y*c5_y/c_n+c8_y*c11_y/c_n),
cz=(c3_z*c6_z/c_n+c9_z*c12_z/c_n))

score 3 · Accepted Answer

接線にもかかわらず、関数を使用するための最初の推奨事項tidyrは、行く必要がある場所です。この関数のパイプは、あなたが提供したものに基づいて仕事をしているようです。

あなたのデータ：

df <- data.frame(treatment=rep(letters[1:2],10),
                 c1_x=rnorm(20), c2_y=rnorm(20), c3_z=rnorm(20),
                 c4_x=rnorm(20), c5_y=rnorm(20), c6_z=rnorm(20),
                 c7_x=rnorm(20), c8_y=rnorm(20), c9_z=rnorm(20),
                 c10_x=rnorm(20), c11_y=rnorm(20), c12_z=rnorm(20),
                 c_n=rnorm(20))
library(dplyr)
library(tidyr)

この最初の補助的な data.frame は、c#_[xyz]変数を統一されたものに変換するために使用されます。これを処理する方法は他にもあると思いますが、それは機能し、200 以上の列に基づいて比較的簡単に再現および拡張できます。

variableTransform <- data_frame(
  cnum = paste0("c", 1:12),
  cvar = rep(paste0("a", 1:4), each = 3)
)
head(variableTransform)
# Source: local data frame [6 x 2]
#    cnum  cvar
#   <chr> <chr>
# 1    c1    a1
# 2    c2    a1
# 3    c3    a1
# 4    c4    a2
# 5    c5    a2
# 6    c6    a2

一気にパイプが来ました。手順をすぐに説明します。探しているのは、treatment、xyz、およびans列の組み合わせである可能性があります。

df %>%
  tidyr::gather(cnum, value, -treatment, -c_n) %>%
  tidyr::separate(cnum, c("cnum", "xyz"), sep = "_") %>%
  left_join(variableTransform, by = "cnum") %>%
  select(-cnum) %>%
  tidyr::spread(cvar, value) %>%
  mutate(
    ans = a1 * (a2/c_n) + a3 * (a4/c_n)
  ) %>%
  head
#   treatment       c_n xyz         a1          a2         a3          a4         ans
# 1         a -1.535934   x -0.3276474  1.45959746 -1.2650369  1.02795419  1.15801448
# 2         a -1.535934   y -1.3662388 -0.05668467  0.4867865 -0.10138979 -0.01828831
# 3         a -1.535934   z -2.5026018 -0.99797169  0.5181513  1.20321878 -2.03197283
# 4         a -1.363584   x -0.9742016 -0.12650863  1.3612361 -0.24840493  0.15759418
# 5         a -1.363584   y -0.9795871  1.52027017  0.5510857  1.08733839  0.65270681
# 6         a -1.363584   z  0.2985557 -0.22883439  0.1536078 -0.09993095  0.06136036

まず、元のデータを取得し、すべての (2 つを除く) 列を「列名」と「列値」のペアの 2 つの列に変換します。

df %>%
  tidyr::gather(cnum, value, -treatment, -c_n) %>%
#   treatment         c_n cnum      value
# 1         a  0.20745647 c1_x -0.1250222
# 2         b  0.01015871 c1_x -0.4585088
# 3         a  1.65671028 c1_x -0.2455927
# 4         b -0.24037137 c1_x  0.6219516
# 5         a -1.16092349 c1_x -0.3716138
# 6         b  1.61191700 c1_x  1.7605452

前者を翻訳して後者を保存するには、andに分割c1_xすると便利です。c1x

  tidyr::separate(cnum, c("cnum", "xyz"), sep = "_") %>%
#   treatment         c_n cnum xyz      value
# 1         a  0.20745647   c1   x -0.1250222
# 2         b  0.01015871   c1   x -0.4585088
# 3         a  1.65671028   c1   x -0.2455927
# 4         b -0.24037137   c1   x  0.6219516
# 5         a -1.16092349   c1   x -0.3716138
# 6         b  1.61191700   c1   x  1.7605452

c1ここから、、c2、およびc3変数をに変換してみましょうa1(他の 9 つの変数についても繰り返します) variableTransform。

  left_join(variableTransform, by = "cnum") %>%
  select(-cnum) %>%
#   treatment         c_n xyz      value cvar
# 1         a  0.20745647   x -0.1250222   a1
# 2         b  0.01015871   x -0.4585088   a1
# 3         a  1.65671028   x -0.2455927   a1
# 4         b -0.24037137   x  0.6219516   a1
# 5         a -1.16092349   x -0.3716138   a1
# 6         b  1.61191700   x  1.7605452   a1

複数の変数を (単純なで) 同時に処理したいのでmutate、いくつかの変数を列に戻す必要があります。gather(私たちが ed と willを行った理由はspread、物事を整理し、適切に名前を付けるのに役立ちます。誰かがそれを行う別の方法を思い付くことができると確信しています。)

  tidyr::spread(cvar, value) %>% head
#   treatment       c_n xyz         a1          a2         a3          a4
# 1         a -1.535934   x -0.3276474  1.45959746 -1.2650369  1.02795419
# 2         a -1.535934   y -1.3662388 -0.05668467  0.4867865 -0.10138979
# 3         a -1.535934   z -2.5026018 -0.99797169  0.5181513  1.20321878
# 4         a -1.363584   x -0.9742016 -0.12650863  1.3612361 -0.24840493
# 5         a -1.363584   y -0.9795871  1.52027017  0.5510857  1.08733839
# 6         a -1.363584   z  0.2985557 -0.22883439  0.1536078 -0.09993095

ここからはmutate、正しい答えを得る必要があります。

score 0 · Accepted Answer

r2evans の答えに似ていますが、結合の代わりに操作が増えます (説明が少なくなります)。

library(tidyr)
library(stringr)
library(dplyr)

# get it into fully long form
gather(df, key = cc_xyz, value = value, c1_x:c12_z) %>%
    # separate off the xyz and the c123
    separate(col = cc_xyz, into = c("cc", "xyz")) %>%
    # extract the number
    mutate(num = as.numeric(str_replace(cc, pattern = "c", replacement = "")),
           # mod it by 4 for groupings and add a letter so its a good col name
           num_mod = paste0("v", (num %% 4) + 1)) %>%
    # remove unwanted columns
    select(-cc, -num) %>%
    # go into a reasonable data width for calculation
    spread(key = num_mod, value = value) %>%
    # calculate
    mutate(result = v1 + v2/c_n + v3 + v4 / c_n)

#    treatment          c_n xyz           v1           v2            v3          v4        result
# 1          a -1.433858289   x  1.242153708 -0.985482158 -0.0240414692  1.98710285    0.51956295
# 2          a -1.433858289   y -0.019255516  0.074453615 -1.6081599298  1.18228939   -2.50389188
# 3          a -1.433858289   z -0.362785313  2.296744655 -0.0610463292  0.89797526   -2.65188998
# 4          a -0.911463819   x -1.088308527 -0.703388193  0.6308253909  0.22685013    0.06534405
# 5          a -0.911463819   y  1.284513516  1.410276163  0.5066869590 -2.07263912    2.51790289
# 6          a -0.911463819   z  0.957778345 -1.136532104  1.3959561507 -0.50021647    4.14947069
# ...

r - dplyr mutate 内の forloop

2 に答える 2

Related

Reference