r - R: タグの幅広いリストから接続の長いリストまで

Question

1 または 0 で示されるタグの個々の設定を持つデータフレームがあります。

mydata <- data.frame(
    ID = c(1:4),
    tag1 = c(1, 0, 1, 0),
    tag2 = c(0, 0, 0, 0),
    tag3 = c(1, 0, 1, 1),
    tag4 = c(1, 1, 0, 1),
    tag5 = c(0, 1, 1, 1)
)

(私のデータには 5 つよりも多くのタグがあります)

ネットワークダイアグラムの場合、ワイド形式のデータを、行内の tag=1 の各ペア間の発生の長い形式のリストに変換する方法を探しています。上記の例では、次のようになります。

mydata2 <- data.frame(
    ID = c(1,1,1,2,3,3,3,4,4,4),
    target = c("tag1","tag1","tag3","tag4","tag1","tag1","tag3","tag3","tag3","tag4"),
    source = c("tag3","tag4","tag4","tag5","tag3","tag5","tag5","tag4","tag5","tag5")
)

これにを使用したかったtidyrのgather()ですが、列のペアに使用する方法がわかりません。ペアごとに新しい変数を作成してそれらを収集することもできますが、タグのリストが長い場合、これは非現実的になります。これを行うよりエレガントな方法はありますか？それとも特定の機能ですか？

score 1 · Accepted Answer

を使用したオプションtidyr/dplyr

library(tidyr)
library(dplyr)

tD <- gather(mydata, Var, Val, -ID) %>% #change wide to long format
                          filter(Val!=0) %>% #remove rows with 0 Val
                          select(-Val) #remove the Val column

tD1 <- left_join(tD, tD, by='ID') %>% #self join with the created data
                      filter(Var.x!=Var.y) %>% #remove rows that are same
                      arrange(ID, Var.x, Var.y) #to order (if needed)

tD1[-1] <- t(apply(tD1[-1], 1, sort)) #sort the rows of 2nd and 3rd columns
res <- unique(tD1, by='ID') %>% #keep only unique rows by "ID"
              rename(target=Var.x, source=Var.y) #rename the column names
row.names(res) <- NULL #change the rownames to NULL
#checking the results with the expected result
mydata2[2:3] <- lapply(mydata2[2:3], as.character)  
all.equal(mydata2, res,check.attributes=FALSE)
#[1] TRUE

res
#   ID target source
#1   1   tag1   tag3
#2   1   tag1   tag4
#3   1   tag3   tag4
#4   2   tag4   tag5
#5   3   tag1   tag3
#6   3   tag1   tag5
#7   3   tag3   tag5
#8   4   tag3   tag4
#9   4   tag3   tag5
#10  4   tag4   tag5

score 0 · Accepted Answer

unexported functiondata.tableを使用していますが、これはを使用する1 つの方法です。vecseq

require(data.table)
foo <- function(x) {
    lx = length(x)
    idx1 = data.table:::vecseq(rep.int(1L, lx), (lx-1L):0L, NULL)
    idx2 = data.table:::vecseq(c(seq_len(lx)[-1L], 1L), (lx-1L):0L, NULL)
    list(x[idx1], x[idx2])
}

melt(dt, id="ID")[value == 1L, foo(variable), by=ID]
#     ID   V1   V2
#  1:  1 tag1 tag3
#  2:  1 tag3 tag4
#  3:  1 tag1 tag4
#  4:  3 tag1 tag3
#  5:  3 tag3 tag5
#  6:  3 tag1 tag5
#  7:  4 tag3 tag4
#  8:  4 tag4 tag5
#  9:  4 tag3 tag5
# 10:  2 tag4 tag5

score 0 · Accepted Answer

私が考えることができる最も直接的なアプローチはmelt、「reshape2」から使用し、次にcombnandを使用することmeltです。

library(reshape2)
M <- melt(mydata, id.vars = "ID")               ## Melt the dataset
M2 <- M[M$value > 0, 1:2]                       ## Subset the rows of interest
melt(
  lapply(
    split(M2$variable, M2$ID), function(x) {
      data.frame(t(combn(as.character(x), 2)))  ## Inside, we use combn
    }), id.vars = c("X1", "X2"))                ## We know we'll just have X1 and X2 
#      X1   X2 L1
# 1  tag1 tag3  1
# 2  tag1 tag4  1
# 3  tag3 tag4  1
# 4  tag4 tag5  2
# 5  tag1 tag3  3
# 6  tag1 tag5  3
# 7  tag3 tag5  3
# 8  tag3 tag4  4
# 9  tag3 tag5  4
# 10 tag4 tag5  4

または、次のmeltように 2 番目を避けます。

library(reshape2)
M <- melt(mydata, id.vars = "ID")
M2 <- M[M$value > 0, 1:2]


MS <- split(M2$variable, M2$ID)
do.call(rbind, 
        lapply(names(MS), function(x) {
          data.frame(ID = x, t(combn(as.character(MS[[x]]), 2)))
        }))

r - R: タグの幅広いリストから接続の長いリストまで

4 に答える 4

Related

Reference