r - データフレーム内の要素のペアに対する操作

Question

列がペアになっているxとの 2 つのデータフレームがあります。weightsデータフレームの例を次に示します。

x = read.table(text = "
  yr1  yr2  yr3  yr4
   10   15    6    8
   10   20   30   NA
   NA    5    2    3
  100  100   NA   NA", 
sep = "", header = TRUE)

weights = read.table(text = "
  yr1  yr2  yr3  yr4
    2    4    1    3
    2    2    4    2
    3    2    2    3
    4    2    2    4", 
sep = "", header = TRUE)

列yr1とyr2は 1 つのペアであり、列yr3とyr4は別のペアです。私の実際のデータでは、列は最大にyr100なり、50 組の列があります。

yr1またはyr2が欠落している場合x、欠落している観測を次のように埋めます。

(5 / 2) * 3

同様にyr3またはyr4:

(30 / 4) * 2

ここで、5 (または 30) は列の要素で、x指定された要素のペアで欠落していません。最初の例の値 2 と 3 (および 2 番目の例の値 4 と 2) は、weightsデータフレーム内の特定の要素ペアに対応するデータフレーム内の要素xです。ペアの両方の要素が欠落している場合は、欠落しxたままにしておきます。

Rネストされたを使用して上記の操作を行うコードを次に示しfor loopsます。ただし、実際のデータセットには 2000 または 3000 行あり、ネストされたデータセットはfor loops現在 10 時間以上実行されています。

for(i in 1: (ncol(x)/2)) {
  for(j in 1: nrow(x)) {

    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)] 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  NA 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  NA

 }
}

if3 番目と 4 番目のステートメントはおそらく必要ないことに気付きました。ifこの 2 つのステートメントを単純に削除すると、このコードの実行時間が大幅に短縮される可能性があります。

reshapeただし、ネストされたの代わりに使用する次の代替ソリューションも思いつきましたfor loops。

n.years <- 4

x2  <- reshape(x      , direction="long", varying = list(seq(1,(n.years-1),2), seq(2,n.years,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
wt2 <- reshape(weights, direction="long", varying = list(seq(1,(n.years-1),2), seq(2,n.years,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))

x2$yr1  <- ifelse(is.na(x2$yr1), (x2$yr2 / wt2$yr2) * wt2$yr1, x2$yr1)
x2$yr2  <- ifelse(is.na(x2$yr2), (x2$yr1 / wt2$yr1) * wt2$yr2, x2$yr2)

x3  <- reshape(x2, direction="wide", varying = list(seq(1,3,2), seq(2,4,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
x3

現在の R セッションをシャットダウンして上記のアプローチのいずれかを試す前に、より効率的な代替案を提案してください。私はmicrobenchmark少し使用しましたが、ここではまだ試していません。部分的には、考えられるソリューションごとに関数を作成するのが少し怖いからです。また、関数ファミリーを使用して解決策を考え出そうとしましたapplyが、思い付くことができませんでした。

私のreshape解決策は、次の質問から導き出されました。

複数のメジャー変数を持つデータフレームの形状変更

計算時間に加えて、メモリ不足の可能性も懸念されます。

ベース R に固執するように努めていますが、他のオプションを使用して目的の出力を得ることを検討します。ご提案ありがとうございます。

score 0 · Accepted Answer

トーマスの答えは、私が試した 3 つのアプローチのどれよりもはるかに優れています。ここで、4 つのアプローチをと比較しmicrobenchmarkます。実際のデータでトーマスの答えをまだ試していません。私のオリジナルのネストされた for ループアプローチは、22 時間後も実行されています。

Unit: milliseconds
             expr       min        lq   median       uq      max neval
 fn.1(x, weights)  98.69133  99.47574 100.5313 101.7315 108.8757    20
 fn.2(x, weights) 755.51583 758.12175 762.3775 776.0558 801.9615    20
 fn.3(x, weights) 564.21423 567.98822 568.5322 571.0975 575.1809    20
 fn.4(x, weights) 367.05862 370.52657 371.7439 373.7367 395.0423    20

#########################################################################################

# create data

set.seed(1234)

n.rows <- 40
n.cols <- 40
n.sample <- n.rows * n.cols

x <- sample(20, n.sample, replace=TRUE)
x.NA <- sample(n.rows*n.cols, 10*(n.sample / n.rows), replace=FALSE)
x[x.NA] <- NA
x <- as.data.frame(matrix(x, nrow = n.rows))

weights <- sample(4, n.sample, replace=TRUE)
weights <- as.data.frame(matrix(weights, nrow = n.rows))
weights

#########################################################################################

# Thomas's function

fn.1 <- function(x, weights){

newx <- reshape(x, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names=c("v1", "v2"))

newwt <- reshape(weights, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names=c("w1", "w2"))

condwtmean <- function(x,y,wtx,wty){
    if(xor(is.na(x),is.na(y))){
        if(is.na(x))
            x <- (y / wty) * wtx # replacement function
        if(is.na(y))
            y <- (x / wtx) * wty # replacement function
        return(weighted.mean(c(x,y),c(wtx,wty)))
    }
    else if(!is.na(x) & !is.na(y))
        return(weighted.mean(c(x,y),c(wtx,wty)))
    else
        return(NA)  
}

newx$wtmean <- mapply(condwtmean, newx$v1, newx$v2, newwt$w1, newwt$w2)

newx2 <- reshape(newx[,c(1,4:5)], v.names = "wtmean", timevar = "time", direction = "wide")

newx2 <- newx2[,2:(n.cols/2+1)]
names(newx2) <- paste('X', 1:(n.cols/2), sep = "")

return(newx2)

}

fn.1.output <- fn.1(x, weights)

#########################################################################################

# nested for-loops with 4 if statements

fn.2 <- function(x, weights){

for(i in 1: (ncol(x)/2)) {
  for(j in 1: nrow(x)) {

    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)] 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  NA 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  NA

 }
}

x.weights = x * weights

numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
  apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})

denominator <- sapply(seq(1,ncol(weights),2), function(i) {
  apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})

weighted.x <- numerator/denominator

for(i in 1: (ncol(x)/2)) {
  for(j in 1:   nrow(x)      ) {

    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  NA 

 }
}

return(weighted.x)

}

fn.2.output <- fn.2(x, weights)

fn.2.output <- as.data.frame(fn.2.output)
names(fn.2.output) <- paste('X', 1:(n.cols/2), sep = "")

#########################################################################################

# nested for-loops with 2 if statements

fn.3 <- function(x, weights){

for(i in 1: (ncol(x)/2)) {
  for(j in 1: nrow(x)) {

    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)] 

 }
}

x.weights = x * weights

numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
  apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})

denominator <- sapply(seq(1,ncol(weights),2), function(i) {
  apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})

weighted.x <- numerator/denominator

for(i in 1: (ncol(x)/2)) {
  for(j in 1:   nrow(x)      ) {

    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  NA 

 }
}

return(weighted.x)

}

fn.3.output <- fn.3(x, weights)

fn.3.output <- as.data.frame(fn.3.output)
names(fn.3.output) <- paste('X', 1:(n.cols/2), sep = "")

#########################################################################################

# my reshape solution

fn.4 <- function(x, weights){

new.x    <- reshape(x      , direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names = c("v1", "v2"))
wt       <- reshape(weights, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names = c("w1", "w2"))

new.x$v1 <- ifelse(is.na(new.x$v1), (new.x$v2 / wt$w2) * wt$w1, new.x$v1)
new.x$v2 <- ifelse(is.na(new.x$v2), (new.x$v1 / wt$w1) * wt$w2, new.x$v2)

x2  <- reshape(new.x, direction="wide", varying = list(seq(1,3,2), seq(2,4,2)), v.names = c("v1", "v2")) 

x <- x2[,2:(n.cols+1)]

x.weights = x * weights

numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
  apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})

denominator <- sapply(seq(1,ncol(weights),2), function(i) {
  apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})

weighted.x <- numerator/denominator

for(i in 1: (ncol(x)/2)) {
  for(j in 1:   nrow(x)      ) {

    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  NA 

 }
}

return(weighted.x)

}

fn.4.output <- fn.4(x, weights)

fn.4.output <- as.data.frame(fn.4.output)
names(fn.4.output) <- paste('X', 1:(n.cols/2), sep = "")

#########################################################################################

rownames(fn.1.output) <- NULL
rownames(fn.2.output) <- NULL
rownames(fn.3.output) <- NULL
rownames(fn.4.output) <- NULL

all.equal(fn.1.output, fn.2.output)
all.equal(fn.1.output, fn.3.output)
all.equal(fn.1.output, fn.4.output)
all.equal(fn.2.output, fn.3.output)
all.equal(fn.2.output, fn.4.output)
all.equal(fn.3.output, fn.4.output)

library(microbenchmark)

microbenchmark(fn.1(x, weights), fn.2(x, weights), fn.3(x, weights), fn.4(x, weights), times=20)

#########################################################################################

r - データ フレーム内の要素のペアに対する操作

2 に答える 2

Related

Reference

r - データフレーム内の要素のペアに対する操作