2

これが私の小さなデータセットです。

# Example pedigree: one entry per individual; NA marks an unknown parent.
# The column name "Indvidual" (sic) is kept because later code refers to it.
Indvidual <- LETTERS[1:10]
Parent1 <- c(NA, NA, "A", "A", rep("C", 3), "E", "A", NA)
Parent2 <- c(NA, NA, "B", "C", rep("D", 3), NA, "D", NA)
mydf <- data.frame(
  Indvidual = Indvidual,
  Parent1 = Parent1,
  Parent2 = Parent2
)

  Indvidual Parent1 Parent2
1         A    <NA>    <NA>
2         B    <NA>    <NA>
3         C       A       B
4         D       A       C
5         E       C       D
6         F       C       D
7         G       C       D
8         H       E    <NA>
9         I       A       D
10        J      <NA>     <NA>

両親のうち 2 人または 1 人が既知である個体を考えます。親のスコアを比較し、そこから各個体のスコアを導出する必要があります。

ルールは次のとおりです。親 (Parent1 または Parent2 列) のいずれかが既知 (NA ではない) の場合、その親のスコアに 1 を加えたスコアが与えられます。両親とも既知の場合は、スコアが高い方の親のスコアが使われます。

次に例を示します。

Individual "A", has both parents unknown so will get score 0
Individual "C", has both parents known (i.e. A, B) 
will get 0 score (maximum of their parents) 

プラス 1 (親のいずれかが既知であるため)

したがって、上記のデータフレームからの予想される出力 (説明付き) は次のとおりです。

Indvidual Parent1 Parent2   Scores     Explanation 
1         A    <NA>    <NA>    0       0 (Max of parent Scores NA) + 0 (neither parent known) 
2         B    <NA>    <NA>    0       0 (Max of parent Scores NA)  + 0 (neither parent known) 
3         C     A       B      1    0 (Max of parent Scores)  +  1 (either parent known)       
4         D     A        C      2       1 (Max of parent scores)  +  1 (either parent known) 
5         E       C      D      3       2 (Max of parent scores) + 1 (either parent known)
6         F       C      D      3       2 (Max of parent scores) + 1 (either parent known)
7         G       C      D      3       2 (Max of parent scores) + 1 (either parent known)
8         H       E    <NA>     4       3 (Max of parent scores) + 1 (either parent known) 
9         I       A       D     3       2 (Max of parent scores) + 1 (either parent known)
10        J      <NA>    <NA>   0       0 (Max of parent scores NA)  + 0 (neither parent known)

説明: ループが進むにつれて、すでに計算されたスコア (親スコアの最大値) が考慮されます。

編集:チェイスの質問に基づく

例えば:

Individual C has two parents A and B, each of which has Scores calculated as 0 and 0 
(in row 1 and 2 and column Scores),  means that max (c(0,0)) will be 0

Individual E has parents C and D, whose scores in Scores column is (in row 3 and 4),
 1 and 2, respectively.  So maximum of max(c(1,2)) will be 2.
4

2 に答える 2

2

plyrと再帰引数の使用例

library(plyr)
# Example pedigree: one entry per individual; NA marks an unknown parent.
# The column name "Indvidual" (sic) is what scor.fun looks up, so keep it.
Indvidual <- LETTERS[1:10]
Parent1 <- c(NA, NA, "A", "A", rep("C", 3), "E", "A", NA)
Parent2 <- c(NA, NA, "B", "C", rep("D", 3), NA, "D", NA)
mydf <- data.frame(
  Indvidual = Indvidual,
  Parent1 = Parent1,
  Parent2 = Parent2
)
# Recursively score one individual of a pedigree.
#
# Arguments:
#   x    - one-row data frame with columns Parent1 and Parent2 (NA = unknown).
#   mydf - the full pedigree; must have columns Indvidual, Parent1, Parent2,
#          and every non-NA parent must appear exactly once in Indvidual.
#
# Returns an unnamed numeric vector c(score, Explanation), where Explanation
# is the larger of the two parents' scores (an unknown parent counts as 0) and
# score is Explanation plus 1 when at least one parent is known.
#
# The recursion re-scores every ancestor on each call, so this is only
# suitable for small pedigrees.
scor.fun <- function(x, mydf) {
  P1 <- as.character(x$Parent1)
  P2 <- as.character(x$Parent2)

  # Score contributed by a single parent: 0 when unknown, otherwise the
  # parent's own (recursively computed) score.
  parent_score <- function(p) {
    if (is.na(p)) {
      return(0)
    }
    parent_row <- mydf[which(mydf$Indvidual == p), , drop = FALSE]
    if (nrow(parent_row) != 1L) {
      stop(
        "parent '", p, "' must appear exactly once in mydf$Indvidual",
        call. = FALSE
      )
    }
    scor.fun(parent_row, mydf)[1]
  }

  Explanation <- max(parent_score(P1), parent_score(P2))
  # +1 when EITHER parent is known. The previous code tested P1 twice, so an
  # individual whose only known parent was Parent2 never received the bonus.
  score <- as.numeric(!(is.na(P1) && is.na(P2))) + Explanation
  c(score, Explanation)
}

# Score every row of the pedigree; the trailing mydf is forwarded to scor.fun
# as its second argument so it can look up parent rows.
adply(.data = mydf, .margins = 1, .fun = scor.fun, mydf)

大きなデータフレームでは、再帰を使うのはおそらく最良の方法ではありません。

于 2012-07-16T14:59:52.220 に答える
1
# Example pedigree: one row per individual; NA marks an unknown parent.
Individual <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
Parent1 <- c(NA, NA, "A", "A", "C", "C", "C", "E", "A", NA)
Parent2 <- c(NA, NA, "B", "C", "D", "D", "D", NA, "D", NA)
mydf <- data.frame(Individual, Parent1, Parent2, stringsAsFactors = FALSE)

# Founders (both parents unknown) score 0; everyone else starts unresolved.
mydf$Scores <- NA_real_
mydf$Scores[is.na(mydf$Parent1) & is.na(mydf$Parent2)] <- 0

# Row of each parent inside mydf. NA when the parent is unknown or is not
# listed as an individual (incomparables = NA keeps NA from matching anything).
p1_row <- match(mydf$Parent1, mydf$Individual, incomparables = NA)
p2_row <- match(mydf$Parent2, mydf$Individual, incomparables = NA)

# Resolve scores generation by generation: on each pass, score every
# individual whose known parents have all been scored already.
while (anyNA(mydf$Scores)) {
  p1_score <- mydf$Scores[p1_row]
  p2_score <- mydf$Scores[p2_row]

  # Ready = still unscored, and each parent is either unknown or scored.
  ready <- is.na(mydf$Scores) &
    (is.na(mydf$Parent1) | !is.na(p1_score)) &
    (is.na(mydf$Parent2) | !is.na(p2_score))

  # Without this guard a parent missing from Individual, or a cycle in the
  # pedigree, would keep the loop spinning forever.
  if (!any(ready)) {
    stop(
      "cannot resolve scores for: ",
      paste(mydf$Individual[is.na(mydf$Scores)], collapse = ", "),
      " (parent not listed as an individual, or circular pedigree)",
      call. = FALSE
    )
  }

  # Highest parent score (ignoring an unknown parent) plus one.
  mydf$Scores[ready] <-
    pmax(p1_score[ready], p2_score[ready], na.rm = TRUE) + 1
}
于 2012-07-16T13:12:35.790 に答える