1

次のデータ フレームがあります https://www.dropbox.com/s/c02qu7uobvrc8ku/college_Rda

以下はデータのサンプルです（そのままコピー＆ペーストできます）:

 # Sample data (dput output): a 10-row data frame with one factor column per
 # year, SCH_COLLEGE_STATUS_<year>_09 for 1997 through 2011, all sharing the
 # same six levels, plus a numeric PUBID column holding 1..10.
 educational_history <- structure(list(SCH_COLLEGE_STATUS_1997_09 = structure(c(1L, 1L, 
1L, 1L, 5L, 1L, 1L, 5L, 5L, 5L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_1998_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_1999_09 = structure(c(3L, 
1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2000_09 = structure(c(3L, 
3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2001_09 = structure(c(3L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2002_09 = structure(c(3L, 
3L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2003_09 = structure(c(1L, 
3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2004_09 = structure(c(1L, 
3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2005_09 = structure(c(1L, 
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 3L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2006_09 = structure(c(1L, 
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2007_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 3L, 1L, 4L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2008_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 3L, 1L, 4L, 1L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2009_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2010_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 5L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), SCH_COLLEGE_STATUS_2011_09 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L), .Label = c("Not enrolled in college", 
"Enrolled in 2-year college", "Enrolled in 4-year college", "Enrolled in Graduate program", 
"VALID SKIP", "NON-INTERVIEW"), class = "factor"), PUBID = c(1, 
2, 3, 4, 5, 6, 7, 8, 9, 10)), .Names = c("SCH_COLLEGE_STATUS_1997_09", 
"SCH_COLLEGE_STATUS_1998_09", "SCH_COLLEGE_STATUS_1999_09", "SCH_COLLEGE_STATUS_2000_09", 
"SCH_COLLEGE_STATUS_2001_09", "SCH_COLLEGE_STATUS_2002_09", "SCH_COLLEGE_STATUS_2003_09", 
"SCH_COLLEGE_STATUS_2004_09", "SCH_COLLEGE_STATUS_2005_09", "SCH_COLLEGE_STATUS_2006_09", 
"SCH_COLLEGE_STATUS_2007_09", "SCH_COLLEGE_STATUS_2008_09", "SCH_COLLEGE_STATUS_2009_09", 
"SCH_COLLEGE_STATUS_2010_09", "SCH_COLLEGE_STATUS_2011_09", "PUBID"
), row.names = c(NA, 10L), class = "data.frame")

そのデータを使用して新しいデータ フレームを生成したいと考えています。

必要なのは、PUBID と、4 年制大学に初めて在籍した年の 2 つのフィールドだけです。年の情報は列名に含まれています。次のように試しました:

#' First year a respondent was enrolled in a 4-year college
#'
#' Scans every column named `SCH_COLLEGE_STATUS_<yyyy>_<mm>` (not just a
#' hard-coded handful of years) and returns the earliest year whose status
#' equals "Enrolled in 4-year college". The year is parsed from the column
#' name, so the result does not depend on the order of the columns.
#'
#' @param ID A single PUBID value to look up.
#' @param data Data frame with a `PUBID` column and the status columns.
#'   Defaults to the global `educational_history`, so existing one-argument
#'   calls keep working.
#' @return The earliest enrollment year as a numeric scalar, or `NA_real_`
#'   when `ID` is not present or was never enrolled in a 4-year college.
#'   If several rows share `ID`, the earliest year across those rows is used.
FirstYear4C <- function(ID, data = educational_history) {
  status_pattern <- "^SCH_COLLEGE_STATUS_([0-9]{4})_[0-9]{2}$"
  target_status <- "Enrolled in 4-year college"

  status_cols <- grep(status_pattern, names(data), value = TRUE)
  years <- as.numeric(sub(status_pattern, "\\1", status_cols))

  rows <- which(data$PUBID == ID)
  if (length(rows) == 0L) {
    return(NA_real_)
  }

  # %in% never yields NA, so missing statuses simply count as "not enrolled"
  # instead of breaking an if() condition.
  enrolled <- vapply(
    status_cols,
    function(col) any(data[[col]][rows] %in% target_status),
    logical(1)
  )

  enrolled_years <- years[enrolled]
  if (length(enrolled_years) == 0L) {
    return(NA_real_)
  }
  min(enrolled_years)
}
# Apply FirstYear4C to every PUBID and flatten the per-ID results to a vector.
FirstYear <- unlist(
  lapply(X = educational_history[["PUBID"]], FUN = FirstYear4C)
)

# Two-column result: one row per PUBID with its first 4-year-college year.
FourYearCollege <- data.frame(
  PUBID = educational_history[["PUBID"]],
  FirstYear = FirstYear
)

その関数をコーディングするより良い方法があると確信しています。列ごとにコピーして貼り付ける必要があるのは非常に非効率的です。

PUBID    1stYear4YC 
1        1999
2        2000
... 
6        2000 
4

4 に答える 4

1
library(data.table)
library(reshape2)

# Reshape to long format (one row per PUBID / status column), then, per PUBID,
# take the first status column whose value is "Enrolled in 4-year college" and
# pull the 4-digit year out of its name. "First" follows the column order of
# educational_history, which is chronological in the sample data.
# as.integer() makes first.year a number rather than a character string;
# PUBIDs with no 4-year enrollment get NA.
data.table(melt(educational_history, id.vars = "PUBID"))[,
    list(first.year = as.integer(
      sub(".*_([0-9]+)_[0-9]+$",
          "\\1",
          variable[value == "Enrolled in 4-year college"][1]))),
    by = PUBID]
#    PUBID first.year
# 1:     1       1999
# 2:     2       2000
# 3:     3         NA
# 4:     4         NA
# 5:     5         NA
# 6:     6       2000
# 7:     7         NA
# 8:     8       1999
# 9:     9       2000
#10:    10       2002

どのように動作するかを確認するには、式を部分ごとに分けて実行してみてください。基本的な考え方は、まずロング形式に変換し、そこから必要な値を取り出すというものです。

回答日時: 2013-09-20T14:51:35.460
0

サンプルデータのように、行番号と PUBID が一致していると仮定します（`df` は上記のデータ フレームです）:

 # For each of the status columns (all names except the 16th, PUBID), pair the
 # "YYYY_MM" part of the column name (characters 20-26) with the row indices
 # whose value is "Enrolled in 4-year college". The row index is reported as
 # PUBID, which is only correct when row numbers and PUBID coincide.
 lapply(names(df)[-16], function(col_name) {
   matching_rows <- which(df[col_name] == "Enrolled in 4-year college")
   cbind(year = substr(col_name, 20, 26), PUBID = matching_rows)
 })

[[1]]
     year     
[1,] "1997_09"

[[2]]
     year     
[1,] "1998_09"

[[3]]
     year      PUBID
[1,] "1999_09" "1"  
[2,] "1999_09" "8"  

[[4]]
     year      PUBID
[1,] "2000_09" "1"  
[2,] "2000_09" "2"  
[3,] "2000_09" "6"  
[4,] "2000_09" "8"  
[5,] "2000_09" "9"  

[[5]]
     year      PUBID
[1,] "2001_09" "1"  
[2,] "2001_09" "9"  

[[6]]
     year      PUBID
[1,] "2002_09" "1"  
[2,] "2002_09" "2"  
[3,] "2002_09" "8"  
[4,] "2002_09" "9"  
[5,] "2002_09" "10" 

[[7]]
     year      PUBID
[1,] "2003_09" "2"  
[2,] "2003_09" "9"  
[3,] "2003_09" "10" 

[[8]]
     year      PUBID
[1,] "2004_09" "2"  
[2,] "2004_09" "9"  
[3,] "2004_09" "10" 

[[9]]
     year      PUBID
[1,] "2005_09" "10" 

[[10]]
     year     
[1,] "2006_09"

[[11]]
     year      PUBID
[1,] "2007_09" "6"  

[[12]]
     year      PUBID
[1,] "2008_09" "6"  

[[13]]
     year     
[1,] "2009_09"

[[14]]
     year     
[1,] "2010_09"

[[15]]
     year     
[1,] "2011_09"
回答日時: 2013-09-20T14:45:42.203
0

もう1つの答え:

educational_history
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(reshape2) # provides melt(), which this snippet used without loading
library(stringr)
library(plyr)

## Long format: one row per PUBID x status column
eh <- melt(educational_history, id.vars = "PUBID")

## TRUE where the status starts with "Enrolled in 4" (the 4-year college level)
eh$enrolled <- str_detect(as.character(eh$value), pattern = "^Enrolled in 4")

## Extract the 4-digit year from names such as SCH_COLLEGE_STATUS_1997_09
eh$year <- str_extract(as.character(eh$variable), pattern = "_[0-9]{4}_")
eh$year <- as.numeric(str_replace_all(eh$year, pattern = "_", replacement = ""))

## Summarize: earliest enrolled year per PUBID.
## which() drops rows where `enrolled` is NA instead of producing all-NA rows.
## PUBIDs that were never enrolled in a 4-year college are absent from the result.
ddply(eh[which(eh$enrolled), ], .variables = .(PUBID),
    .fun = summarize, FirstYear4YC = min(year))
回答日時: 2013-09-20T16:14:53.157