貴重なコメントやアドバイスをありがとうございました。最終的に、以下のコードを使ってタスクを実行しました。
私は優れたプログラマーではなく、ほとんどの場合、力ずくのアルゴリズムでしか問題に対処できません。ですので、皆さんのような専門家から、自分の作業に使えるもっと賢い方法を教えていただけるのがいつも楽しみです。
また、私は R にまだあまり慣れていません。「リスト」や「データフレーム」、さらに lapply や mapply といった概念にいつも混乱してしまいますし、正規表現が絡む問題になるとなおさらです。StackOverflow のコミュニティで皆さんからアドバイスをいただけたことを、大変光栄に思います。
皆さんから多くのことを学ばせていただきました。
以下は、実際のデータを処理するために使用した最終的なコードです。もっと賢い方法があれば、もう一度アドバイスをお願いします。
何度も何度もありがとう。
##### Setup #####
# NOTE: the workspace is deliberately NOT wiped here. `rm(list = ls())`
# silently deletes every object the caller holds in the global environment,
# yet does not reset loaded packages or options. Run this script in a fresh
# R session when a clean state is required.

#### Testing Data Frame ####
# One row per record:
#   ID     - record identifier
#   String - "&"-separated components of the form Field=value1,value2,...
#            (a field may be missing, or present with an empty value)
#   Values - numeric value attached to the record (may be NA)
df <- data.frame(
  ID = c(1, 2, 3, 11, 12, 13),
  String = c(
    "LocationID=123,321,345&TimeID=456,321,789&TypeID=12,32&DummyID=969",
    "LocationID=123,345&TimeID=456,321&DummyID=969",
    "LocationID=123,321,345&TypeID=32&DummyID=969",
    "LocationID=123,321,345&TimeID=456,321,789&TypeID=&DummyID=969",
    "LocationID=123,345&TimeID=&TypeID=A&DummyID=969",
    "LocationID=123,321,345&TypeID=32&DummyID=969"
  ),
  Values = c(100, 50, 120, 100, 50, NA),
  stringsAsFactors = FALSE
)

# The wanted IDs: only these field names are kept by the analysis below
fields <- c("LocationID", "TimeID", "TypeID")

# Path of the Excel workbook the results are written to
outFile <- "./Co_occurence.xlsx"
#### Main Program ####
# plyr supplies ldply(), which turns the nested list built below into one
# data frame (it is also needed later for ddply()).
library(plyr)

# Cut every record's String into its "&"-separated components. The resulting
# list is labelled with the record IDs so each piece can still be traced to
# its record once it is collected outside the data frame.
df$splitString <- setNames(strsplit(df$String, split = "&"), df$ID)

# Cut each component wherever a run of "=", "," or " " occurs. The first
# token is the field name; the remaining tokens are that field's values.
df$splitStringValues <- lapply(
  df$splitString,
  function(components) strsplit(components, split = "[=, ]+")
)

# Flatten to one row per (record, field, value). The outer ldply() stores the
# list names (the record IDs) in an `.id` column.
strings <- ldply(df$splitStringValues, .fun = function(components) {
  ldply(components, .fun = function(tokens) {
    # A field name with no value (e.g. ID2 in "ID1=123&ID2=&ID3=456") must
    # not count as an occurrence, so both columns are set to NA.
    if (length(tokens) <= 1) {
      return(data.frame(Val = NA, String = NA))
    }
    # The field name is recycled against every value of that field
    data.frame(String = tokens[1], Val = tokens[-1])
  })
})
# Remove those unwanted IDs: keep only rows whose field name is listed in
# `fields`. Rows with String = NA (fields that had an empty value) are
# dropped as well, because NA is not %in% `fields`.
finalStrings <- strings[strings$String %in% fields, ]
finalStrings$String <- factor(finalStrings$String)

# Count values in each string: one row per record ID, one column per field
OutStrings <- as.data.frame.matrix(
  table(finalStrings$.id, finalStrings$String)
)
OutStrings$ID <- rownames(OutStrings)
OutStrings <- merge(df[, c("ID", "String", "Values")], OutStrings, by = "ID")

# After the merge, columns 1-3 are ID/String/Values and every remaining
# column is a per-field count. Capture those counts ONCE, before any summary
# column is appended: if the column range were re-evaluated after `Total` is
# added, `Total` itself would be counted as a field and `nFields` would be
# one too high. drop = FALSE keeps a data frame when only one field exists.
fieldCounts <- OutStrings[
  , setdiff(seq_len(ncol(OutStrings)), 1:3),
  drop = FALSE
]
OutStrings$Total <- rowSums(fieldCounts)          # values used, all fields
OutStrings$nFields <- rowSums(fieldCounts != 0)   # distinct fields used

# Largest Values first; records with NA Values are placed last by order()
OutStrings <- OutStrings[order(-OutStrings$Values), ]
# Sum of Values for each distinct number of conditions (Total) used,
# largest sum first. Rows with NA Values are left out by aggregate()'s
# default na.action.
OutDistrTotal <- local({
  byTotal <- aggregate(Values ~ Total, data = OutStrings, FUN = sum)
  byTotal[order(-byTotal$Values), ]
})

# Sum of Values for each distinct number of Fields (nFields) used,
# largest sum first.
OutDistrnFields <- local({
  byFields <- aggregate(Values ~ nFields, data = OutStrings, FUN = sum)
  byFields[order(-byFields$Values), ]
})
# And calculate co-occurrence: for every record, list each pair of distinct
# wanted fields that appear together in it.
co_ocurrence <- ddply(
  .data = finalStrings, .variables = ".id",
  .fun = function(x) {
    # Sort the field names so a pair always gets the same label, whatever
    # order the fields had inside the record's string.
    present <- sort(unique(as.character(x$String)))
    # combn() stops with "n < m" when given fewer than m elements; a record
    # with a single field has no pairs, so it contributes no rows.
    if (length(present) < 2) {
      return(NULL)
    }
    data.frame(
      Combinations = combn(present, m = 2, FUN = paste, collapse = "-")
    )
  }
)

# Share of each field pair among all pairs found, most frequent first
Outcooccurrence <- data.frame(prop.table(table(co_ocurrence$Combinations)))
Outcooccurrence <- Outcooccurrence[order(-Outcooccurrence$Freq), ]
# Write each result table to its own sheet of the workbook at `outFile`.
library("xlsx")
# The first call uses append = FALSE, so it does not add to an existing
# file; the following calls append their sheet to the workbook just written.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
write.xlsx(OutStrings, file = outFile, sheetName = "Strings",
           append = FALSE, row.names = FALSE)
write.xlsx(OutDistrTotal, file = outFile, sheetName = "DistrTotal",
           append = TRUE, row.names = FALSE)
write.xlsx(OutDistrnFields, file = outFile, sheetName = "OutDistrnFields",
           append = TRUE, row.names = FALSE)
write.xlsx(Outcooccurrence, file = outFile, sheetName = "Outcooccurrence",
           append = TRUE, row.names = FALSE)