1

周波数を検査しようとすると、問題が発生するようです。言葉と連想。

tdm を作成すると、次の情報が表示されます: TermDocumentMatrix

多くのドキュメントで、使用する用語がたくさんあることがわかります。でも!

「tdm」の内容を検査しようとすると、次の情報が表示されます: Inspecting the TDM

tdm が突然空になるのはなぜですか?

誰かが助けてくれることを願っています

tweets <- userTimeline("RDataMining", n = 1000)

(n.tweet <- length(tweets))
tweets[1:3]

#convert tweets to a data frame
tweets.df <- twListToDF(tweets)
dim(tweets.df)


##Text cleaning
library(tm)
#build a corpus and specify the source to be a character vector
myCorpus <- Corpus(VectorSource(tweets.df$text))

#convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower)) 

#remove URLs
removeURL <- function(x) gsub ("http[^[:space:]]*","",x) 
myCorpus <- tm_map(myCorpus,content_transformer(removeURL))

#remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*","",x)
myCorpus <- tm_map(myCorpus,content_transformer(removeNumPunct))

#remove stopwords + 2
myStopwords <- c(stopwords('english'),"available","via")
#remove "r" and "big" from stopwords
myStopwords <- setdiff(myStopwords, c("r","big"))
#remove stopwords from corpus
myCorpus <- tm_map(myCorpus,removeWords,myStopwords)
#remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)

#keep a copy of corpus to use later as a dictionary for stem completion
myCorpusCopy <- myCorpus

#stem words
library(SnowballC)
myCorpus <- tm_map(myCorpus,stemDocument)
stemCompletion2 <- function(x,dictionary) {
x <- unlist(strsplit(as.character(x),""))

#because stemCompletion completes an empty string to a word in dict. Remove empty string to avoid this

 x <- x[x !=""]
 x <- stemCompletion(x, dictionary = dictionary)
 x <- paste (x,sep = "",collapse = "")
 PlainTextDocument(stripWhitespace(x))
}

myCorpus <- lapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))

#count freq of "mining"
miningCases <- lapply(myCorpusCopy,
                  function(x) {grep(as.character(x),pattern = "\\<mining")})
sum(unlist(miningCases))

#count freq of "miner"
miningCases <- lapply(myCorpusCopy,
                  function(x) {grep(as.character(x),pattern = "\\<miner")})
sum(unlist(miningCases))

#count freq of "r"
miningCases <- lapply(myCorpusCopy,
                  function(x) {grep(as.character(x),pattern = "\\<r")})
sum(unlist(miningCases))

#replace "miner" with "mining"
myCorpus <- tm_map(myCorpus,content_transformer(gsub),
               pattern = "miner", replacement = "mining")

tdm <- TermDocumentMatrix(myCorpus, control = list(removePunctuation =    TRUE,stopwords = TRUE))
tdm

##Freq words and associations
idx <- which(dimnames(tdm)$Terms == "r")
inspect(tdm[idx + (0:5), 101:110])

#inspect frequent words
(freq.terms <- findFreqTerms(tdm, lowfreq = 15))
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq,term.freq >= 15)
df <- data.frame(term = names(term.freq), freq = term.freq)
4

1 に答える 1