How do I store a Term Document Matrix's sparsity and maximal term length in separate R variables while finding ngrams?
library(tm)
library(RWeka)
library(SnowballC)  #stemDocument below needs this package installed
#(earlier attempt to capture the printed summary as text via a connection)
#stdout <- vector('character')
#con <- textConnection('stdout', 'w', local = TRUE)
#reading the csv file
worklog <- read.csv("To_Kamal_WorkLogs.csv", stringsAsFactors = FALSE)
#removing the unwanted columns
cols <- c("A","B","C","D","E","F")
colnames(worklog) <- cols
worklog2 <- worklog[c("F")]
#removing non-ASCII characters (convert the column in place; the original stored
#iconv() of the whole data frame in an unused variable z)
worklog2$F <- iconv(worklog2$F, "latin1", "ASCII", sub = "")
#cleaning the data: removing date and time stamps
#([AP]M instead of [A,P][M], which would also match a comma)
worklog2$F <- gsub("[0-9]+/[0-9]+/[0-9]+ [0-9]+:[0-9]+:[0-9]+ [AP]M", "", worklog2$F)
#loading the vector Data to corpus
a <- Corpus(VectorSource(worklog2$F))
#cleaning the data
a <- tm_map(a,removeNumbers)
a <- tm_map(a,removePunctuation)
a <- tm_map(a,stripWhitespace)
a <- tm_map(a, content_transformer(tolower))  #wrap base tolower for tm >= 0.6; no PlainTextDocument workaround needed
a <- tm_map(a,removeWords,stopwords("english"))
a <- tm_map(a,stemDocument,language = "english")
#removing custom stopwords (a variable named "stopwords" would mask tm::stopwords, so use another name)
customStopwords <- "open"
if (!is.null(customStopwords)) a <- tm_map(a, removeWords, words = as.character(customStopwords))
#finding 2-grams (3- and 4-grams follow the same pattern; see the sketch below)
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm2 <- TermDocumentMatrix(a, control = list(tokenize = bigramTokenizer))
tdm2 <- removeSparseTerms(tdm2, 0.75)
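For the 3- and 4-grams mentioned above, the same tokenizer pattern should apply; a minimal sketch (trigramTokenizer, quadgramTokenizer, tdm3 and tdm4 are placeholder names, not part of the original script):

trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3 <- TermDocumentMatrix(a, control = list(tokenize = trigramTokenizer))
quadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdm4 <- TermDocumentMatrix(a, control = list(tokenize = quadgramTokenizer))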
#output
> tdm2
<<TermDocumentMatrix (terms: 27, documents: 8747)>>
Non-/sparse entries: 87804/148365
Sparsity : 63%
Maximal term length: 20
Weighting : term frequency (tf)
How can I store the sparsity, maximal term length, weighting, and non-/sparse entries shown above in separate variables?
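A minimal sketch of one way to do this, assuming tm's usual internals: a TermDocumentMatrix is a slam simple_triplet_matrix that stores only non-zero cells, and the printed summary is derived from its slots plus a "weighting" attribute, so each figure can be recomputed directly (the variable names below are my own):

nonsparse <- length(tdm2$v)                        #non-sparse (non-zero) entries
total     <- prod(dim(tdm2))                       #terms x documents
sparse    <- total - nonsparse                     #sparse (empty) entries
sparsity  <- round((1 - nonsparse / total) * 100)  #percentage, as printed
maxlen    <- max(nchar(Terms(tdm2)))               #maximal term length
weighting <- attr(tdm2, "weighting")               #e.g. c("term frequency", "tf")

Alternatively, in the spirit of the commented-out textConnection attempt at the top, the printed block can be captured as text and parsed:

out <- capture.output(print(tdm2))  #character vector, one printed line per element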