3

私は R を使用してテキスト マイニングでターム ペーパーを作成しています。私たちのタスクは、記事のトーン (ポジティブ/ネガティブ) を推測することです。記事はそれぞれのフォルダに保存されます。サンプルのトレーニングを通じて学習する分類システムを作成する必要があります。http://www.youtube.com/watch?v=j1V2McKbkLoのコードを再利用し ました。最後の行を除くコード全体が正常に実行されました。以下はコードです。

tone<- c("Positive", "Negative")
folderpath <- "C:/Users/Tanmay/Desktop/R practice/Week8"

options(stringAsFactors = FALSE)

corpus<-Corpus(DirSource(folderpath))
corpuscopy<-corpus
summary(corpus)
inspect(corpus)

#Clean data
CleanCorpus <- function(corpus){

  corpustemp <- tm_map(corpus, removeNumbers)
  corpustemp <- tm_map(corpus, removePunctuation)
  corpustemp <- tm_map(corpus, tolower)
  corpustemp <- tm_map(corpus, removeWords, stopwords("english"))
  corpustemp <- tm_map(corpus, stemDocument,language="english")
  corpustemp <- tm_map(corpus, stripWhitespace)

  return(corpustemp )
}


#Document term matrix
generateTDM <- function(tone,path) {

  corpusdir <- sprintf("%s/%s",path,tone)
  corpus<- Corpus(DirSource( directory=corpusdir ,encoding = "ANSI"))
  corpustemp <- CleanCorpus(corpus)
  corpusclean <- DocumentTermMatrix(corpustemp)
  corpusclean <- removeSparseTerms(corpusclean , 0.7)
  result <- list(Tone = tone, tdm = corpusclean) 
}

tdm <- lapply(tone,generateTDM,path=folderpath)

#Attach tone
ToneBindTotdm <- function(tdm){
  temp.mat <- data.matrix(tdm[["tdm"]])
  temp.df <- as.data.frame(temp.mat)
  temp.df <- cbind(temp.df,rep(tdm[["Tone"]]),nrow(temp.df))
  colnames(temp.df)[ncol(temp.df)] <- "PredictTone"
  return(temp.df)
}
Tonetdm <- lapply(tdm,ToneBindTotdm)


#Stack
Stacktdm <- do.call(rbind.fill,Tonetdm)
Stacktdm[is.na(Stacktdm)] <- 0


#Holdout

trainid <- sample(nrow(Stacktdm),ceiling(nrow(Stacktdm) * 0.7))
testid <- (1:nrow(Stacktdm)) [- trainid]

#knn
tdmone <- Stacktdm[,"PredictTone"]
tdmone.nl <- Stacktdm[, !colnames(Stacktdm) %in% "PredictTone"]

knnPredict <- knn(tdmone.nl[trainid,],tdmone.nl[testid,],tdmone[trainid],k=5)

これを実行しようとすると、最後の行 (knn) でエラーが発生しました:

**Error in knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  : 
  NA/NaN/Inf in foreign function call (arg 6)
In addition: Warning messages:
1: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  :
  NAs introduced by coercion
2: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid],  :
  NAs introduced by coercion**

誰でも私を助けてください。また、他にもっと簡単で良い分類方法があれば教えてください。ありがとうございます。

4

1 に答える 1

1

I was stuck on the same issue. But I modified it my way to remove all the NA values. You can check my code and compare what might be the problem in your code.

#init
libs <- c("tm" , "plyr" , "class")
lapply(libs,require, character.only=TRUE)

#set options
options(stringsAsFactors = FALSE)

#set parameters

candidates <- c("user1" , "user2" ,"test")
pathname <- "C:/Users/prabhjot.rai/Documents/Project_r/textMining"

#clean text

cleanCorpus <- function(corpus)
{
  corpus.tmp <- tm_map(corpus, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
  corpus.tmp <- tm_map(corpus.tmp, PlainTextDocument)
}

#build TDM

generateTDM <- function(cand,path)
{
  s.dir <- sprintf("%s/%s", path, cand)
  s.cor <- Corpus(DirSource(directory = s.dir))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)

  s.tdm <- removeSparseTerms(s.tdm, 0.7)
  result <- list(name = cand , tdm = s.tdm)
}



tdm <- lapply(candidates, generateTDM, path = pathname)


test <- t(data.matrix(tdm[[1]]$tdm))
rownames(test) <- c(1:nrow(test))

#attach name and convert to dataframe
makeMatrix <- function(thisTDM){

  test <- t(data.matrix(thisTDM$tdm))
  rownames(test) <- c(1:nrow(test))
  test <- as.data.frame(test, stringsAsFactors = F , na.rm = T)
  test$candidateName <- thisTDM$name
  test <- as.data.frame(test, stringsAsFactors = F , na.rm = T)
}

candTDM <- lapply(tdm, makeMatrix)

# stack all the speeches together

tdm.stack <- do.call(rbind.fill, candTDM)
tdm.stack[is.na(tdm.stack)] <- as.numeric(0)

#testing and training sets
train <- tdm.stack[ tdm.stack$candidateName!= 'test' ,  ]
train <- train[, names(train) != 'candidateName']
test <- tdm.stack[ tdm.stack$candidateName == 'test' , ]
test <- test[, names(test) != 'candidateName']
classes <- tdm.stack [ tdm.stack$candidateName != 'test' , 'candidateName']
classes <- as.factor(classes)

myknn <- knn(train=train, test = test , cl = classes , k=1)
myknn

Keep a testing file in the test folder next to user1 and user2 folders to check the output of this algorithm. And keep the value of k as the square root of number of speeches, preferably an odd number. And ignore the redundancy of testing and training set assignment. It was not working in one line in my machine so did it in two lines.

于 2015-08-21T06:23:13.060 に答える