r - 単語のベクトルをスペルチェックするために R で反復する

Question

不適切な間隔の文でいっぱいのデータセットがあります。いくつかのスペースを削除する方法を考え出そうとしています。

単語のデータフレームに変換する文から始めます。

> word5 <- "hotter the doghou se would be bec ause the co lor was diffe rent"
> abc1 <- data.frame(filler1 = 1,words1=factor(unlist(strsplit(word5, split=" "))))
> abc1
   filler1 words1
1        1 hotter
2        1    the
3        1 doghou
4        1     se
5        1  would
6        1     be
7        1    bec
8        1   ause
9        1    the
10       1     co
11       1    lor
12       1    was
13       1  diffe
14       1   rent

次に、次のコードを使用してスペルチェックを試み、前後の単語の組み合わせである単語を結合します。

abc2 <- abc1
i <- 1
while(i < nrow(abc1)){
  print(abc2)
  if(nrow(aspell(abc1$words1[i])) == 0){
    print(paste(i,"Words OK",sep=" | "));flush.console() 
    i <- i + 1
  }
 else{
  if(nrow(aspell(abc1$words1[i])) > 0 & i != 1){
    preWord1 <- abc1$words1[i-1]
    postWord1 <- abc1$words1[i+1]
    badWord1 <- abc1$words1[i]
    newWord1 <- factor(paste(preWord1,badWord1,sep=""))
    newWord2 <- factor(paste(badWord1,postWord1,sep=""))

    if(nrow(aspell(newWord1)) == 0 & nrow(aspell(newWord2)) != 0){
      abc2[i,"words1"] <-as.character(newWord1)
      abc2 <- abc2[-c(i+1),]
      print(paste(i,"word1",sep=" | "));flush.console()
      i <- i + 1
    }

    if(nrow(aspell(newWord1)) != 0 & nrow(aspell(newWord2)) == 0){
      abc2[i ,"words1"] <-as.character(newWord2)
      abc2 <- abc2[-c(i-1),]
      print(paste(i,"word2",sep=" | "));flush.console()
      i <- i + 1
    }

  }
}
}

これでしばらく遊んだ後、ある種のイテレータが必要であるという結論に達しましたが、Rでそれを実装する方法が不明です.何か提案はありますか?

score 10 · Accepted Answer

注：以前のソリューションのすべての欠点を回避するため、まったく異なる、はるかに優れたソリューションを思いつきました。しかし、私はまだ古いソリューションを維持したいと思います。そのため、新しい回答として追加しました。間違っている場合は修正してください。

このアプローチでは、データセットを少し再フォーマットします。ベースは、私が wordpair オブジェクトと呼んでいるものです。例えば：

> word5
[1] "hotter the doghou se would be bec ause the col or was diffe rent"

次のようになります。

> abc1_pairs
    word1  word2
1  hotter    the
2     the doghou
3  doghou     se
4      se  would
5   would     be
6      be    bec
7     bec   ause
8    ause    the
9     the    col
10    col     or
11     or    was
12    was  diffe
13  diffe   rent

次に、単語ペアを反復処理し、それらが有効な単語であるかどうかを確認し、有効な新しい単語が見つからなくなるまでこれを再帰的に行います (この投稿の最後にいくつかの追加関数がリストされていることに注意してください)。

# Recursively delete wordpairs which lead to a correct word
merge_wordpairs = function(wordpairs) {
  require(plyr)
  merged_pairs = as.character(mlply(wordpairs, merge_word))
  correct_words_idxs = which(sapply(merged_pairs, word_correct))
  if(length(correct_words_idxs) == 0) {
    return(wordpairs)
  } else {
    message(sprintf("Number of words about to be merged in this pass: %s", length(correct_words_idxs)))
    for(idx in correct_words_idxs) {
      wordpairs = merge_specific_pair(wordpairs, idx, delete_pair = FALSE)
    }
    return(merge_wordpairs(wordpairs[-correct_words_idxs,])) # recursive call
  }
}

これをサンプルデータセットに適用すると、次のようになります。

> word5 <- "hotter the doghou se would be bec ause the col or was diffe rent"
> abc1 = strsplit(word5, split = " ")[[1]]
> abc1_pairs = wordlist2wordpairs(abc1)
> abc1_pairs
    word1  word2
1  hotter    the
2     the doghou
3  doghou     se
4      se  would
5   would     be
6      be    bec
7     bec   ause
8    ause    the
9     the    col
10    col     or
11     or    was
12    was  diffe
13  diffe   rent
> abc1_merged_pairs = merge_wordpairs(abc1_pairs)
Number of words about to be merged in this pass: 4
> merged_sentence = paste(wordpairs2wordlist(abc1_merged_pairs), collapse = " ")
> c(word5, merged_sentence)
[1] "hotter the doghou se would be bec ause the col or was diffe rent"
[2] "hotter the doghouse would be because the color was different"

必要な追加機能:

# A bunch of functions
# Data transformation
wordlist2wordpairs = function(word_list) {
  require(plyr)
  wordpairs = ldply(seq_len(length(word_list) - 1), 
                    function(idx) 
                      return(c(word_list[idx], 
                               word_list[idx+1])))
  names(wordpairs) = c("word1", "word2")
  return(wordpairs)
}
wordpairs2wordlist = function(wordpairs) {
  return(c(wordpairs[[1]], wordpairs[[2]][nrow(wordpairs)]))
}

# Some checking functions
# Is the word correct?
word_correct = function(word) return(nrow(aspell(factor(word))) == 0)
# Merge two words
merge_word = function(word1, word2) return(paste(word1, word2, sep = ""))

# Merge a specific pair, option to postpone deletion of pair
merge_specific_pair = function(wordpairs, idx, delete_pair = TRUE) {
  # merge pair into word
  merged_word = do.call("merge_word", wordpairs[idx,])
  # assign the pair to the idx above
  if(!(idx == 1)) wordpairs[idx - 1, "word2"] = merged_word
  if(!(idx == nrow(wordpairs))) wordpairs[idx + 1, "word1"] = merged_word
  # assign the pair to the index below (if not last one)
  if(delete_pair) wordpairs = wordpairs[-idx,]
  return(wordpairs)
}

score 3 · Accepted Answer

できることは、再帰を使用することです。以下のコードは、例を少し変更したバージョンです。すべての単語が正しいかどうかをチェックし、正しい場合は単語のリストを返します。そうでない場合は、その単語を前の単語と結合し、その後の単語と結合しようとします。前の単語のマージが正しかった場合、これはのようなマージにつながりますpaste(word_before, word, word_after)。マージを試みた後、単語をマージする関数が新しい単語リストで呼び出されます。この再帰は、間違った単語がなくなるまで続きます。

# Wrap the spell checking in a function, makes your code much more readable
word_correct = function(word) return(nrow(aspell(factor(word))) == 0)
# Merge two words
merge_word = function(word1, word2) return(paste(word1, word2, sep = ""))
# Merge two words and replace in list
merge_words_in_list = function(word_list, idx1, idx2) {
  word_list[idx1] = merge_word(word_list[idx1], word_list[idx2])
  return(word_list[-idx2])
}
# Function that recursively combines words 
combine_words = function(word_list) {
  message("Current sentence: ", paste(word_list, collapse = " "))
  words_ok = sapply(word_list, word_correct)
  if(all(words_ok)) {
    return(word_list) 
  } else {
    first_wrong_word = which(!words_ok)[1]
    combination_before = merge_word(word_list[first_wrong_word], 
                                    word_list[first_wrong_word-1])
    if(word_correct(combination_before)) {
      word_list = merge_words_in_list(word_list, first_wrong_word-1, 
                                      first_wrong_word)
    }
    combination_after = merge_word(word_list[first_wrong_word], 
                                   word_list[first_wrong_word+1])
    if(word_correct(combination_after)) {
      word_list = merge_words_in_list(word_list, first_wrong_word, 
                                      first_wrong_word+1)
    }
    return(combine_words(word_list))  # Recursive call
  }
}

この一連の関数を文の (わずかに変更した) バージョンに適用すると、次のようになります。

word5 <- "hotter the doghou se would be bec ause the col or was diffe rent"
abc1 = strsplit(word5, split = " ")[[1]]
combine_words(abc1)
Current sentence: hotter the doghou se would be bec ause the col or was diffe rent
Current sentence: hotter the doghouse would be bec ause the col or was diffe rent
Current sentence: hotter the doghouse would be because the col or was diffe rent
Current sentence: hotter the doghouse would be because the col or was different

いくつかの問題:

combination_beforeとの両方combination_afterが有効でない場合、プログラムが無限再帰に陥るという問題がまだあります。プログラムは、すべての単語が有効な場合にのみ停止します。
両方とも前の単語と結合し、次の単語が有効な場合、どうすればよいでしょうか?
コードは間違った単語のみをマージします。たとえば、'col' と 'or' はaspell、マージしたいときに良い単語と判断されます。これは新たな課題につながります。この場合、マージは明らかですが、大規模なデータセットでは、正しい単語のセット自体を組み合わせる方法が明らかでない場合があります。

それでもなお、この例は再帰的アプローチをうまく示していると思います。

r - 単語のベクトルをスペル チェックするために R で反復する

2 に答える 2

Related

Reference

r - 単語のベクトルをスペルチェックするために R で反復する