r - R: 文 (sentence) から word-table を作成すると行数が一致しません

Question

以前の質問の続きです。複数の行に分かれたテキスト (texts) があり、各テキストを単語ごとに分割した word-table を作成しようとしています。しかし、テキスト列の行数と word-table の行数が一致しないという問題が発生しています。一部のテキストから 2 行以上が生成されてしまうため、最終的にこの 2 つを cbind で結合できません。コードは以下のとおりです。どの行がどのテキストに対応するかを示せるように、word-table の行数がテキストの行数と完全に一致する結果を得たいです。

## Sample data: 14 free-text entries containing irregular runs of spaces.
texts <- c("concratulations    successfully signed   company  please find attached quick guide   can  t figure   immediately ", " conversation   laughing services  sweden", "p please find attached drafted budget   p ", "p please finad attached  agenda  today s board meeting  p ", "p hi   nbsp   p    p please find attached  darft meeting minutes  today s meeting   p ", "p please find attached  final version   minutes  updated action log  please let  know  actions   done   ll update  excel  nbsp   p ", "p hi    p    p please find attached  draft meeting minutes   action log  please provide comments   end  next week   p    p   nice spring party  saturday    p    p   tuija  p ", " p welcome team priority   hope   enjoy yo  p ", "p please find attached  flyer   can study  share   p ", "p attached new version  voice   receiver   p    p minor change request  invitation code       mentioned    invitation code may       tell  check  code  invitation email    end    alarm bell  example telling  new comments   ", "comment  etc     front page  now  seemed  end without warning    p ", "p memo attached  actions    p ", "p please find attached  updated board roles  responsibilities    made  changes   red   document   please review  especially   role  relevant contact info   prepare  comment   meeting  wednesday  nbsp   p ", "p attached documents  review  please comment  soonest   p ")
## Turn the character vector into a one-column matrix (one row per text).
texts <- cbind(texts)
## to remove multi-white spaces
MyDf <- gsub("\\s+"," ",texts)
## Replace line breaks with a space. The "\\s+" call above already matches
## line breaks, so this call should have nothing left to change.
MyDf <- gsub("\r?\n|\r", " ", MyDf)
MyDf <- cbind(MyDf)
colnames(MyDf) <- c("Introduction")

## this way, extra rows are being generated
## NOTE(review): per the read.table() documentation the number of columns is
## taken from the first five input lines only; with fill = TRUE a later line
## holding more fields is continued on additional rows, which would explain
## why the result has more rows than there are texts. Passing col.names wide
## enough for the longest text is the documented way around it - confirm.
word_table <- read.table(text = paste(gsub('\n', ' ', MyDf), collapse = '\n'), fill = TRUE)

## this way, the words are being repeated to match with the largest text
## (rbind() recycles each shorter word vector up to the longest length)
word_table <- do.call(rbind, strsplit(as.character(MyDf), " "))

補足: テキストには連続した空白やタブが含まれていました。当初は余分な空白が原因ではないかと考えましたが、余分な空白を削除した後も同じ問題が発生します。

助けてください

score 0 · Accepted Answer

解決策: `breaker` 関数と `cSplit` 関数を組み合わせるとうまくいきます。

## Sample data: 14 free-text entries containing irregular runs of spaces.
texts <- c("concratulations    successfully signed   company  please find attached quick guide   can  t figure   immediately ", " conversation   laughing services  sweden", "p please find attached drafted budget   p ", "p please finad attached  agenda  today s board meeting  p ", "p hi   nbsp   p    p please find attached  darft meeting minutes  today s meeting   p ", "p please find attached  final version   minutes  updated action log  please let  know  actions   done   ll update  excel  nbsp   p ", "p hi    p    p please find attached  draft meeting minutes   action log  please provide comments   end  next week   p    p   nice spring party  saturday    p    p   tuija  p ", " p welcome team priority   hope   enjoy yo  p ", "p please find attached  flyer   can study  share   p ", "p attached new version  voice   receiver   p    p minor change request  invitation code       mentioned    invitation code may       tell  check  code  invitation email    end    alarm bell  example telling  new comments   ", "comment  etc     front page  now  seemed  end without warning    p ", "p memo attached  actions    p ", "p please find attached  updated board roles  responsibilities    made  changes   red   document   please review  especially   role  relevant contact info   prepare  comment   meeting  wednesday  nbsp   p ", "p attached documents  review  please comment  soonest   p ")
## One-column matrix, one row per text.
texts <- cbind(texts)
## Collapse every run of whitespace (spaces, tabs, line breaks) into a single
## space and drop leading/trailing blanks. "\\s+" already covers "\r" and
## "\n", so no separate line-break replacement is needed.
MyDf <- trimws(gsub("\\s+", " ", texts))
MyDf <- cbind(MyDf)
colnames(MyDf) <- c("Introduction")
## Matrix handed to the tokeniser. It is built from the cleaned text (not the
## raw `texts`) so that consecutive spaces do not turn into empty tokens.
n <- matrix(MyDf[, 1], nrow = nrow(MyDf), ncol = ncol(MyDf))
library(splitstackshape)
library(data.table)
## Tokenise each element of X.
## The pattern splits at every single whitespace character, or where the
## Perl look-ahead (?=[.!?]) sees sentence punctuation coming next.
## Returns a list with one character vector of pieces per element of X.
breaker <- function(X) {
        token_boundary <- "[[:space:]]|(?=[.!?])"
        pieces <- strsplit(X, split = token_boundary, perl = TRUE)
        pieces
}
## Tokenise every row of `n`, then wrap the resulting list in a one-column
## list-matrix whose column is named "aaa".
aaa <- cbind(aaa = breaker(n))
#############################################################################################
## Split one or more delimited columns of a data.frame / data.table into
## their pieces, either as extra columns ("wide") or as extra rows ("long").
##
## Arguments:
##   indt       Input table; converted with setDT() when it is not already a
##              data.table.
##   splitCols  Columns to split, given as names or as numeric positions.
##   sep        Separator passed to strsplit(): a single value used for every
##              split column, or one value per split column.
##   direction  "wide" (one new column per piece) or "long" (one row per piece).
##   makeEqual  Whether to pad every row to the same number of pieces with NA.
##              Always forced to TRUE for "wide"; derived from the data for
##              "long" when left NULL.
##   fixed      Passed to strsplit() as its `fixed` argument.
##   drop       "wide" only: remove the original split columns from the result.
##   stripWhite If TRUE, whitespace around each separator is removed before
##              splitting.
##
## Returns the data.table produced by the branch that was taken (see below).
## A message about the splitstackshape package is emitted on every call.
cSplit <- function(indt, splitCols, sep = ",", direction = "wide", 
                   makeEqual = NULL, fixed = TRUE, drop = TRUE, 
                   stripWhite = FALSE) {
        message("`cSplit` is now part of the 'splitstackshape' package (V1.4.0)")
        ## requires data.table >= 1.8.11
        ## NOTE(review): require() only returns FALSE when the package is
        ## missing, so a missing data.table surfaces later as an unrelated error.
        require(data.table)
        if (!is.data.table(indt)) setDT(indt)
        ## Numeric positions are translated into column names.
        if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
        ## strsplit() needs character input: coerce the split columns when at
        ## least one of them is not character.
        if (any(!vapply(indt[, splitCols, with = FALSE],
                        is.character, logical(1L)))) {
                indt[, eval(splitCols) := lapply(.SD, as.character),
                     .SDcols = splitCols]
        }

        ## One separator per split column: recycle a single value, reject any
        ## other length mismatch.
        if (length(sep) == 1) 
                sep <- rep(sep, length(splitCols))
        if (length(sep) != length(splitCols)) {
                stop("Verify you have entered the correct number of sep")
        }

        ## Replace "<spaces>sep<spaces>", "<spaces>sep" and "sep<spaces>" by
        ## the bare separator. The separator is pasted into a regular
        ## expression here, so a separator containing regex metacharacters may
        ## not behave literally - NOTE(review): confirm before relying on it.
        if (isTRUE(stripWhite)) {
                indt[, eval(splitCols) := mapply(function(x, y) 
                        gsub(sprintf("\\s+%s\\s+|\\s+%s|%s\\s+", 
                                     x, x, x), x, y), 
                        sep, indt[, splitCols, with = FALSE], 
                        SIMPLIFY = FALSE)]
        }  

        ## X: for each split column, a list holding the pieces of every row.
        X <- lapply(seq_along(splitCols), function(x) {
                strsplit(indt[[splitCols[x]]], split = sep[x], fixed = fixed)
        })

        ## Decide whether rows have to be padded to a common number of pieces.
        if (direction == "long") {
                if (is.null(makeEqual)) {
                        ## Compares the per-row piece counts of the split columns.
                        ## NOTE(review): Reduce() feeds the previous TRUE/FALSE
                        ## back into identical(), and with a single split column
                        ## it returns the counts themselves rather than a
                        ## logical - verify this gives the intended answer.
                        IV <- function(x,y) if (identical(x,y)) TRUE else FALSE
                        makeEqual <- ifelse(Reduce(IV, rapply(X, length, how = "list")),
                                            FALSE, TRUE)
                }
        } else if (direction == "wide") {
                ## Wide output always needs equal lengths; an explicit
                ## non-TRUE value is overridden with a message.
                if (!is.null(makeEqual)) {
                        if (!isTRUE(makeEqual)) {
                                message("makeEqual specified as FALSE but set to TRUE")
                                makeEqual <- TRUE
                        }
                        makeEqual <- TRUE
                } else {
                        makeEqual <- TRUE
                }
        }
        if (isTRUE(makeEqual)) {
                ## For every split column build a two-column index matrix
                ## (row number, position of the piece within the row) together
                ## with the flattened piece values.
                SetUp <- lapply(seq_along(X), function(y) {
                        A <- vapply(X[[y]], length, 1L)
                        list(Mat = cbind(rep(seq_along(A), A), sequence(A)),
                             Val = unlist(X[[y]]))
                })    
                ## Largest number of pieces found in any row of any split column.
                Ncol <- max(unlist(lapply(SetUp, function(y) y[["Mat"]][, 2]), 
                                   use.names = FALSE))
                ## Fill an NA character matrix (rows of indt x Ncol) through
                ## matrix indexing, so shorter rows stay NA-padded.
                X <- lapply(seq_along(SetUp), function(y) {
                        M <- matrix(NA_character_, nrow = nrow(indt), ncol = Ncol)
                        M[SetUp[[y]][["Mat"]]] <- SetUp[[y]][["Val"]]
                        M
                })
                if (direction == "wide") {
                        ## New columns are named "<split column>_<position>".
                        X <- lapply(seq_along(X), function(x) {
                                colnames(X[[x]]) <- paste(splitCols[x], 
                                                          sequence(ncol(X[[x]])), 
                                                          sep = "_")
                                X[[x]]
                        })
                        ## Append the piece columns; with drop = TRUE the
                        ## original split columns are removed afterwards.
                        if (isTRUE(drop)) {
                                cbind(indt, do.call(cbind, X))[, eval(splitCols) := NULL][]
                        } else {
                                cbind(indt, do.call(cbind, X))
                        }
                } else {
                        ## Long and padded: repeat each input row Ncol times and
                        ## overwrite the split columns with the pieces, row by row.
                        indt <- indt[rep(sequence(nrow(indt)), each = Ncol)]
                        X <- lapply(X, function(y) as.vector(t(y)))
                        indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
                }  
        } else {
                ## No padding: repeat each row once per piece of the first
                ## split column and overwrite the split columns with the pieces.
                Rep <- vapply(X[[1]], length, integer(1L))
                indt <- indt[rep(sequence(nrow(indt)), Rep)]
                indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
        }
}

## Split column "aaa" on "," into one column per piece (short rows are padded
## with NA), then put the original text in front of the pieces.
## The clean-up below presumably exists because each list element is turned
## into text of the form c("w1", "w2", ...) before it is split - confirm.
df <- cSplit(as.data.frame(aaa), "aaa", ",")
df <- data.frame(cbind(texts, df))

######################################################################################
## Heading
Heading <- df[ ,1]

## Word Table
df <- df[ ,2:ncol(df)]

## first column
## Remove a leading "c(" only where it is actually present, instead of
## blindly cutting off the first two characters.
aaa_first <- df[,1]
aaa_first <- cbind(aaa_first)
c <- sub("^c\\(", "", aaa_first)

## last column
## Remove a trailing ")" only where it is actually present, instead of
## blindly cutting off the last character.
aaa_end <- df[ ,ncol(df)]
aaa_end <- cbind(aaa_end)
e <- sub("\\)$", "", aaa_end)

## Middle columns
## Columns 2 .. (last - 1), written with explicit parentheses so the range
## does not depend on ":" binding tighter than "-".
d <- df[ , 2:(ncol(df) - 1)]

cc <- cbind(Heading, c, d, e )
## cc <- cbind( c, d, e )

cc <- data.frame(lapply(cc, as.character), stringsAsFactors = FALSE)

## Strip the remaining ")" (rows shorter than the longest one end in a middle
## column) and the double quotes. fixed = TRUE matches the characters
## literally, so no regular-expression escaping is involved; assigning into
## `[]` keeps the data frame shape and column names.
df2 <- cc
df2[] <- lapply(cc, gsub, pattern = ")", replacement = "", fixed = TRUE)
# df2[] <- lapply(df2, gsub, pattern = "(", replacement = "", fixed = TRUE)
df3 <- df2
df3[] <- lapply(df2, gsub, pattern = "\"", replacement = "", fixed = TRUE)

r - R: 文 (sentence) から word-table を作成すると行数が一致しません

回答 1 件

Related

Reference