r - 教師なし学習のための合成データの生成

Question

ランダムフォレストで教師なし学習用のデータを用意したい。手順は次のとおりです。

データを取得し、すべての例に値 1 の属性「クラス」を追加します
元のデータから合成データを生成します。
- 元のデータと同じ数の例を、次のようにして作成します。
  - 元のデータのその属性のすべての値から新しい属性値を抽出する
  - すべての属性に対してそれを行い、それらを新しい例に結合します
合成データの属性「クラス」に値 2 を割り当てます
両方のデータを結合する

最終的には次のようになります。

        ...      Class
                |1
     Original   |1
     Data       |1
                |1
    --------------
                |2
     Synthetic  |2
     Data       |2
                |2

私のRコードは次のようになります。

library(gtools) # for smartbind()

# Resample one column: draw length(X) values from X with replacement.
sample1 <- function(X)   { sample(X, replace=T) } 
# Resample every column of `dat` independently (MARGIN = 2 means "by column").
# NOTE: apply() operates on a matrix, so a data frame passed in is converted
# to one first; the accepted answer below attributes the lost column types to
# this step.
g1      <- function(dat) { apply(dat,2,sample1) }

data$class <- rep(1, times=nrow(data)) # add attribute 'class' with value 1

# `class` was already added above, so it is part of the columns resampled here;
# it is overwritten with 2 on the next line.
synthData<-data.frame(g1(data[,1:ncol(data)])) # generate synthetic data by sampling from data
synthData$class <- rep(2, times=nrow(synthData)) # attribute 'class' is 2
# Give the synthetic data the same column names as the original data.
colnames(synthData) <- colnames(data)
newData <- smartbind(data, synthData) # bind the data together

私が R にまったく慣れていないことはおそらく明らかですが、これでうまく動きます。ただ 1 つ問題があります。合成データの属性の型が、元のデータの型と同じになりません。元は数値だったのに、因子 (factor) になってしまいます。合成データの生成中に同じ型を保持するにはどうすればよいですか?

ありがとうございました！

Data1 (数値が因子になります):

structure(list(V2 = c(1.51793, 1.51711, 1.51645, 1.51916, 1.51131), V3 = c(13.21, 12.89, 13.44, 14.15, 13.69), V4 = c(3.48, 3.62, 3.61, 0, 3.2), V5 = c(1.41, 1.57, 1.54, 2.09, 1.81), V6 = c(72.64, 72.96, 72.39, 72.74, 72.81), V7 = c(0.59, 0.61, 0.66, 0, 1.76), V8 = c(8.43, 8.11, 8.03, 10.88, 5.43), V9 = c(0, 0, 0, 0, 1.19), V10 = c(0, 0, 0, 0, 0), realClass = structure(c(1L, 2L, 2L, 5L, 6L), .Label = c("1", "2", "3", "5", "6", "7"), class = "factor")), .Names = c("V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "realClass"), row.names = c(27L, 138L, 77L, 183L, 186L), class = "data.frame")

Data2 (因子が文字列 (chr) になります):

structure(list(realClass = structure(c(2L, 2L, 2L, 1L, 2L), .Label = c("e", "p"), class = "factor"), V2 = structure(c(6L, 3L, 4L, 6L, 6L), .Label = c("b", "c", "f", "k", "s", "x"), class = "factor"), V3 = structure(c(4L, 4L, 3L, 1L, 1L), .Label = c("f", "g", "s", "y"), class = "factor"), V4 = structure(c(5L, 5L, 5L, 3L, 4L), .Label = c("b", "c", "e", "g", "n", "p", "r", "u", "w", "y"), class = "factor"), V5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("f", "t"), class = "factor"), V6 = structure(c(3L, 9L, 3L, 6L, 3L), .Label = c("a", "c", "f", "l", "m", "n", "p", "s", "y"), class = "factor"), V7 = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("a", "f"), class = "factor"), V8 = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("c", "w"), class = "factor"), V9 = structure(c(2L, 2L, 2L, 1L, 1L), .Label = c("b", "n"), class = "factor"), V10 = structure(c(1L, 1L, 1L, 10L, 4L), .Label = c("b", "e", "g", "h", "k", "n", "o", "p", "r", "u", "w", "y"), class = "factor"), V11 = structure(c(2L, 2L, 2L, 2L, 1L), .Label = c("e", "t"), class = "factor"), V12 = structure(c(NA, NA, NA, 1L, 1L), .Label = c("b", "c", "e", "r"), class = "factor"), V13 = structure(c(3L, 2L, 3L, 3L, 2L), .Label = c("f", "k", "s", "y"), class = "factor"), V14 = structure(c(3L, 3L, 2L, 3L, 2L), .Label = c("f", "k", "s", "y"), class = "factor"), V15 = structure(c(7L, 8L, 7L, 4L, 7L), .Label = c("b", "c", "e", "g", "n", "o", "p", "w", "y"), class = "factor"), V16 = structure(c(7L, 7L, 8L, 4L, 1L), .Label = c("b", "c", "e", "g", "n", "o", "p", "w", "y"), class = "factor"), V17 = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "p", class = "factor"), V18 = structure(c(3L, 3L, 3L, 3L, 3L), .Label = c("n", "o", "w", "y"), class = "factor"), V19 = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("n", "o", "t"), class = "factor"), V20 = structure(c(1L, 1L, 1L, 5L, 3L), .Label = c("e", "f", "l", "n", "p"), class = "factor"), V21 = structure(c(8L, 8L, 8L, 4L, 2L), .Label = c("b", "h", "k", "n", "o", "r", "u", "w", "y"), class = "factor"), V22 = structure(c(5L, 5L, 5L, 5L, 6L), .Label = c("a", "c", "n", "s", "v", "y"), class = "factor"), V23 = structure(c(3L, 3L, 5L, 1L, 2L), .Label = c("d", "g", "l", "m", "p", "u", "w"), class = "factor")), .Names = c("realClass", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20", "V21", "V22", "V23"), row.names = c(4105L, 6207L, 6696L, 2736L, 3756L), class = "data.frame")

score 2 · Accepted Answer

このトリックをいつでも使用して、数値列を作成できます

# Go through the text form of the values: as.character() is applied first and
# as.numeric() then parses the resulting strings.
# `factcol` is a placeholder, presumably a factor column whose labels are
# numbers -- substitute your own column.
numcol <- as.numeric(as.character(factcol))

しかし、data.frame に因子変数が含まれているのだと思われます。apply は行列を返すため、データに因子が 1 つでもあると、すべての数値変数も強制的に因子に変換されます。

これは、おもちゃのデータセットを使用した例です

# Toy data set with one integer, one double and one factor column.
set.seed(123)
# C is requested as a factor explicitly: since R 4.0.0 data.frame() keeps
# strings as character by default, and a factor column is what the str()
# output and the examples that follow rely on.
toydat <- data.frame(
  A = 1:10,
  B = rnorm(10),
  C = LETTERS[1:10],
  stringsAsFactors = TRUE
)
str(toydat)

## 'data.frame':    10 obs. of  3 variables:
##  $ A: int  1 2 3 4 5 6 7 8 9 10
##  $ B: num  -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
##  $ C: Factor w/ 10 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9 10

# Numeric columns only: resample A and B column by column with replacement
# and inspect the types of the result.
set.seed(1)
str(
  data.frame(
    apply(toydat[, c("A", "B")], MARGIN = 2, FUN = sample, replace = TRUE)
  )
)

## 'data.frame':    10 obs. of  2 variables:
##  $ A: num  3 4 6 10 3 9 10 7 7 1
##  $ B: num  1.5587 -0.2302 0.4609 0.0705 -1.2651 ...

# With the factor column C included: apply() first turns the data frame into a
# matrix, and a matrix holds a single type, so the numeric columns are
# converted as well. stringsAsFactors = TRUE makes data.frame() turn the
# resulting strings into factors on every R version (the default changed to
# FALSE in R 4.0.0), which is what the output shown below reflects.
set.seed(2)
str(data.frame(
  apply(toydat[, 1:3], 2, sample, replace = TRUE),
  stringsAsFactors = TRUE
))

## 'data.frame':    10 obs. of  3 variables:
##  $ A: Factor w/ 6 levels "10"," 2"," 5",..: 2 5 4 2 1 1 2 6 3 4
##  $ B: Factor w/ 8 levels " 0.129288","-0.230177",..: 8 7 6 2 1 5 3 7 1 4
##  $ C: Factor w/ 6 levels "B","D","E","G",..: 4 2 5 1 2 3 1 2 6 1

出力の形式を (**ply 関数を使用して) 制御できるため、ここでは plyr パッケージが便利です。ただし、この場合は colwise 関数で十分です。

# library() stops with an error when plyr is not installed; require() would
# only return FALSE and let the script fail later at colwise().
library(plyr)
set.seed(2)
# colwise() wraps a vector function so that it is applied to each column of a
# data frame separately; the result is again a data frame, so each column is
# resampled on its own and keeps its type (see the str() output below).
mysamplingfun <- colwise(function(x) sample(x, replace = TRUE))
str(mysamplingfun(toydat[, 1:3]))

## 'data.frame':    10 obs. of  3 variables:
##  $ A: int  2 8 6 2 10 10 2 9 5 6
##  $ B: num  1.715 1.559 -1.265 -0.23 0.129 ...
##  $ C: Factor w/ 10 levels "A","B","C","D",..: 7 4 9 2 4 5 2 4 10 2

r - 教師なし学習のための合成データの生成

1 に答える 1

Related

Reference