python - DictVectorizer を使用した sklearn パイプラインのカテゴリ変数

Question

以下のように、数値変数とカテゴリ変数の両方を含むデータに対してパイプラインを適用したいと考えています。

import numpy as np
import pandas as pd
from sklearn import linear_model,  pipeline, preprocessing
from sklearn.feature_extraction import DictVectorizer 

# Toy dataset: 'a' is used as the target, 'b' is numeric and
# 'c' / 'd' hold string categories.
df = pd.DataFrame(
    {
        'a': range(12),
        'b': [1, 2, 3, 1, 2, 3, 1, 2, 3, 3, 1, 2],
        'c': ['a', 'b', 'c'] * 4,
        'd': ['m', 'f'] * 6,
    }
)
# Target vector and feature frame.
y = df.loc[:, 'a']
X = df.loc[:, ['b', 'c', 'd']]

数値変数の列を示すブール型のインデックスを作成します。

# Names of the numeric feature columns.
numeric = ['b']
# Boolean mask over X.columns: True where the column is numeric.
numeric_indices = np.array(
    [name in numeric for name in X.columns],
    dtype=bool,
)

カテゴリ変数についても同様に作成します。

# Names of the categorical feature columns.
categorical = ['c', 'd']
# Boolean mask over X.columns: True where the column is categorical.
categorical_indices = np.array(
    [name in categorical for name in X.columns],
    dtype=bool,
)

次に、パイプラインを作成します

# Linear regressor used as the final step of the pipeline.
regressor = linear_model.SGDRegressor()
# DictVectorizer configured for dense output; intended to encode the categorical columns.
encoder = DictVectorizer(sparse = False)

# The FeatureUnion runs a numeric branch and a categorical branch side by side
# and concatenates their outputs; the result is passed to the regressor.
estimator = pipeline.Pipeline(steps = [       
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [        

            # Numeric branch: keep the columns flagged in numeric_indices, then scale them.
            ('numeric_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, numeric_indices])),
                # NOTE(review): with_mean = 0. is falsy, so presumably centering is disabled — confirm.
                ('scaling', preprocessing.StandardScaler(with_mean = 0.))            
                        ])),

            # Categorical branch: keep the columns flagged in categorical_indices, then encode them.
            # NOTE(review): DictVectorizer is documented to consume mappings (dicts), but here it is
            # handed a column slice; this mismatch is presumably related to the error reported
            # for estimator.fit(X, y) — confirm.
            ('categorical_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, categorical_indices])),
                ('DictVectorizer', encoder )           
                        ])),
        ])),
    ('model_fitting', regressor)
    ]
)

そして、fit を実行すると次のエラーが発生します。

estimator.fit(X, y)
ValueError: could not convert string to float: 'f'

パイプラインの中で encoder.fit() を適用する必要があることはわかっていますが、その適用方法がわかりません。また、preprocessing.OneHotEncoder() はできれば使いたくありませんが、いずれにしても文字列を浮動小数点数に変換する必要があります。

どうすればこの問題を解決できますか？

score 1 · Accepted Answer

私が思いつくのは次のような方法です。

import numpy as np
import pandas as pd
from sklearn import linear_model, metrics, pipeline, preprocessing
# Toy dataset: 'a' is the target, 'b' is numeric, 'c' and 'd' are categorical.
df = pd.DataFrame({
    'a': range(12),
    'b': [1, 2, 3, 1, 2, 3, 1, 2, 3, 3, 1, 2],
    'c': ['a', 'b', 'c'] * 4,
    'd': ['m', 'f'] * 6,
})
y = df.a
num = df[['b']]
cat = df[['c', 'd']]
from sklearn.feature_extraction import DictVectorizer

# Encode the categorical columns up front, outside the pipeline.
# DictVectorizer consumes one dict per row, which is what
# to_dict(orient='records') produces.
enc = DictVectorizer(sparse=False)
enc_data = enc.fit_transform(cat.to_dict(orient='records'))

# feature_names_ is the fitted list of generated column names (e.g. 'c=a').
# It is used instead of get_feature_names(), which newer scikit-learn
# releases no longer provide.
crat = pd.DataFrame(enc_data, columns=enc.feature_names_)
X = pd.concat([crat, num], axis=1)

# Derive the encoded column names from the encoder rather than hard-coding
# them, so the masks stay correct if the categories change.
cat_columns = list(enc.feature_names_)
cat_indices = np.array([column in cat_columns for column in X.columns], dtype=bool)
numeric_col = ['b']
num_indices = np.array([column in numeric_col for column in X.columns], dtype=bool)

reg = linear_model.SGDRegressor()
estimator = pipeline.Pipeline(steps=[
    ('feature_processing', pipeline.FeatureUnion(transformer_list=[
        # Already-encoded categorical columns are passed through unchanged.
        # np.asarray makes the boolean-mask column selection work whether the
        # transformer receives a DataFrame or an ndarray.
        ('categorical', preprocessing.FunctionTransformer(
            lambda data: np.asarray(data)[:, cat_indices])),

        # Numeric columns are selected and then standardized.
        ('numeric', pipeline.Pipeline(steps=[
            ('select', preprocessing.FunctionTransformer(
                lambda data: np.asarray(data)[:, num_indices])),
            ('scale', preprocessing.StandardScaler()),
        ])),
    ])),
    ('model', reg),
])
estimator.fit(X, y)

python - DictVectorizer を使用した sklearn パイプラインのカテゴリ変数

1 に答える 1

Related

Reference