这是本文档旧的修订版!
# ############################################################
# File: 02-preprocess-04-OneHotEncoder
# Author: jinlong.hao
# Date: 2019-12-04
# OneHotEncoder: discretizes data into dummy (one-hot) variables.
# sklearn 0.19 and earlier only supported integer data; 0.20 and later
# also support string data.
# Desc:
#   1. import statements
#   2. build the data
#   3. transform with DataFrameMapper
#   4. transform with DataFrameMapper combined with OneHotEncoder
# ############################################################

# 1. Imports
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper
import numpy as np
import pandas as pd
from sklearn2pmml.preprocessing import CutTransformer

# 2. Load the sample data
df = pd.DataFrame(data={
    'age': [3, 3, 7, 4, 2, 4],
    'salary': [36, 39, 17, 82, 42, 10],
    'name': ['james', 'jucy', 'jessica', 'tony', 'steve', 'jam'],
})

# 3. Use the encoders through DataFrameMapper.
#    'age' is encoded twice (OneHotEncoder and LabelBinarizer) and 'name'
#    once; df_out=True requests a DataFrame as the result.
dataFrameMapper = DataFrameMapper(
    features=[
        (['age'], OneHotEncoder(handle_unknown='ignore')),
        (['age'], LabelBinarizer()),
        (['name'], OneHotEncoder(handle_unknown='ignore')),
    ],
    df_out=True,
)
dataFrameMapper.fit_transform(df)

# 4. OneHotEncoder cannot be chained after another transformer inside a
#    DataFrameMapper step: the call below is expected to raise
#    "ValueError: ['age']: Expected 2D array, got 1D array instead".
dataFrameMapper2 = DataFrameMapper(
    features=[
        (['age'], [
            CutTransformer(bins=[0, 3, 5, 200], labels=['1', '2', '3']),
            OneHotEncoder(),  # raises an error; LabelBinarizer() is needed here
        ]),
    ],
    df_out=True,
)
dataFrameMapper2.fit_transform(df)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py in check_array (array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 519 "Reshape your data either using array.reshape(-1, 1) if " 520 "your data has a single feature or array.reshape(1, -1) " --> 521 "if it contains a single sample.".format(array)) 522 523 # in the future np.flexible dtypes will be handled like object dtypes ValueError: ['age']: Expected 2D array, got 1D array instead: array=['1' '1' '3' '2' '1' '2' '2' '3' '1']. Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
在DataFrameMapper的多步转换链中,不要使用OneHotEncoder,改用LabelBinarizer,修改后的代码如下:
# 4. OneHotEncoder cannot be chained inside a DataFrameMapper step;
#    use LabelBinarizer instead.
#    The failing variant selected the column as ['age'] and ended the chain
#    with OneHotEncoder():
#        (['age'], [
#            CutTransformer(bins=[0, 3, 5, 200], labels=['1', '2', '3']),
#            OneHotEncoder()])
#    The working variant below selects the column as the plain string 'age'
#    and ends the chain with LabelBinarizer().
dataFrameMapper2 = DataFrameMapper(
    features=[
        ('age', [
            CutTransformer(bins=[0, 3, 5, 200], labels=['1', '2', '3']),
            LabelBinarizer(),
        ]),
    ],
    df_out=True,
)
dataFrameMapper2.fit_transform(df)
# #########################################
# file: 09-sample-02-basic-01
# author: jinlong.hao
# date: 2019-12-06
# desc: basic sklearn2pmml validation code
# content:
# ########################################

# 1. Imports
import sklearn
import sklearn.impute
import sklearn.ensemble
import sklearn.linear_model
# sklearn.preprocessing is used below, so import it explicitly instead of
# relying on another sklearn submodule to load it as a side effect.
import sklearn.preprocessing
import sklearn2pmml
import sklearn2pmml.preprocessing
from sklearn2pmml.preprocessing import ReplaceTransformer
import sklearn_pandas
import pandas as pd
import numpy as np

# 2. Load the data
# 2.1 Feature data (np.nan rather than np.NaN: the NaN alias was removed in
#     NumPy 2.0, while np.nan works on every NumPy version)
train_x = pd.DataFrame({
    'phone_brand': ['Huawei', 'Huawei', 'Apple', 'Apple', '360', '8848', np.nan],
    'phone_price': [2403, 1123, 4823, 2223, np.nan, 1583, 2222],
})
# Rebuild the frame from its raw values, keeping the column names.
# NOTE(review): presumably done to normalise the column dtypes - confirm.
train_x = pd.DataFrame(data=train_x.values, columns=train_x.columns)

# 2.2 Target data (train_y)
train_y = pd.DataFrame({
    'result': [0, 1, 0, 1, 0, 0, 1],
})
train_y = pd.DataFrame(data=train_y.values, columns=train_y.columns)

# 3. Build the DataFrameMapper preprocessing step
# NOTE: CutTransformer is given no `labels` here. The export in step 5 was
# reported to fail in the JPMML converter with a PickleException mentioning
# pandas._libs.interval.Interval; passing `labels=` to CutTransformer is the
# documented remedy.
dataFrameMapper = sklearn_pandas.DataFrameMapper([
    (['phone_brand'], [
        # Missing brands become the constant 'others'.
        sklearn.impute.SimpleImputer(strategy='constant', fill_value='others'),
        # Every brand that does not start with Huawei or Apple becomes 'others'.
        ReplaceTransformer(pattern='^(?!Huawei|Apple).*', replacement='others'),
        sklearn.preprocessing.LabelEncoder(),
        sklearn.preprocessing.LabelBinarizer(),
    ]),
    (['phone_price'], [
        # Missing prices become 0.
        sklearn.impute.SimpleImputer(strategy='constant', fill_value=0),
        # Bin the prices using the edges -1, 1000 and 10000.
        sklearn2pmml.preprocessing.CutTransformer([-1, 1000, 10000]),
        sklearn.preprocessing.LabelEncoder(),
        sklearn.preprocessing.LabelBinarizer(),
    ]),
])
dataFrameMapper.fit_transform(train_x)

# 4. Model training
# 4.1 Build the logistic regression classifier
logistic_classifier = sklearn.linear_model.LogisticRegression()

# 4.2 Build the logistic regression pipeline
logistic_pipeline = sklearn2pmml.PMMLPipeline([
    ('mapper', dataFrameMapper),
    ('classifier', logistic_classifier),
])

# 4.3 Run the training
logistic_pipeline.fit(train_x, train_y)

# 5. Export the model as PMML
sklearn2pmml.sklearn2pmml(logistic_pipeline, '02-basic-01.pmml')
Standard output is empty Standard error: 十二月 09, 2019 5:32:59 下午 org.jpmml.sklearn.Main run 信息: Parsing PKL.. 十二月 09, 2019 5:32:59 下午 org.jpmml.sklearn.Main run 严重: Failed to parse PKL net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for pandas._libs.interval.Interval) at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23) at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:773) at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:213) at net.razorvine.pickle.Unpickler.load(Unpickler.java:123) at numpy.core.NDArrayUtil.readObject(NDArrayUtil.java:378) at numpy.core.TypeDescriptor.read(TypeDescriptor.java:163) at numpy.core.NDArrayUtil.parseArray(NDArrayUtil.java:214) at numpy.core.NDArrayUtil.parseData(NDArrayUtil.java:189) at joblib.NumpyArrayWrapper.toArray(NumpyArrayWrapper.java:43) at org.jpmml.sklearn.PickleUtil$1.dispatch(PickleUtil.java:88) at net.razorvine.pickle.Unpickler.load(Unpickler.java:123) at org.jpmml.sklearn.PickleUtil.unpickle(PickleUtil.java:98) at org.jpmml.sklearn.Main.run(Main.java:104) at org.jpmml.sklearn.Main.main(Main.java:94)
在使用CutTransformer时,指定labels参数,修改如下:
# 3. Build the DataFrameMapper preprocessing step
dataFrameMapper = sklearn_pandas.DataFrameMapper(features=[
    (['phone_brand'], [
        sklearn.impute.SimpleImputer(strategy='constant', fill_value='others'),
        ReplaceTransformer(pattern='^(?!Huawei|Apple).*', replacement='others'),
        sklearn.preprocessing.LabelEncoder(),
        sklearn.preprocessing.LabelBinarizer(),
    ]),
    (['phone_price'], [
        sklearn.impute.SimpleImputer(strategy='constant', fill_value=0),
        sklearn2pmml.preprocessing.CutTransformer(bins=[-1, 1000, 10000], labels=['1', '2']),
        # With explicit labels, the LabelEncoder step used previously
        # (sklearn.preprocessing.LabelEncoder()) is no longer needed.
        sklearn.preprocessing.LabelBinarizer(),
    ]),
])
dataFrameMapper.fit_transform(train_x)