这里会显示出您选择的修订版和当前版本之间的差别。
| — |
model:preprocess:cut [2020/07/12 12:07] (当前版本) |
||
|---|---|---|---|
| 行 1: | 行 1: | ||
| + | ====== 使用CutTransformer进行分段 ====== | ||
| + | 连续型特征的分段和离散化,一般使用 CutTransformer、LabelBinarizer来完成,示例代码如下: | ||
| + | |||
| + | <code python> | ||
| + | |||
| + | # ##################################################### | ||
| + | # File: 02-preprocess-05-CutTransformer | ||
| + | # Author: jinlong.hao | ||
| + | # Date: 2019-12-04 | ||
| + | # CutTransformer: 连续变量分段函数,类似于pd.cut(),但是可以整合到pipeline中 | ||
| + | # 与KBinsDiscretizer的区别在于,此处是手工进行分段 | ||
| + | # DESC: | ||
| + | # 1. import | ||
| + | # 2. 加载示例数据 | ||
| + | # 3. 基础使用 | ||
| + | # 4. 与DataFrame整合使用 | ||
| + | # #################################################### | ||
| + | |||
| + | # 1. import | ||
| + | from sklearn.preprocessing import OneHotEncoder, LabelBinarizer | ||
| + | from sklearn.preprocessing import OrdinalEncoder, Binarizer, LabelEncoder | ||
| + | from sklearn_pandas import DataFrameMapper | ||
| + | import numpy as np | ||
| + | import pandas as pd | ||
| + | from sklearn2pmml.preprocessing import CutTransformer | ||
| + | |||
| + | |||
| + | # 2. 加载示例数据 | ||
| + | df = pd.DataFrame({ | ||
| + | 'age': [3, 3, 7, 4, 2, 4, 4, 7,2], | ||
| + | 'salary': [36, 39, 17, 82, 42, 10, 83, 19, 22] | ||
| + | }) | ||
| + | |||
| + | |||
| + | # 4. 配合DataFrameMapper,多个转化连续进行 | ||
| + | dataFrameMapper_continue = DataFrameMapper([ | ||
| + | #('age', [CutTransformer([0, 2, 5, 10]), | ||
| + | # LabelEncoder(), | ||
| + | # LabelBinarizer()]), # 该方式在生成pmml时报错 | ||
| + | ('age', [ | ||
| + | CutTransformer([0, 2, 5, 10], labels=['1', '2', '3']), | ||
| + | LabelBinarizer()]) | ||
| + | ], df_out=True) | ||
| + | df_continue = dataFrameMapper_continue.fit_transform(df) | ||
| + | </code> | ||