这里会显示出您选择的修订版和当前版本之间的差别。
| — |
model:preprocess:group [2020/07/12 12:07] (当前版本) |
||
|---|---|---|---|
| 行 1: | 行 1: | ||
| + | ====== 使用ReplaceTransformer进行离散数据分组 ====== | ||
| + | 离散数据分组,一般可以通过 ReplaceTransformer 来实现,示例如下: | ||
| + | |||
| + | <code python> | ||
| + | # ##################################################### | ||
| + | # File: 02-preprocess-07-ReplaceTransformer | ||
| + | # Author: jinlong.hao | ||
| + | # Date: 2019-12-04 | ||
| + | # ReplaceTransformer: 通过正则替换实现离散型变量的合并 | ||
| + | # DESC: | ||
| + | # 1. import | ||
| + | # 2. 加载示例数据 | ||
| + | # 3. 基础使用 | ||
| + | # 4. 与DataFrameMapper配合使用 | ||
| + | # #################################################### | ||
| + | |||
| + | # 1. import | ||
| + | from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, LabelEncoder | ||
| + | from sklearn_pandas import DataFrameMapper | ||
| + | import numpy as np | ||
| + | import pandas as pd | ||
| + | from sklearn2pmml.preprocessing import ReplaceTransformer | ||
| + | |||
| + | # 2. 加载示例数据 | ||
| + | df = pd.DataFrame({ | ||
| + | 'age': [3, 3, 4, 4, 2, 2, 1, 7, 8], | ||
| + | 'name': ['james', 'james河北', 'jessica', 'jessica2', 'steve', 'steve', 'lili', 'lucy', 'stone'] | ||
| + | }) | ||
| + | |||
| + | # 4. 与DataFrameMapper配合使用 | ||
| + | dataFrameMapper = DataFrameMapper([ | ||
| + | ('name', [ | ||
| + | ReplaceTransformer(pattern='^(?!james|jessica|steve).*', replacement='others'), | ||
| + | ReplaceTransformer(pattern='^jessica.*', replacement='jessica'), | ||
| + | ReplaceTransformer(pattern='^james.*', replacement='james'), | ||
| + | LabelBinarizer() | ||
| + | ]) | ||
| + | ], df_out=True) | ||
| + | dataFrameMapper.fit_transform(df) | ||
| + | </code> | ||