"""Basic modeling example code.

A data-modeling workflow generally includes the following stages:

- loading the modeling sample data
- data preprocessing
- feature selection
- model training
- model validation
- exporting the modeling result

The sample code follows.
"""
# #########################################
# file: 09-sample-02-basic-01
# author: jinlong.hao
# date: 2019-12-06
# desc: basic sklearn2pmml verification code
# content:
#   0. import statements
#   1. load the data
#   2. data preprocessing
#   3. feature selection (omitted)
#   4. model training
#   5. model validation (omitted)
#   6. export the training result
# ########################################

# 0. import
import sklearn
import sklearn.impute
import sklearn.ensemble
import sklearn.linear_model
# Imported explicitly because LabelBinarizer is referenced through
# sklearn.preprocessing below.
import sklearn.preprocessing
import sklearn2pmml
import sklearn2pmml.preprocessing
from sklearn2pmml.preprocessing import ReplaceTransformer
from sklearn2pmml.preprocessing import CutTransformer
import sklearn_pandas
import pandas as pd
import numpy as np

# 1. Load the data
# 1.1 Feature data. np.nan marks the missing entries (the np.NaN alias
#     no longer exists in NumPy 2.0).
train_x = pd.DataFrame({
    'phone_brand': ['Huawei', 'Huawei', 'Apple', 'Apple', '360', '8848', np.nan],
    'phone_price': [2403, 1123, 4823, 2223, np.nan, 1583, 2222],
})

# 1.2 Target variable
train_y = pd.DataFrame({
    'result': [0, 1, 0, 1, 0, 0, 1],
})

# 2. Data preprocessing, wrapped in a DataFrameMapper
dataFrameMapper = sklearn_pandas.DataFrameMapper([
    # Brand grouping: fill missing brands with 'others', rewrite every value
    # that does not start with Huawei or Apple to 'others', then binarize.
    (['phone_brand'], [
        sklearn.impute.SimpleImputer(strategy='constant', fill_value='others'),
        ReplaceTransformer(pattern='^(?!Huawei|Apple).*', replacement='others'),
        sklearn.preprocessing.LabelBinarizer(),
    ]),
    # Price bucketing: fill missing prices with 0, cut into the two bins
    # -1..1000 and 1000..10000 (labelled '1' and '2'), then binarize.
    (['phone_price'], [
        sklearn.impute.SimpleImputer(strategy='constant', fill_value=0),
        CutTransformer(bins=[-1, 1000, 10000], labels=['1', '2']),
        sklearn.preprocessing.LabelBinarizer(),
    ]),
])

# 3. Feature selection (omitted)

# 4. Model training
# 4.1 Build the logistic-regression classifier
logistic_classifier = sklearn.linear_model.LogisticRegression()

# 4.2 Build the pipeline
logistic_pipeline = sklearn2pmml.PMMLPipeline([
    ('mapper', dataFrameMapper),
    ('classifier', logistic_classifier),
])

# 4.3 Train the model. The target is passed as the 1-D 'result' column
#     rather than a one-column DataFrame, which scikit-learn would flag with
#     a column-vector DataConversionWarning.
logistic_pipeline.fit(train_x, train_y['result'])

# 5. Model validation (omitted)

# 6. Export the model
# 6.1 Attach verification data to the pipeline. The two rows are drawn at
#     random, so they can differ from run to run.
logistic_pipeline.verify(train_x.sample(2))

# 6.2 Write the result to a PMML file
sklearn2pmml.sklearn2pmml(logistic_pipeline, '02-basic-01.pmml')