python 多类别变量二分类问题 logistic回归 神经网络
1 数据准备
1.1数据样式
全是中文字符串的离散类别型变量
1.2 数据读入
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the spreadsheet and discard every row that contains at least one NaN.
df = pd.read_excel("info.xlsx").dropna(axis=0, how='any')
1.3 one-hot编码及特征提取
类别型变量需要先经过 one-hot 编码,才能用于决策树、逻辑回归等模型。
例如,抽样地点取值为“生产单位”和“零售”时,
分别被编码为 01 和 10。
# ---- One-hot encoding: expand the categorical columns of df into indicator columns ----
data_dummies = pd.get_dummies(df)

# Independent variables: the contiguous, label-based column slice below
# (both end labels are included by .loc).
# To inspect the generated column names, run: print(data_dummies.columns)
features = data_dummies.loc[:, '被抽样单位性质_个体/个人':'抽样地点_零售']

# Plain NumPy arrays for the estimators: X is the feature matrix, y the target indicator.
X = features.to_numpy()
y = data_dummies['检验结果_符合规定'].to_numpy()
1.4 数据集划分
按照 6:2:2 的比例划分训练集、验证集和测试集(数据量在 1 万条左右时的常用比例)
# Hold out 20% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)
# Carve the validation set out of the remaining 80%. A fraction of 0.25 here
# equals 0.25 * 0.8 = 20% of the full data, which yields the intended 6:2:2
# train/validation/test ratio (0.2 here would give 64:16:20 instead).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1)
2 logistic 回归法
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier (default settings) on the training split.
logreg = LogisticRegression().fit(X_train, y_train)
# Score the fitted classifier on the held-out test split and report it.
test_accuracy = logreg.score(X_test, y_test)
print("logistic test score: {:.2f}".format(test_accuracy))
3 神经网络法
用的keras
ref
keras model.fit
https://www.jianshu.com/p/9ba27074044f
from keras.models import Sequential
from keras.layers import Dense,Activation
from keras.optimizers import SGD
from keras.models import load_model
# ---------------- Build the model ----------------
model = Sequential()
# The input width is read from the training matrix instead of being hard-coded
# to 126, so the network still builds when one-hot encoding produces a
# different number of feature columns.
model.add(Dense(32, input_dim=X_train.shape[1]))  # first hidden layer: 32 units
model.add(Activation('relu'))  # ReLU activation
# Second hidden layer: 200 units, no regularizer.
# NOTE(review): no Activation follows this layer, so its output feeds the next
# Dense layer linearly -- confirm whether a ReLU was intended here.
model.add(Dense(200))
model.add(Dense(100))  # third hidden layer: 100 units
model.add(Activation('relu'))
model.add(Dense(1))  # output layer: a single unit
model.add(Activation('sigmoid'))  # sigmoid output for the binary target
# ---------------- Train the model ----------------
# Stochastic gradient descent with Nesterov momentum and learning-rate decay.
# NOTE(review): newer Keras releases name the first argument `learning_rate`
# and may not accept `decay` -- confirm against the installed Keras version.
sgd = SGD(lr=0.01, decay=1e-5, momentum=0.9, nesterov=True)
# Compile: binary cross-entropy loss, accuracy reported as the metric.
model.compile(optimizer=sgd,
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Fit for 50 epochs with mini-batches of 5, evaluating on the validation split
# after each epoch. `epochs` replaces the legacy `nb_epoch` keyword.
history = model.fit(X_train, y_train,
                    epochs=50, batch_size=5,
                    validation_data=(X_val, y_val))
# Metrics recorded by model.fit, keyed by metric name.
history_dict = history.history
history_dict.keys()

# ---------------- Plot training vs. validation loss ----------------
import matplotlib.pyplot as plt

loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)  # epoch numbers starting at 1

# Label the figure first, then draw both curves and the legend.
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
for curve, fmt, curve_label in (
        (loss_values, 'bo', 'Training loss'),        # blue dots
        (val_loss_values, 'b', 'Validation loss'),   # blue line
):
    plt.plot(epochs, curve, fmt, label=curve_label)
plt.legend()
# Persist the trained network to disk.
model.save("my_model")
# evaluate() returns the loss followed by the compiled metrics; the model is
# compiled with metrics=['accuracy'], so NN_score is [loss, accuracy].
NN_score = model.evaluate(X_test, y_test)
# Actually format the values: passing the template and the list as two
# separate print() arguments would emit the literal "{:.2f}" placeholder.
print("NN test score: loss={:.2f}, accuracy={:.2f}".format(*NN_score))
注意:model.evaluate() 返回的是损失值(loss)以及各评估指标(metrics)的值,而不是单个分数。
4 决策树
REF
https://blog.csdn.net/bqw18744018044/article/details/82598131#commentBox