交叉验证
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.datasets import load_digits
# Cross-validation: estimate the accuracy of a scale-then-classify pipeline
# on the digits dataset with shuffled 10-fold splits.
digits = load_digits()
features, target = digits.data, digits.target

# Wrapping the scaler in the pipeline means it is re-fit on each training
# fold, so the held-out fold never influences the scaling statistics.
pipeline = make_pipeline(StandardScaler(), LogisticRegression())

# Fixed random_state keeps the fold assignment reproducible between runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
cv_res = cross_val_score(
    pipeline,
    features,
    target,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,  # use every available core
)
print(cv_res.mean())
Baseline 模型
数值型 Baseline
# Regression baseline: compare a dummy model that always predicts the training
# mean against ordinary least squares on the Boston housing data.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this section
# needs an older release -- confirm the pinned version or pick another dataset.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
# StandardScaler is what sklearn.preprocessing provides; train_test_split is
# not defined there (it comes from sklearn.model_selection, imported below).
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

boston = load_boston()
features, target = boston.data, boston.target
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

# Fit the scaler on the training split only, then apply it to both splits so
# no test-set statistics leak into training.
std_scaler = StandardScaler()
std_scaler.fit(x_train)
x_train = std_scaler.transform(x_train)
x_test = std_scaler.transform(x_test)

# Baseline: constant prediction equal to the mean of y_train.
baseline = DummyRegressor(strategy='mean')
baseline.fit(x_train, y_train)
# Print the baseline R^2 so it is visible next to the real model's score
# (a bare expression here is discarded), mirroring the classifier section.
print(baseline.score(x_test, y_test))

# Actual model: its R^2 on the same test split is the number to compare.
clf = LinearRegression()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)
分类型 Baseline
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
# Classification baseline: compare a stratified dummy classifier against
# logistic regression on the iris dataset.
iris = load_iris()
features = iris.data
target = iris.target
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

# Learn the scaling from the training split only and reuse it for the test split.
std_scaler = StandardScaler()
x_train = std_scaler.fit_transform(x_train)
x_test = std_scaler.transform(x_test)

# Baseline: the 'stratified' strategy draws labels at random following the
# class distribution seen in training; random_state fixes those draws.
baseline = DummyClassifier(strategy='stratified', random_state=1)
baseline.fit(x_train, y_train)
baseline_accuracy = baseline.score(x_test, y_test)
print(baseline_accuracy)

# Actual model, scored on the same test split for comparison.
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(x_train, y_train)
clf.score(x_test, y_test)
ROC 曲线
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
import matplotlib.pyplot as plt
%matplotlib inline
# ROC curves: plot a stratified dummy classifier and logistic regression on
# the same axes, each labelled with its ROC AUC.
features, target = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=2,
    n_classes=2,
)
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

# Scaling statistics come from the training split only.
std_scaler = StandardScaler()
x_train = std_scaler.fit_transform(x_train)
x_test = std_scaler.transform(x_test)

baseline = DummyClassifier(strategy='stratified')
clf = LogisticRegression(solver='lbfgs')

# Same fit -> score -> plot sequence for both models, baseline first.
for label_fmt, model in (('baseline (%f)', baseline), ('mine (%f)', clf)):
    model.fit(x_train, y_train)
    # Column 1 of predict_proba is the score for the positive class.
    y_pred = model.predict_proba(x_test)[:, 1]
    fp, tp, threshold = roc_curve(y_test, y_pred)
    plt.plot(fp, tp, label=label_fmt % roc_auc_score(y_test, y_pred))
plt.legend()

Confusion Matrix
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import seaborn as sns
# Confusion matrix: train logistic regression on half of iris and draw the
# confusion matrix of its predictions on the other half as a heatmap.
iris = load_iris()
features = iris.data
target = iris.target
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.5)

# Scale using statistics from the training half only.
std_scaler = StandardScaler()
x_train = std_scaler.fit_transform(x_train)
x_test = std_scaler.transform(x_test)

clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# confusion_matrix takes (y_true, y_pred): rows are true classes, columns
# are predicted classes. annot=True writes the count in each cell.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, cmap='Blues')
