1 What Is Logistic Regression
1.1 Introduction
Logistic regression is generally used to solve classification problems; in its basic form it handles only binary classification.
It links a sample's features to the probability that the sample belongs to a class. Since that probability is a real number, the method can be viewed as a regression problem.
In linear regression, $\hat{y} = f(x) = \theta^\intercal \cdot x_b$, where $\theta$ is the coefficient vector and $x_b$ is the feature vector augmented with the constant feature $x_0 \equiv 1$. The range of $\hat{y}$ is $(-\infty, +\infty)$, while a probability $\hat{p}$ must lie in $[0, 1]$, so we use the Sigmoid function to map the former range into the latter.
The Sigmoid function:
$$\sigma(t) = \frac{1}{1 + e^{-t}}, \qquad \begin{cases} t > 0, & p > 0.5 \\ t < 0, & p < 0.5 \end{cases}$$
Plot of the function: (figure of the Sigmoid curve omitted)
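To reproduce the figure, here is a minimal plotting sketch (my own code, not from the original, assuming numpy and matplotlib are installed):

import numpy as np
import matplotlib.pyplot as plt

def sigmoid(t):
    return 1. / (1. + np.exp(-t))

t = np.linspace(-10, 10, 500)
plt.plot(t, sigmoid(t))
plt.axhline(0.5, color='gray', linestyle='--')  # sigma(0) = 0.5
plt.axvline(0.0, color='gray', linestyle='--')
plt.xlabel('t')
plt.ylabel('sigma(t)')
plt.show()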
Let $\hat{p} = \sigma(\theta^\intercal \cdot x_b) = \dfrac{1}{1 + e^{-\theta^\intercal \cdot x_b}}$; the final classification is
$$\hat{y} = \begin{cases} 1, & \hat{p} \ge 0.5 \\ 0, & \hat{p} < 0.5 \end{cases}$$
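For example, if $\theta^\intercal \cdot x_b = 2$, then $\hat{p} = \frac{1}{1+e^{-2}} \approx 0.88 \ge 0.5$, so $\hat{y} = 1$; if $\theta^\intercal \cdot x_b = -1$, then $\hat{p} = \frac{1}{1+e^{1}} \approx 0.27 < 0.5$, so $\hat{y} = 0$.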
The problem:
Given a training set $X$ with labels $y$, how do we find the parameters $\theta$ so that this model reproduces, as closely as possible, the classification output $y$ for $X$?
2 The Loss Function of Logistic Regression
$$\text{cost} = \begin{cases} -\log(\hat{p}) & \text{if } y = 1 \\ -\log(1 - \hat{p}) & \text{if } y = 0 \end{cases} \;\Rightarrow\; \text{cost} = -y\log(\hat{p}) - (1 - y)\log(1 - \hat{p})$$

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\left(\hat{p}^{(i)}\right) + \left(1 - y^{(i)}\right)\log\left(1 - \hat{p}^{(i)}\right)\right], \qquad \hat{p}^{(i)} = \sigma\left(X_b^{(i)}\theta\right) = \frac{1}{1 + e^{-X_b^{(i)}\theta}}$$
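This loss heavily penalizes confident wrong predictions: for $y = 1$, $-\log(\hat{p}) \to 0$ as $\hat{p} \to 1$ and $-\log(\hat{p}) \to +\infty$ as $\hat{p} \to 0$ (and symmetrically for $y = 0$). A quick numeric check of this intuition (my own sketch):

import numpy as np

# y = 1: the cost -log(p_hat) is tiny for a confident correct prediction
# and blows up for a confident wrong one
for p_hat in [0.99, 0.5, 0.01]:
    print(p_hat, -np.log(p_hat))
# 0.99 -> ~0.01, 0.5 -> ~0.69, 0.01 -> ~4.61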
3 Using Gradient Descent to Find the θ That Minimizes J(θ)
$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\left(\sigma\left(X_b^{(i)}\theta\right)\right) + \left(1-y^{(i)}\right)\log\left(1-\sigma\left(X_b^{(i)}\theta\right)\right)\right]$$

First, the derivatives of the two logarithmic terms:

$$\sigma(t) = \frac{1}{1+e^{-t}} = (1+e^{-t})^{-1}, \qquad \sigma'(t) = (1+e^{-t})^{-2}\,e^{-t}$$

$$[\log \sigma(t)]' = \frac{\sigma'(t)}{\sigma(t)} = \frac{(1+e^{-t})^{-2}\,e^{-t}}{(1+e^{-t})^{-1}} = \frac{e^{-t}}{1+e^{-t}} = \frac{1+e^{-t}-1}{1+e^{-t}} = 1 - \sigma(t)$$

$$[\log(1-\sigma(t))]' = \frac{-\sigma'(t)}{1-\sigma(t)} = -\frac{(1+e^{-t})^{-2}\,e^{-t}}{\dfrac{e^{-t}}{1+e^{-t}}} = -(1+e^{-t})^{-1} = -\sigma(t)$$

Then, term by term:

$$\frac{d}{d\theta_j}\left(y^{(i)}\log\left(\sigma\left(X_b^{(i)}\theta\right)\right)\right) = y^{(i)}\left(1-\sigma\left(X_b^{(i)}\theta\right)\right)X_j^{(i)}$$

$$\frac{d}{d\theta_j}\left(\left(1-y^{(i)}\right)\log\left(1-\sigma\left(X_b^{(i)}\theta\right)\right)\right) = \left(1-y^{(i)}\right)\left(-\sigma\left(X_b^{(i)}\theta\right)\right)X_j^{(i)}$$

Adding the two gives $\left[y^{(i)} - \sigma\left(X_b^{(i)}\theta\right)\right]X_j^{(i)}$, so

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(\sigma\left(X_b^{(i)}\theta\right) - y^{(i)}\right)X_j^{(i)}$$

Stacking all components (writing $\hat{p}^{(i)} = \sigma(X_b^{(i)}\theta)$):

$$\nabla J(\theta) = \begin{pmatrix} \partial J/\partial\theta_0 \\ \partial J/\partial\theta_1 \\ \vdots \\ \partial J/\partial\theta_n \end{pmatrix} = \frac{1}{m}\begin{pmatrix} \sum_{i=1}^{m}\left(\hat{p}^{(i)} - y^{(i)}\right)X_0^{(i)} \\ \sum_{i=1}^{m}\left(\hat{p}^{(i)} - y^{(i)}\right)X_1^{(i)} \\ \vdots \\ \sum_{i=1}^{m}\left(\hat{p}^{(i)} - y^{(i)}\right)X_n^{(i)} \end{pmatrix} = \frac{1}{m}X_b^\intercal\left[\sigma(X_b\theta) - y\right]$$

This has the same form as the gradient in linear regression, with $\sigma(X_b^{(i)}\theta)$ playing the role of $\hat{y}^{(i)}$.
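The vectorized gradient can be sanity-checked against finite differences. Below is a sketch of such a check; the helper dJ_debug and the random toy data are my own, not part of the original:

import numpy as np

def sigmoid(t):
    return 1. / (1. + np.exp(-t))

def J(theta, X_b, y):
    p_hat = sigmoid(X_b.dot(theta))
    return -np.mean(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat))

def dJ(theta, X_b, y):
    # analytic gradient: (1/m) * X_b^T [sigma(X_b theta) - y]
    return X_b.T.dot(sigmoid(X_b.dot(theta)) - y) / len(X_b)

def dJ_debug(theta, X_b, y, epsilon=1e-6):
    # numerical gradient via central differences, one coordinate at a time
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)
    return res

np.random.seed(666)
X_b = np.hstack([np.ones((100, 1)), np.random.normal(size=(100, 3))])
y = (np.random.random(100) > 0.5).astype(float)
theta = np.random.normal(size=4)
print(np.allclose(dJ(theta, X_b, y), dJ_debug(theta, X_b, y)))  # expect True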
4 Implementing Logistic Regression in Python
The logistic regression module:
import numpy as np
from sklearn.metrics import accuracy_score

class LogisticRegression:

    def __init__(self):
        """Initialize the Logistic Regression model"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def _sigmoid(self, t):
        return 1. / (1. + np.exp(-t))

    def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the Logistic Regression model on X_train, y_train using gradient descent"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            """Objective function"""
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
            except Exception:
                # guard against numerical issues in the log
                return float('inf')

        def dJ(theta, X_b, y):
            """Gradient"""
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                # stop early once the loss no longer changes meaningfully
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict_proba(self, X_predict):
        """Given a data set X_predict, return the vector of predicted probabilities"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return self._sigmoid(X_b.dot(self._theta))

    def predict(self, X_predict):
        """Given a data set X_predict, return the vector of predicted labels"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        proba = self.predict_proba(X_predict)
        return np.array(proba >= 0.5, dtype='int')

    def score(self, X_test, y_test):
        """Compute the accuracy of the current model on the test set X_test, y_test"""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "LogisticRegression()"
Using the module above:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
y = iris.target
X = X[y < 2, :2]  # keep only classes 0 and 1, and the first two features
y = y[y < 2]
X.shape
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='r')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='b')
plt.show()

from sklearn.model_selection import train_test_split
import LogisticRegression  # the module above, saved as LogisticRegression.py

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
log_reg = LogisticRegression.LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)
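For comparison, here is a sketch using scikit-learn's built-in implementation (note that it applies L2 regularization by default, so its coefficients will differ somewhat from ours):

from sklearn.linear_model import LogisticRegression as SkLogisticRegression

sk_log_reg = SkLogisticRegression()
sk_log_reg.fit(X_train, y_train)
print(sk_log_reg.score(X_test, y_test))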
5 The Decision Boundary
Recall:
$$\hat{p} = \sigma(\theta^\intercal \cdot x_b) = \frac{1}{1+e^{-\theta^\intercal \cdot x_b}}, \qquad \hat{y} = \begin{cases} 1, & \hat{p} \ge 0.5 \\ 0, & \hat{p} < 0.5 \end{cases}$$

Since $p = \sigma(t) = \frac{1}{1+e^{-t}}$ satisfies $t > 0 \Rightarrow p > 0.5$ and $t < 0 \Rightarrow p < 0.5$, we have

$$\hat{y} = \begin{cases} 1, & \hat{p} \ge 0.5 \iff \theta^\intercal \cdot x_b \ge 0 \\ 0, & \hat{p} < 0.5 \iff \theta^\intercal \cdot x_b < 0 \end{cases}$$

The decision boundary is therefore: $\theta^\intercal \cdot x_b = 0$.
If $X$ has two features: $\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0 \;\Rightarrow\; x_2 = \dfrac{-\theta_0 - \theta_1 x_1}{\theta_2}$
Plotting demo:
def x2(x1):
    return (-log_reg.coef_[0] * x1 - log_reg.intercept_) / log_reg.coef_[1]

x1_plot = np.linspace(4, 8, 1000)
x2_plot = x2(x1_plot)
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='r')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='b')
plt.plot(x1_plot, x2_plot)
plt.show()
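The closed-form line above works only when the boundary is linear in two features. A more general approach, sketched below with a helper plot_decision_boundary of my own (the axis limits are also my choice), evaluates the trained model on a dense grid and shades the predicted regions; this works for nonlinear boundaries as well:

def plot_decision_boundary(model, axis):
    # evaluate the trained model on a dense grid and shade the predicted regions
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], 200),
        np.linspace(axis[2], axis[3], 200)
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)
    plt.contourf(x0, x1, zz, alpha=0.3)

plot_decision_boundary(log_reg, axis=[4, 8, 1.5, 4.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='r')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='b')
plt.show()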
