Chapter 6: Logistic Regression and Maximum Entropy Models
References
1. Li Hang, Statistical Learning Methods (《统计学习方法》)
2. GitHub: https://github.com/fengdu78/lihang-code
Both the logistic regression model and the maximum entropy model are log-linear models: whether a model counts as linear depends on whether it is linear in the parameters being trained.
The Logistic Regression Model
The Logistic Distribution
Let $X$ be a continuous random variable. $X$ follows a logistic distribution if it has the following distribution function and density function:

$$F(x) = P(X \le x) = \frac{1}{1 + e^{-(x-\mu)/\gamma}}$$

$$f(x) = F'(x) = \frac{e^{-(x-\mu)/\gamma}}{\gamma \left(1 + e^{-(x-\mu)/\gamma}\right)^2}$$

where $\mu$ is the location parameter and $\gamma > 0$ is the scale parameter.
import matplotlib.pyplot as plt
import numpy as np

def DrawLogisticDistribution(mu, gamma):
    x = np.arange(-10, 10, 0.01)
    # distribution function F(x) and density function f(x) of the logistic distribution
    y = 1.0 / (1 + np.exp(-(x - mu) / gamma))
    y2 = np.exp(-(x - mu) / gamma) / (gamma * (1 + np.exp(-(x - mu) / gamma)) ** 2)
    plt.figure(figsize=(7, 5))
    plt.plot(x, y, 'b-', label='Cumulative Distribution Function')
    plt.plot(x, y2, 'r-', label='Probability Density Function')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend(loc='upper left')
    plt.show()

DrawLogisticDistribution(0, 1)
The Binomial Logistic Regression Model

The binomial logistic regression model is the following pair of conditional probability distributions:

$$P(Y=1 \mid x) = \frac{\exp(w \cdot x + b)}{1 + \exp(w \cdot x + b)}, \qquad P(Y=0 \mid x) = \frac{1}{1 + \exp(w \cdot x + b)}$$

where $x \in \mathbf{R}^n$ is the input, $Y \in \{0, 1\}$ is the output, $w \in \mathbf{R}^n$ is the weight vector, and $b \in \mathbf{R}$ is the bias. For a given input $x$, compute both conditional probabilities from the formulas above, compare them, and assign the instance to the class with the larger probability.
The odds of an event are the ratio of the probability that the event occurs to the probability that it does not occur. If an event occurs with probability $p$, its odds are $\frac{p}{1-p}$.

Log odds (the logit function):

$$\operatorname{logit}(p) = \log \frac{p}{1-p}$$

For logistic regression (absorbing the bias into $w$ by appending a constant feature $1$ to $x$):

$$\log \frac{P(Y=1 \mid x)}{1 - P(Y=1 \mid x)} = w \cdot x$$

That is, in the logistic regression model, the log odds of the output $Y = 1$ is a linear function of the input $x$.
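As a quick numerical check of this linearity, the sketch below (using an arbitrary, made-up weight vector and input) pushes the linear score through the sigmoid to get $P(Y=1 \mid x)$ and confirms that taking the logit recovers $w \cdot x$:

import numpy as np

# hypothetical augmented weight vector [b, w1, w2] and augmented input [1, x1, x2]
w = np.array([-0.5, 1.2, -0.7])
x = np.array([1.0, 2.0, 3.0])

score = np.dot(w, x)              # linear function of the input
p = 1.0 / (1.0 + np.exp(-score))  # P(Y=1|x) via the sigmoid
logit = np.log(p / (1.0 - p))     # log odds of the output

print(score, logit)  # both print -0.2 (up to floating-point error)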
Model Parameter Estimation
Given a training set $T = \{(x_1, y_1), (x_2, y_2), \dots, (x_N, y_N)\}$, where $x_i \in \mathbf{R}^n$ and $y_i \in \{0, 1\}$, maximum likelihood estimation can be applied to estimate the model parameters.

Let $P(Y=1 \mid x) = \pi(x)$ and $P(Y=0 \mid x) = 1 - \pi(x)$. The likelihood function is:

$$\prod_{i=1}^{N} [\pi(x_i)]^{y_i} [1 - \pi(x_i)]^{1 - y_i}$$

The log-likelihood function is:

$$L(w) = \sum_{i=1}^{N} \big[ y_i \log \pi(x_i) + (1 - y_i) \log(1 - \pi(x_i)) \big] = \sum_{i=1}^{N} \big[ y_i (w \cdot x_i) - \log(1 + \exp(w \cdot x_i)) \big]$$

Maximizing $L(w)$ gives the estimate $\hat{w}$; the resulting optimization problem is usually solved by gradient descent or a quasi-Newton method.
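As a minimal sketch of this estimation step (with a tiny made-up dataset and the augmented-weight convention above; log_likelihood and gradient are names introduced here for illustration), the log-likelihood and its gradient translate directly into numpy, and plain gradient ascent maximizes $L(w)$:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def log_likelihood(w, X, y):
    # L(w) = sum_i [ y_i * (w . x_i) - log(1 + exp(w . x_i)) ]
    scores = X @ w
    return np.sum(y * scores - np.log(1.0 + np.exp(scores)))

def gradient(w, X, y):
    # dL/dw = sum_i (y_i - pi(x_i)) * x_i
    return X.T @ (y - sigmoid(X @ w))

# made-up 1-D dataset; each row is an augmented sample [1, x1]
X = np.array([[1.0, -2.0], [1.0, -0.5], [1.0, 1.0], [1.0, 3.0]])
y = np.array([0.0, 0.0, 1.0, 1.0])

w = np.zeros(2)
for _ in range(200):               # gradient ascent on L(w)
    w += 0.1 * gradient(w, X, y)
print(w, log_likelihood(w, X, y))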
The Multinomial Logistic Regression Model

For multi-class classification with $Y \in \{1, 2, \dots, K\}$, the multinomial logistic regression model is:

$$P(Y=k \mid x) = \frac{\exp(w_k \cdot x)}{1 + \sum_{j=1}^{K-1} \exp(w_j \cdot x)}, \quad k = 1, 2, \dots, K-1$$

$$P(Y=K \mid x) = \frac{1}{1 + \sum_{j=1}^{K-1} \exp(w_j \cdot x)}$$

The parameter estimation method for the binomial model extends directly to the multinomial case.
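A minimal numpy sketch of these class probabilities, written in the equivalent softmax form with one weight vector per class (the weights below are made up for illustration):

import numpy as np

def multinomial_probs(W, x):
    # softmax over the K linear scores w_k . x
    scores = W @ x
    scores -= scores.max()  # shift scores for numerical stability
    e = np.exp(scores)
    return e / e.sum()

W = np.array([[ 0.2, -1.0,  0.5],   # hypothetical weights, one row per class
              [ 1.0,  0.3, -0.2],
              [-0.5,  0.8,  0.1]])
x = np.array([1.0, 2.0, 0.5])       # augmented input [1, x1, x2]
print(multinomial_probs(W, x))      # K=3 class probabilities, summing to 1

The book's form with $K-1$ weight vectors and class $K$ as the reference is a reparametrization of this softmax (fix $w_K = 0$).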
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

def create_data():
    # the first 100 iris samples cover exactly two classes (labels 0 and 1)
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])
    # features: sepal length and sepal width; last column: the label
    return data[:, :2], data[:, -1]

X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
Logistic Regression

The log-likelihood function is:

$$L(w) = \sum_{i=1}^{N} \big[ y_i (w \cdot x_i) - \log(1 + \exp(w \cdot x_i)) \big]$$

Taking the partial derivative with respect to $w$:

$$\frac{\partial L(w)}{\partial w} = \sum_{i=1}^{N} \big( y_i - \pi(x_i) \big) x_i$$

Parameter update (stochastic gradient ascent, one sample at a time, with learning rate $\eta$):

$$w \leftarrow w + \eta \, \big( y_i - \pi(x_i) \big) x_i$$
class LogisticRegressionClassifier(object):
    def __init__(self, max_iter=200, learning_rate=0.01):
        self.max_iter = max_iter
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def data_matrix(self, X):
        # prepend a constant 1 to each sample so the bias is absorbed into the weights
        data_mat = []
        for d in X:
            data_mat.append([1.0, *d])
        return data_mat

    def fit(self, X, y):
        data_mat = self.data_matrix(X)
        self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)
        for iter_ in range(self.max_iter):
            for i in range(len(X)):
                result = self.sigmoid(np.dot(data_mat[i], self.weights))
                error = y[i] - result
                # stochastic gradient-ascent update: w <- w + eta * (y_i - pi(x_i)) * x_i
                self.weights += self.learning_rate * error * np.transpose([data_mat[i]])
        print("LogisticRegression Model learning_rate={}, max_iter={}".format(
            self.learning_rate, self.max_iter))

    def score(self, X_test, y_test):
        # classify by the sign of w . x (equivalently, whether P(Y=1|x) > 0.5)
        right = 0
        X_test = self.data_matrix(X_test)
        for x, y in zip(X_test, y_test):
            result = np.dot(x, self.weights)
            if (result > 0 and y == 1) or (result < 0 and y == 0):
                right += 1
        return right / len(X_test)
lg_clf = LogisticRegressionClassifier()
lg_clf.fit(X_train, y_train)
lg_clf.score(X_test, y_test)
x_points = np.arange(4, 8)
# decision boundary: w0 + w1*x + w2*y = 0  =>  y = -(w1*x + w0) / w2
y_ = -(lg_clf.weights[1] * x_points + lg_clf.weights[0]) / lg_clf.weights[2]
plt.plot(x_points, y_)
plt.scatter(X[:50, 0], X[:50, 1], label='0')
plt.scatter(X[50:, 0], X[50:, 1], label='1')
plt.legend()
Calling the built-in implementation in sklearn
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
x_points = np.arange(4, 8)
y_ = -(clf.coef_[0][0] * x_points + clf.intercept_) / clf.coef_[0][1]
plt.plot(x_points, y_)
plt.plot(X[:50, 0], X[:50, 1], 'o', color='blue', label='0')
plt.plot(X[50:, 0], X[50:, 1], 'o', color='orange', label='1')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()