线性回归及RANSAC异常值清除算法案例
线性回归及RANSAC异常值清除算法案例
1、常规线性回归
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
def lin_regplot(X, y, model):
    """Plot the samples with the model's fitted line, then save and show it.

    Draws a blue scatter of ``y`` against ``X``, overlays
    ``model.predict(X)`` as a red line, writes the figure to
    ``result/Linear.png`` and finally displays it.

    Args:
        X: Feature values used for the scatter plot and for ``model.predict``.
        y: Target values plotted against ``X``.
        model: Estimator object exposing a ``predict`` method.

    Returns:
        None.
    """
    plt.scatter(X, y, c='blue')
    fitted = model.predict(X)
    plt.plot(X, fitted, color='red')
    # The output path is hard-coded and relative to the working directory;
    # the figure is written to disk before it is displayed.
    plt.savefig('result/Linear.png')
    plt.show()
### Ordinary linear regression model
# Load the data set.
df = pd.read_csv('dataset/boston.csv', sep=',')
# NOTE(review): 13 column names are assigned here, while the classic Boston
# housing data set has 14 columns (it also contains 'B'). Confirm that
# dataset/boston.csv really has 13 columns; otherwise this assignment fails
# with a length-mismatch error.
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'LSTAT', 'MEDV']
# print(df.head())

# Single explanatory variable (RM) and the target (MEDV), both kept 2-D
# because they are selected with a list of column names.
X = df[['RM']].values
y = df[['MEDV']].values

slr = LinearRegression()
slr.fit(X, y)

# With a 2-D target, coef_[0] and intercept_ are arrays rather than scalars.
# Formatting an array with "%.3f" relies on an implicit array-to-scalar
# conversion that NumPy has deprecated, so pull out the scalar explicitly.
# np.ravel also copes with the scalar intercept produced by a 1-D target.
slope = np.ravel(slr.coef_)[0]
intercept = np.ravel(slr.intercept_)[0]
print("Slope: %.3f" % slope)
print("intercept: %.3f" % intercept)

lin_regplot(X, y, slr)
可视化结果(对应上面保存的 result/Linear.png):
2、RANSAC异常值清除后线性回归
### Robust linear regression: use RANSAC to exclude outliers from the fit
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         # residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                         # (left disabled as in the original; the estimator's
                         # default loss is used instead)
                         residual_threshold=5.0,
                         random_state=0)
ransac.fit(X, y)

# Visualisation
# Boolean mask of the samples RANSAC classified as inliers; everything else
# is treated as an outlier.
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Evaluate the fitted line on an integer grid 3..9; predict() needs a 2-D
# input, hence the added axis.
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

plt.scatter(X[inlier_mask], y[inlier_mask],
            c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='lightgreen', marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='red')
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $ 1000 [MEDV]')
# The scatter calls set labels, but they only appear once a legend is drawn.
plt.legend(loc='upper left')
plt.savefig('result/ransac.png')
plt.show()
可视化结果(对应上面保存的 result/ransac.png):
3、所有代码
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
def lin_regplot(X, y, model):
    """Plot the samples with the model's fitted line, then save and show it.

    Draws a blue scatter of ``y`` against ``X``, overlays
    ``model.predict(X)`` as a red line, writes the figure to
    ``result/Linear.png`` and finally displays it.

    Args:
        X: Feature values used for the scatter plot and for ``model.predict``.
        y: Target values plotted against ``X``.
        model: Estimator object exposing a ``predict`` method.

    Returns:
        None.
    """
    plt.scatter(X, y, c='blue')
    fitted = model.predict(X)
    plt.plot(X, fitted, color='red')
    # The output path is hard-coded and relative to the working directory;
    # the figure is written to disk before it is displayed.
    plt.savefig('result/Linear.png')
    plt.show()
### Ordinary linear regression model
# Load the data set.
df = pd.read_csv('dataset/boston.csv', sep=',')
# NOTE(review): 13 column names are assigned here, while the classic Boston
# housing data set has 14 columns (it also contains 'B'). Confirm that
# dataset/boston.csv really has 13 columns; otherwise this assignment fails
# with a length-mismatch error.
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'LSTAT', 'MEDV']
# print(df.head())

# Single explanatory variable (RM) and the target (MEDV), both kept 2-D
# because they are selected with a list of column names.
X = df[['RM']].values
y = df[['MEDV']].values

slr = LinearRegression()
slr.fit(X, y)

# With a 2-D target, coef_[0] and intercept_ are arrays rather than scalars.
# Formatting an array with "%.3f" relies on an implicit array-to-scalar
# conversion that NumPy has deprecated, so pull out the scalar explicitly.
# np.ravel also copes with the scalar intercept produced by a 1-D target.
slope = np.ravel(slr.coef_)[0]
intercept = np.ravel(slr.intercept_)[0]
print("Slope: %.3f" % slope)
print("intercept: %.3f" % intercept)

lin_regplot(X, y, slr)
### Robust linear regression: use RANSAC to exclude outliers from the fit
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         # residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                         # (left disabled as in the original; the estimator's
                         # default loss is used instead)
                         residual_threshold=5.0,
                         random_state=0)
ransac.fit(X, y)

# Visualisation
# Boolean mask of the samples RANSAC classified as inliers; everything else
# is treated as an outlier.
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Evaluate the fitted line on an integer grid 3..9; predict() needs a 2-D
# input, hence the added axis.
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

plt.scatter(X[inlier_mask], y[inlier_mask],
            c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='lightgreen', marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='red')
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $ 1000 [MEDV]')
# The scatter calls set labels, but they only appear once a legend is drawn.
plt.legend(loc='upper left')
plt.savefig('result/ransac.png')
plt.show()