Machine Learning in Action Series (1): KNN
The KNN Algorithm
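KNN classifies a sample by a majority vote among the k training samples closest to it in feature space. For intuition only (the examples below use scikit-learn's implementation), a minimal NumPy sketch of that decision rule, with illustrative names, could look like this:
# Minimal sketch of the KNN decision rule: Euclidean distance + majority vote (illustrative, not sklearn)
import numpy as np
from collections import Counter

def knn_classify(x, train_X, train_y, k=5):
    dists=np.sqrt(((train_X-x)**2).sum(axis=1))    # distance from x to every training sample
    nearest=np.argsort(dists)[:k]                  # indices of the k closest samples
    return Counter(np.asarray(train_y)[nearest]).most_common(1)[0][0]   # most frequent label wins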
1. Improving a Dating Site's Match Results with the k-Nearest Neighbors Algorithm
Data: Baidu Netdisk https://pan.baidu.com/s/1Jj2WwyD25yhgAaVJw5KSgg (extraction code: eihp)
Environment: Python 3 (Jupyter)
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Read the data (datingTestSet2.txt from Machine Learning in Action):
# frequent-flyer miles per year, % of time spent gaming, liters of ice cream per week, and a 1-3 preference class
Dating=pd.read_csv(r"C:\Users\Jassy\maching_learning_shizhan\datingTestSet2.txt",sep="\t",header=None)
Dating.rename(columns={0:'flymile',1:'playtime',2:'icecreamlitre',3:'class'},inplace=True)
Dating.head()
flymile playtime icecreamlitre class
0 40920 8.326976 0.953952 3
1 14488 7.153469 1.673904 2
2 26052 1.441871 0.805124 1
3 75136 13.147394 0.428964 1
4 38344 1.669788 0.134296 1
# Normalize each feature to [0, 1] with min-max scaling: (x - min) / (max - min)
diffMaxMin=Dating[['flymile','playtime','icecreamlitre']].max()-Dating[['flymile','playtime','icecreamlitre']].min()
normalized=(Dating[['flymile','playtime','icecreamlitre']]-Dating[['flymile','playtime','icecreamlitre']].min())/diffMaxMin
classData=Dating['class']
normalized.head()
flymile playtime icecreamlitre
0 0.448325 0.398051 0.562334
1 0.158733 0.341955 0.987244
2 0.285429 0.068925 0.474496
3 0.823201 0.628480 0.252489
4 0.420102 0.079820 0.078578
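The same scaling can also be done with scikit-learn's MinMaxScaler; the sketch below is equivalent to the manual computation above (normalized_alt is just an illustrative name):
# Equivalent normalization with sklearn's MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
normalized_alt=pd.DataFrame(scaler.fit_transform(Dating[['flymile','playtime','icecreamlitre']]),
                            columns=['flymile','playtime','icecreamlitre'])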
# Scatter plots of each feature pair, with point size and color encoding the class
plt.figure(figsize=(10,9))
plt.subplot(3,1,1)
plt.scatter(normalized['flymile'],normalized['playtime'],15.0*np.array(classData),c=np.array(classData))
plt.subplot(3,1,2)
plt.scatter(normalized['playtime'],normalized['icecreamlitre'],15.0*np.array(classData),c=np.array(classData))
plt.subplot(3,1,3)
plt.scatter(normalized['flymile'],normalized['icecreamlitre'],15.0*np.array(classData),c=np.array(classData))
plt.show()
# Split into training (70%) and test (30%) sets
train_data,test_data,train_class,test_class=train_test_split(normalized,classData,test_size=0.3)
# Train a KNN classifier (k=5)
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(train_data,train_class)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform')
# Evaluate on the test set
class_pre=knn.predict(test_data)
class_test_list=test_class.tolist()
res=0
for index in range(len(test_class)):
    if class_test_list[index]!=class_pre[index]:
        res+=1
print('KNN errors:',res)
print('KNN correct:',len(test_class)-res)
print('KNN accuracy:',(len(test_class)-res)/len(test_class))
KNN errors: 17
KNN correct: 283
KNN accuracy: 0.9433333333333334
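The exact counts vary from run to run because train_test_split was called without a fixed random_state. The accuracy can also be read directly from the classifier's score method, which should match the manual count above:
# Accuracy straight from scikit-learn
print('KNN accuracy (score):', knn.score(test_data, test_class))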
2. A Handwritten Digit Recognition System
# Import libraries
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
# Read the data: convert a 32x32 text image into a 1x1024 vector
def img2vect(filename):
    returnVect=np.zeros((1,1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr=fr.readline()
            for j in range(32):
                returnVect[0,32*i+j]=int(lineStr[j])
    return returnVect
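A quick sanity check on a single file (the path below is only an example of the dataset's "<digit>_<index>.txt" naming scheme):
# Hypothetical example file; any file under digits/trainingDigits works
sample=img2vect("digits/trainingDigits/0_0.txt")
print(sample.shape)        # (1, 1024)
print(int(sample.sum()))   # number of "1" pixels in the 32x32 bitmap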
# KNN training
trainLabelList=[]
# Load the training images; the class label is the part of the file name before the underscore
trainFileList=os.listdir("digits/trainingDigits")
trainFileNumber=len(trainFileList)
trainVect=np.zeros((trainFileNumber,1024))
for index in range(trainFileNumber):
    fileName=trainFileList[index]
    trainVect[index,:]=img2vect("digits/trainingDigits/%s"%fileName)
    classNumber=int(fileName.split('_')[0])
    trainLabelList.append(classNumber)
# Fit the classifier
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(trainVect,trainLabelList)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=3, p=2,
weights='uniform')
# Read the test files and compute the error rate
testFileList=os.listdir("digits/testDigits")
testFileNumber=len(testFileList)
errorCount=0
for index in range(testFileNumber):
    fileName=testFileList[index]
    classNumber=int(fileName.split('_')[0])
    testVect=img2vect("digits/testDigits/%s"%fileName)
    preClassNumber=knn.predict(testVect)[0]    # predict returns an array; take the single prediction
    if preClassNumber != classNumber:
        errorCount+=1
print('error Count: %d'%errorCount)
print('error rate: %f'%(errorCount/float(testFileNumber)))
error Count: 12
error rate: 0.012685
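As an alternative to the per-file loop, the test images can be vectorized into one matrix and scored in a single call; a sketch reusing the variables defined above:
# Alternative: batch the test images and let knn.score compute the error rate
testVects=np.zeros((testFileNumber,1024))
testLabels=[]
for index in range(testFileNumber):
    fileName=testFileList[index]
    testVects[index,:]=img2vect("digits/testDigits/%s"%fileName)
    testLabels.append(int(fileName.split('_')[0]))
print('error rate: %f'%(1-knn.score(testVects,testLabels)))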