机器学习实战专题(一)KNN

KNN算法

一、使用 k 近邻算法改进网站的配对效果
数据已上传至百度网盘:https://pan.baidu.com/s/1Jj2WwyD25yhgAaVJw5KSgg (提取码:eihp)
环境:Python 3(Jupyter Notebook)

#导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Read the tab-separated data file (it has no header row) and give the four
# positional columns descriptive names.
Dating = pd.read_csv(
    r"C:\Users\Jassy\maching_learning_shizhan\datingTestSet2.txt",
    sep="\t",
    header=None,
).rename(columns={0: 'flymile', 1: 'playtime', 2: 'icecreamlitre', 3: 'class'})
# Notebook display: preview the first rows.
Dating.head()
flymile	playtime	icecreamlitre	class
0	40920	8.326976	0.953952	3
1	14488	7.153469	1.673904	2
2	26052	1.441871	0.805124	1
3	75136	13.147394	0.428964	1
4	38344	1.669788	0.134296	1


# Min-max scale the three feature columns: (value - column min) / (column max - column min).
features = Dating[['flymile', 'playtime', 'icecreamlitre']]
columnMin = features.min()
diffMaxMin = features.max() - columnMin
normalized = (features - columnMin) / diffMaxMin
# The class labels are kept separately from the scaled features.
classData = Dating['class']
# Notebook display: preview the scaled features.
normalized.head()
flymile	playtime	icecreamlitre
0	0.448325	0.398051	0.562334
1	0.158733	0.341955	0.987244
2	0.285429	0.068925	0.474496
3	0.823201	0.628480	0.252489
4	0.420102	0.079820	0.078578


import operator
from functools import reduce
# Draw one scatter plot per pair of normalized features, stacked vertically.
# The class label drives both the marker size (15 * label) and the colour.
labels = np.array(classData)
plt.figure(figsize=(10, 9))
axisPairs = [
    ('flymile', 'playtime'),
    ('playtime', 'icecreamlitre'),
    ('flymile', 'icecreamlitre'),
]
for position, (xName, yName) in enumerate(axisPairs, start=1):
    plt.subplot(3, 1, position)
    plt.scatter(normalized[xName], normalized[yName], 15.0 * labels, c=labels)
plt.show()

机器学习实战专题(一)KNN

# Split the data into training and test sets (30% of the rows go to the test set).
train_data, test_data, train_class, test_class = train_test_split(
    normalized,
    classData,
    test_size=0.3,
)


# Fit a KNN classifier that uses the 5 nearest neighbours.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_data, train_class)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
           
           
# Predict on the test set and count how many predictions differ from the true labels.
class_pre = knn.predict(test_data)
class_test_list = test_class.tolist()
res = sum(
    1
    for actual, predicted in zip(class_test_list, class_pre)
    if actual != predicted
)
total = len(test_class)
print('knn错误个数:', res)
print('knn正确个数:', total - res)
print('knn正确率:', (total - res) / total)
knn错误个数: 17
knn正确个数: 283
knn正确率: 0.9433333333333334

二、手写识别系统

#导入库
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier


#读取数据
def img2vect(filename):
    """Read a 32x32 text image into a flat 1x1024 NumPy row vector.

    The first 32 characters of each of the first 32 lines of the file are
    converted with ``int`` and stored row by row, so the character at
    (line i, column j) lands at index ``32 * i + j``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``numpy.ndarray`` of shape ``(1, 1024)`` holding the parsed values.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character cannot be converted to ``int``.
    """
    returnVect = np.zeros((1, 1024))
    # Use a context manager so the file handle is closed even when parsing fails;
    # opening without closing leaks one handle per image.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


#KNN训练
trainLabelList = []
# Load the training data: each file in the directory becomes one row of trainVect.
trainFileList = os.listdir("digits/trainingDigits")
trainFileNumber = len(trainFileList)
trainVect = np.zeros((trainFileNumber, 1024))
for index, fileName in enumerate(trainFileList):
    # The reader defined in this file is img2vect; the misspelled call
    # "img2vext" raised NameError on the first iteration.
    trainVect[index, :] = img2vect("digits/trainingDigits/%s" % fileName)
    # The class label is the integer before the first underscore of the file name.
    classNumber = int(fileName.split('_')[0])
    trainLabelList.append(classNumber)
    
    
# Train a KNN classifier (3 nearest neighbours) on the digit vectors and their labels.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(trainVect, trainLabelList)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


# Classify every file in the test directory and count the misclassifications.
testFileList = os.listdir("digits/testDigits")
testFileNumber = len(testFileList)
errorCount = 0

for fileName in testFileList:
    # The expected label is the integer before the first underscore of the file name.
    classNumber = int(fileName.split('_')[0])
    testVect = img2vect("digits/testDigits/%s" % fileName)
    preClassNumber = knn.predict(testVect)
    if preClassNumber != classNumber:
        errorCount += 1

print('error Count: %d' % errorCount)
print('error rate: %f' % (errorCount / float(testFileNumber)))

error Count: 12
error rate: 0.012685