使用 KNN 进行简单的手写体识别
实际使用这个算法时,实现相对简单,但算法的执行效率并不高:算法需要为每个测试向量做 2000 次距离
计算,每次距离计算包括 1024 个维度的浮点运算,总计要执行 900 次;此外,我们还需要为测试
向量准备 2 MB 的存储空间。下次介绍优化版:k-d 树(即原文所说的“k决策树”)。
import operator
import os
from os import listdir

import numpy as np

# Each digit image is a 32x32 grid of single-digit characters.
_IMG_SIDE = 32
_IMG_SIZE = _IMG_SIDE * _IMG_SIDE


def img2vector(filename):
    """Read a 32x32 text image into a 1x1024 row vector.

    Args:
        filename: Path to a text file whose first 32 lines each start with
            at least 32 single-digit characters.

    Returns:
        A float ``numpy.ndarray`` of shape ``(1, 1024)``; element
        ``32 * i + j`` holds the digit found at row ``i``, column ``j``.

    Raises:
        ValueError: If a row has fewer than 32 characters or contains a
            character that ``int()`` cannot parse.
    """
    returnVect = np.zeros((1, _IMG_SIZE))
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for i in range(_IMG_SIDE):
            row = fr.readline()[:_IMG_SIDE]
            if len(row) < _IMG_SIDE:
                raise ValueError(
                    "%s: row %d has fewer than %d characters"
                    % (filename, i, _IMG_SIDE)
                )
            start = _IMG_SIDE * i
            returnVect[0, start:start + _IMG_SIDE] = [int(ch) for ch in row]
    return returnVect


def _label_from_filename(file_name):
    """Return the integer before the first underscore, e.g. 3 for '3_12.txt'."""
    stem = file_name.split(".")[0]
    return int(stem.split("_")[0])


def handwritingClassTest(training_dir="trainingDigits", test_dir="testDigits", k=3):
    """Classify every test image with kNN and report the error rate.

    Every file in ``training_dir`` is loaded as a training sample and every
    file in ``test_dir`` is classified against them. File names must start
    with the integer class label followed by an underscore.

    Args:
        training_dir: Directory holding the labelled training images.
        test_dir: Directory holding the images to classify.
        k: Number of nearest neighbours passed to ``classify0``.

    Returns:
        The fraction of misclassified test images, or ``0.0`` when
        ``test_dir`` contains no files.
    """
    hwLabels = []
    trainingFileList = listdir(training_dir)
    print(trainingFileList)
    m = len(trainingFileList)
    # One row per training image.
    trainingMat = np.zeros((m, _IMG_SIZE))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_label_from_filename(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(training_dir, fileNameStr))

    testFileList = listdir(test_dir)
    errorCount = 0
    m_test = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _label_from_filename(fileNameStr)
        vector_under_test = img2vector(os.path.join(test_dir, fileNameStr))
        classifier_result = classify0(vector_under_test, trainingMat, hwLabels, k)
        if classifier_result != classNumStr:
            errorCount += 1
        print("分类器返回结果:%d,正确结果是:%d" % (classifier_result, classNumStr))

    # Guard against an empty test directory instead of dividing by zero.
    error_rate = errorCount / float(m_test) if m_test else 0.0
    print("运算中错误次数是:%d" % errorCount)
    print("运算中错误概率是:%f" % error_rate)
    return error_rate


def classify0(inx, dataset, labels, k):
    """Return the majority label among the k nearest rows of ``dataset``.

    Distance is Euclidean. When several labels share the highest vote
    count, the label that was encountered first while walking the
    neighbours from nearest to farthest wins.

    Args:
        inx: Query vector, broadcastable against one row of ``dataset``.
        dataset: 2-D ``numpy.ndarray`` with one training sample per row.
        labels: Sequence where ``labels[i]`` is the label of ``dataset[i]``.
        k: Number of neighbours that vote; clamped to the number of rows.

    Returns:
        The winning element of ``labels``.

    Raises:
        ValueError: If ``dataset`` has no rows or ``k`` is less than 1.
    """
    sample_count = dataset.shape[0]
    if sample_count == 0:
        raise ValueError("dataset must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Asking for more neighbours than exist would index past the end.
    k = min(k, sample_count)

    # Broadcasting subtracts the query from every row without copying it.
    diff_mat = dataset - inx
    distances = (diff_mat ** 2).sum(axis=1) ** 0.5
    sorted_distindicies = distances.argsort()

    class_count = {}
    for neighbour in sorted_distindicies[:k]:
        votei_label = labels[neighbour]
        class_count[votei_label] = class_count.get(votei_label, 0) + 1
    # max() keeps the first of several equal maxima, and the dict preserves
    # insertion order, so ties go to the nearest neighbour's label.
    return max(class_count.items(), key=operator.itemgetter(1))[0]


def main():
    """Run the handwriting test on the default directories."""
    handwritingClassTest()


if __name__ == '__main__':
    main()