1500字范文 > 【机器学习实战】k近邻算法实战——手写识别系统


为了使用前面两个例子的分类器,我们必须将图像格式化处理为一个向量。我们将把一个32×32的二进制图像矩阵转换为1×1024的向量,这样前两节使用的分类器就可以处理数字图像信息了。


def img2vector(filename):
    """Load a 32x32 text image of binary digits into a 1x1024 row vector.

    The file is read line by line; the first 32 characters of each of the
    first 32 lines are converted with ``int`` and stored row-major, so the
    pixel at row ``i``, column ``j`` lands at index ``32 * i + j``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape (1, 1024) holding the pixel values.

    Raises:
        IndexError: If a line is shorter than 32 characters (or the file has
            fewer than 32 lines).
        ValueError: If a character cannot be converted by ``int``.
    """
    returnVect = zeros((1, 1024))
    # Use a context manager so the file is closed even if parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


# Quick check of img2vector on a single sample file.
testVector = img2vector('digits/testDigits/0_13.txt')
# NOTE: the slices 0:31 and 32:63 each show the first 31 of the 32 pixels
# in image rows 0 and 1 respectively.
print(testVector[0, 0:31])
print(testVector[0, 32:63])

# Output:
# [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0.]
# [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0.]


函数 handwritingClassTest() 是测试分类器的代码,将其写入 kNN.py 文件中。在写入这些代码之前,我们必须确保将 from os import listdir 写入文件的起始部分。这段代码的主要功能是从 os 模块中导入函数 listdir,它可以列出给定目录下的文件名。

# Classifier test harness
def _classNumFromFileName(fileNameStr):
    """Return the class digit encoded in a file name such as '0_13.txt'."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit data set.

    Every file in 'digits/trainingDigits' is loaded with ``img2vector`` into
    one row of the training matrix; its label is the integer before the
    first underscore of the file name. Each file in 'digits/testDigits' is
    then classified with ``classify0`` using k=3 and compared against the
    label parsed from its own file name.

    Prints one line per test file, followed by the total number of errors
    and the error rate. Returns nothing.

    Raises:
        ZeroDivisionError: If 'digits/testDigits' contains no files.
    """
    hwLabels = []
    # Directory listing of the training set.
    trainingFileList = listdir('digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # The class digit is parsed from the file name.
        hwLabels.append(_classNumFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)

    testFileList = listdir('digits/testDigits')
    errorCount = 0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _classNumFromFileName(fileNameStr)
        vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with %d, the real answer is %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1
    print("\n the total number of errors is : %d" % errorCount)
    print("\n the total error rate is : %f" % (errorCount / float(mTest)))



handwritingClassTest()

# Output (the middle of the listing was elided in the original):
# the classifier came back with 0, the real answer is 0
# the classifier came back with 0, the real answer is 0
# the classifier came back with 0, the real answer is 0
# the classifier came back with 0, the real answer is 0
# the classifier came back with 0, the real answer is 0
# the classifier came back with 0, the real answer is 0
# the classifier came back with 0, the real answer is 0
# ......
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
# the classifier came back with 9, the real answer is 9
#
#  the total number of errors is : 10
#
#  the total error rate is : 0.010571




"""k-nearest-neighbours classifier and a handwritten-digit recognition test."""

# The star import is kept first so the explicit imports below take precedence
# over any same-named NumPy export.
from numpy import *
import operator
from os import listdir


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest rows of ``dataSet``.

    Args:
        inX: The sample to classify; tiled to match every row of ``dataSet``.
        dataSet: 2-D array with one known sample per row.
        labels: Sequence of labels, ``labels[i]`` belonging to ``dataSet[i]``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label with the most votes. On a tie, the label that was seen
        first among the nearest neighbours wins (the sort is stable).

    Raises:
        IndexError: If ``k`` exceeds the number of rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Euclidean distance from inX to every row of dataSet.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    # Tally the labels of the k closest points.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Order labels by vote count, most votes first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        ``(dataSet - minVals) / ranges`` column-wise, ``ranges`` is the
        per-column ``max - min`` and ``minVals`` the per-column minimum.
        Columns whose range is zero are mapped to 0 instead of NaN; the
        returned ``ranges`` still reports 0 for them.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has zero range; divide by 1 there to avoid 0/0.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


# ---- Handwriting recognition ----

# Convert an image file to a vector.
def img2vector(filename):
    """Load a 32x32 text image of binary digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row-major: row ``i``, column ``j`` goes to index
    ``32 * i + j``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape (1, 1024) holding the pixel values.

    Raises:
        IndexError: If a line is shorter than 32 characters (or the file has
            fewer than 32 lines).
        ValueError: If a character cannot be converted by ``int``.
    """
    returnVect = zeros((1, 1024))
    # Use a context manager so the file is closed even if parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


# Example:
# testVector = img2vector('digits/testDigits/0_13.txt')
# print(testVector[0, 0:31])
# print(testVector[0, 32:63])


def _classNumFromFileName(fileNameStr):
    """Return the class digit encoded in a file name such as '0_13.txt'."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


# Classifier test harness
def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit data set.

    Every file in 'digits/trainingDigits' becomes one row of the training
    matrix, labelled with the integer before the first underscore of its
    file name. Each file in 'digits/testDigits' is classified with
    ``classify0`` (k=3) and compared against the label from its file name.

    Prints one line per test file, then the total number of errors and the
    error rate. Returns nothing.

    Raises:
        ZeroDivisionError: If 'digits/testDigits' contains no files.
    """
    hwLabels = []
    # Directory listing of the training set.
    trainingFileList = listdir('digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # The class digit is parsed from the file name.
        hwLabels.append(_classNumFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)

    testFileList = listdir('digits/testDigits')
    errorCount = 0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _classNumFromFileName(fileNameStr)
        vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with %d, the real answer is %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1
    print("\n the total number of errors is : %d" % errorCount)
    print("\n the total error rate is : %f" % (errorCount / float(mTest)))


# Run the evaluation only when executed as a script, so that importing this
# module for classify0/autoNorm does not require the digits data on disk.
if __name__ == "__main__":
    handwritingClassTest()
