<机器学习实战>学习笔记之KNN-约会数据识别

Jimmy Lee

学习思考|May 4, 2016|Last edited: 2022-7-21|
type
Post
status
Published
date
May 4, 2016
slug
machine-learning-knn
summary
tags
机器学习
ML
category
学习思考
icon
Update time
Jul 21, 2022 02:24 AM
Internal status
password
# -*- coding: utf-8 -*-
"""k-nearest-neighbour (kNN) classification of the dating data set.

The module loads a tab-separated file of three numeric features plus an
integer class label per line, min-max normalises the features, evaluates a
kNN classifier on a hold-out split, and draws a few scatter plots.

The code is written to run unchanged on Python 2 and Python 3 (single-argument
``print(...)`` calls, ``dict.items()``).
"""
import operator

import numpy as np

# Data file read by the plotting helpers and, by default, by datingClassTest.
DATING_DATA_FILE = 'datingData/datingTestSet2.txt'


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its nearest rows of ``dataSet``.

    Distances are Euclidean.

    :param inX: feature vector to classify (one value per column of dataSet).
    :param dataSet: 2-D array with one training sample per row.
    :param labels: sequence where ``labels[i]`` is the class of row ``i``.
    :param k: number of neighbours that vote; capped at the number of rows so
        that a small training set no longer raises IndexError.
    :return: the label that received the most votes.
    :raises IndexError: if there is nothing to vote on (no rows, or k < 1).
    """
    dataSet = np.asarray(dataSet)
    # Broadcasting subtracts inX from every row; no explicit tiling is needed.
    distances = np.sqrt(((dataSet - inX) ** 2).sum(axis=1))
    nearest = distances.argsort()[:min(k, dataSet.shape[0])]

    classCount = {}
    for idx in nearest:
        voteIlabel = labels[idx]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so tied labels keep the dict's iteration order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line supplies its first three fields as float features and
    its last field as an integer class label.  Blank lines are skipped.

    :param filename: path of the text file to read.
    :return: ``(returnMat, classLabelVector)`` where ``returnMat`` is a float
        array of shape (number of data lines, 3) and ``classLabelVector`` is a
        list of ints in file order.
    :raises ValueError: if a line has fewer than three fields or a field is
        not numeric.
    """
    features = []
    classLabelVector = []
    # One pass, and the handle is closed even if parsing fails.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue
            listFromLine = line.split('\t')
            row = [float(value) for value in listFromLine[0:3]]
            if len(row) != 3:
                raise ValueError(
                    "expected at least 3 tab-separated fields, got %d"
                    % len(listFromLine))
            features.append(row)
            classLabelVector.append(int(listFromLine[-1]))
    # reshape keeps the (0, 3) shape for a file without data lines.
    returnMat = np.array(features, dtype=float).reshape(-1, 3)
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet`` to the range [0, 1].

    Distances are dominated by the feature with the largest numeric spread;
    if all features should weigh equally they have to be rescaled first:
    ``newValue = (oldValue - min) / (max - min)``, computed per column.

    A column whose values are all equal has a range of zero; it is mapped to
    0 instead of producing NaN/inf from a division by zero.

    :param dataSet: 2-D array, one sample per row.
    :return: ``(normDataSet, ranges, minVals)`` - the normalised matrix, the
        per-column ``max - min`` (unmodified, so it may contain zeros) and the
        per-column minimum.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)  # axis 0: minimum of each column over all rows
    ranges = dataSet.max(0) - minVals
    divisors = np.where(ranges == 0, 1.0, ranges)
    normDataSet = (dataSet - minVals) / divisors  # element-wise, broadcast
    return normDataSet, ranges, minVals


def datingClassTest(hoRatio=0.50, filename=DATING_DATA_FILE, k=3):
    """Measure the kNN error rate on a hold-out split of the dating data.

    The first ``int(m * hoRatio)`` normalised rows are classified against the
    remaining rows; every prediction, the error rate and the error count are
    printed.

    :param hoRatio: fraction of the rows used as test samples (default 0.50,
        i.e. half of the data).
    :param filename: data file in the format read by ``file2matrix``.
    :param k: number of neighbours passed to ``classify0``.
    :return: the error rate as a float (0.0 when there are no test rows).
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # The training part is the same for every test row; slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
    print(errorCount)
    return errorRate


def _new_axes():
    """Create a figure with a single subplot; return ``(pyplot, axes)``."""
    # Imported here so the classifier functions above stay importable on
    # machines without matplotlib or without a display.
    import matplotlib.pyplot as plt
    fig = plt.figure()
    return plt, fig.add_subplot(111)


def showNormal():
    """Plot columns 1 and 2 of the dating data as a plain scatter plot.

    No class information is drawn, so it is hard to tell which class a point
    belongs to.
    """
    datingDataMat, _ = file2matrix(DATING_DATA_FILE)
    plt, ax = _new_axes()
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    plt.xlabel("Percentage of Time Spent Playing Video Games")
    plt.ylabel("Liters of Icc Cream Consumed Per Week")
    plt.show()


def showLable():
    """Plot columns 1 and 2 of the dating data, styled by class label.

    Marker size and colour are both ``15.0 * label``.  The classes become
    distinguishable, but these two features alone still do not separate them
    conclusively.
    """
    datingDataMat, datingLables = file2matrix(DATING_DATA_FILE)
    style = 15.0 * np.array(datingLables)
    plt, ax = _new_axes()
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2], s=style, c=style)
    ax.axis([-2, 25, -0.2, 2.0])
    plt.xlabel("Percentage of Time Spent Playing Video Games")
    plt.ylabel("Liters of Icc Cream Consumed Per Week")
    plt.show()


def _simulate_dating_points(n):
    """Draw ``n`` random points; return ``{label: (x_list, y_list)}``."""
    clouds = {1: ([], []), 2: ([], []), 3: ([], [])}
    for _ in range(n):
        # The order of the random draws is kept so that a seeded run consumes
        # the generator in the same sequence as before.
        r0, r1 = np.random.standard_normal(2)
        myClass = np.random.uniform(0, 1)
        if myClass <= 0.16:
            classLabel = 1  # 'didntLike'
            fFlyer = np.random.uniform(22000, 60000)
            tats = 3 + 1.6 * r1
        elif myClass <= 0.33:
            classLabel = 1  # 'didntLike'
            fFlyer = 6000 * r0 + 70000
            tats = 10 + 3 * r1 + 2 * r0
        elif myClass <= 0.66:
            classLabel = 2  # 'smallDoses'
            fFlyer = 5000 * r0 + 10000
            tats = 3 + 2.8 * r1
        else:
            classLabel = 3  # 'largeDoses'
            fFlyer = 10000 * r0 + 35000
            tats = 10 + 2.0 * r1
        # Neither quantity can be negative.  The clamp is applied to every
        # branch; previously the first branch skipped it.
        xcord, ycord = clouds[classLabel]
        xcord.append(max(fFlyer, 0))
        ycord.append(max(tats, 0))
    return clouds


def showClass(n=1000):
    """Plot three randomly generated class clouds with a legend.

    The points are synthetic (drawn from ``numpy.random``), not read from the
    data file.  The x axis is labelled as frequent-flyer miles per year and
    the y axis as percentage of time spent playing video games; with these
    two features the three classes occupy visibly different regions.

    The function no longer opens ``datingData/datingTestSet.txt`` for
    writing: nothing was ever written to it, so the only effect was to
    truncate that file on every call.

    :param n: number of points to generate (default 1000).
    """
    clouds = _simulate_dating_points(n)
    plt, ax = _new_axes()
    type1 = ax.scatter(clouds[1][0], clouds[1][1], s=20, c='red')
    type2 = ax.scatter(clouds[2][0], clouds[2][1], s=30, c='green')
    type3 = ax.scatter(clouds[3][0], clouds[3][1], s=50, c='blue')
    ax.legend([type1, type2, type3],
              ["Did Not Like", "Liked in Small Doses", "Liked in Large Doses"],
              loc=2)
    ax.axis([-5000, 100000, -2, 25])
    plt.xlabel('Frequent Flyier Miles Earned Per Year')
    plt.ylabel('Percentage of Time Spent Playing Video Games')
    plt.show()


if __name__ == "__main__":
    # Run the demo only when executed as a script, not on import.
    showNormal()
    showLable()
    showClass()
    datingClassTest()
 

订阅我关于终身学习、生产力和知识管理的文章。订阅后,您将收到我的精选文章。

©2014-2024 Jimmy Lee. All rights reserved. 公众号: 技术管理方法论
Powered By My Lovely Children