机器学习实战--KNN分类器
1.KNN分类算法
from numpy import *
import operator
def classify(test, data, target, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and are computed between ``test`` and every row
    of ``data``.

    Args:
        test: Feature vector of the sample to classify (length = n_features).
        data: Reference samples, one per row (array-like, 2-D).
        target: Labels of the reference samples; ``target[i]`` belongs to
            ``data[i]``. Labels must be hashable.
        k: Number of neighbours that vote. Values larger than the number of
            reference samples are reduced to that number.

    Returns:
        The label with the most votes. On a tie, the label that was seen
        first when walking the neighbours from nearest to farthest wins.

    Raises:
        ValueError: If ``data`` has no rows or ``k`` is smaller than 1.
    """
    data = asarray(data, dtype=float)
    datasize = data.shape[0]
    if datasize == 0 or k < 1:
        raise ValueError("classify() needs at least one sample and k >= 1")
    # More neighbours than samples were requested: let every sample vote
    # instead of indexing past the end of the sorted index array.
    if k > datasize:
        k = datasize
    # Broadcasting subtracts every row of `data` from `test`; no tile() needed.
    diff = asarray(test, dtype=float) - data
    dis = (diff ** 2).sum(axis=1) ** 0.5
    # A stable sort keeps equidistant samples in data-set order, which makes
    # the result deterministic.
    sorted_dis_index = dis.argsort(kind='stable')
    cnt = {}
    for neighbour in sorted_dis_index[:k]:
        label = target[neighbour]
        cnt[label] = cnt.get(label, 0) + 1
    # sorted() is stable even with reverse=True, so ties keep insertion order.
    sorted_cnt = sorted(cnt.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_cnt[0][0]
#data,target=createDataSet()
#print(classify([1,1],data,target,3))
2.文件转矩阵形式
def file2matrix(filename):
    """Read a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(data, tar)``: ``data`` is a float array of shape
        ``(n_records, 3)`` with the first three columns of every record and
        ``tar`` is a list with the integer label of every record.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as f:
        # Blank lines (for example a trailing newline at the end of the file)
        # carry no record and would otherwise break the int() conversion.
        records = [line.strip().split('\t') for line in f if line.strip()]
    data = zeros((len(records), 3))
    tar = []
    for index, fields in enumerate(records):
        data[index, :] = [float(value) for value in fields[0:3]]
        tar.append(int(fields[-1]))
    return data, tar
# Load the data set once at module level; `dt` (features) and `tar` (labels)
# are used by the scatter plot further down.
dt,tar=file2matrix('datingTestSet2.txt')
#print(dt)
#print(tar)
3.作图观察数据集两列特征值之间的关系
# The backend must be selected before pyplot is imported; 'Agg' draws to
# image files and needs no display.
import matplotlib as mlp
mlp.use('Agg')
import matplotlib.pyplot as plt

# Scatter the second feature column against the third. The class label,
# scaled by 15, sets both the size and the colour of each marker.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
marker_scale = 15.0 * array(tar)
ax.scatter(dt[:, 1], dt[:, 2], s=marker_scale, c=marker_scale)
#plt.savefig('/home/xxh/tf/KNN/datingTestPic.png')
得到散点图:
3.数据归一化
def autoNorm(dataset):
    """Rescale every feature column of ``dataset`` to the interval [0, 1].

    Each column is transformed as ``(value - column_min) / column_range``.

    Args:
        dataset: 2-D array-like with one sample per row.

    Returns:
        A tuple ``(normdataset, Range, minval)``:
        ``normdataset`` is the rescaled data, ``Range`` is the per-column
        divisor that was applied and ``minval`` is the per-column minimum.
        For a column whose values are all equal the divisor is 1 (not 0), so
        that column becomes all zeros and ``(x - minval) / Range`` stays
        finite when callers normalise new samples.
    """
    dataset = asarray(dataset, dtype=float)
    minval = dataset.min(0)
    maxval = dataset.max(0)
    Range = maxval - minval
    # A constant column has range 0; dividing by it would produce NaN.
    Range = where(Range == 0, 1.0, Range)
    # Broadcasting applies the per-column offset and scale to every row.
    normdataset = (dataset - minval) / Range
    return normdataset, Range, minval
#normdt,range,minval=autoNorm(dt)
#print(normdt)
#print(range)
#print(minval)
4.分类器测试(主函数)
def datingClassTest(filename='datingTestSet2.txt', k=3,
                    figpath='/home/xxh/tf/KNN/KNN_TEST.png'):
    """Measure the KNN error rate for hold-out ratios from 1% to 99%.

    For every ratio the first ``int(m * ratio)`` normalised samples are
    classified against the remaining samples. The error rate of each split is
    printed, and a scatter plot of error rate versus ratio is saved.

    Args:
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify``.
        figpath: Where the resulting scatter plot is written.
    """
    errorList = []
    ratioList = []
    data, target = file2matrix(filename)
    normdata, ranges, minval = autoNorm(data)
    m = normdata.shape[0]
    for j in range(1, 100):
        testRatio = 0.01 * j
        numTest = int(m * testRatio)
        # Skip splits that cannot be evaluated: no sample to classify (which
        # would divide by zero below) or fewer than k reference samples.
        if numTest == 0 or m - numTest < k:
            continue
        # The reference set is the same for every sample of this split.
        refData = normdata[numTest:m, :]
        refTarget = target[numTest:m]
        errorNum = 0
        for i in range(numTest):
            result = classify(normdata[i, :], refData, refTarget, k)
            if result != target[i]:
                errorNum += 1
        errorRate = errorNum / numTest
        print(j, errorRate)
        ratioList.append(testRatio)
        errorList.append(errorRate)
    figure = plt.figure()
    axx = figure.add_subplot(111)
    axx.scatter(ratioList, errorList)
    plt.savefig(figpath)
datingClassTest()
顺便画了一下测试集比例与错误率之间的关系图:
5.分类器使用
def classifyPerson():
    """Ask for three feature values on stdin and print the predicted class.

    The data set is loaded and normalised. The entered values are scaled with
    the same minimum and range and then classified with k = 3. The printed
    text is the entry of ``resultList`` selected by the predicted label, so
    label 1 maps to the first entry.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percent = float(input("percentage"))
    ffm = float(input("frequence"))
    icecream = float(input("ice-cream"))
    data, target = file2matrix('datingTestSet2.txt')
    normdata, ranges, minval = autoNorm(data)
    # NOTE(review): the vector is built as [ffm, percent, icecream], which is
    # not the order in which the values are prompted for. Presumably this
    # matches the column order of the data file; confirm against the file.
    inArr = array([ffm, percent, icecream])
    # The query must be scaled exactly like the reference data.
    result = classify((inArr - minval) / ranges, normdata, target, 3)
    print(resultList[result - 1])
classifyPerson()