Helen appointment dataset link
Normalizing the data is recommended in the implementation. In this dataset, the number of kilometers flown per year is far larger in magnitude than the game-time and weekly ice-cream-consumption (liters) features, so changes in it have a much greater impact on the distance calculation and more easily sway the final classification result. In Helen's dating case, however, the three features are considered equally important, so the value range of all features must be limited so that each of the three features has the same influence on the final classification result.
Testing shows that the accuracy on the test data is 80% before normalization and 96% after normalization.
import operator
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
# Import data
def getdata(url):
    """Load the dating dataset from a tab-separated text file.

    Every non-blank line is split on tabs; the first three fields are parsed
    as floats (the features) and the last field is read as the textual label.

    :param url: Path of the data file to read.
    :return: ``(data_matrix, label_matrix)`` where ``data_matrix`` is an
        ``(n, 3)`` float array of features and ``label_matrix`` is a list of
        ``n`` integer labels: 0 for "didntLike", 1 for "smallDoses" and 2 for
        any other label text.
    :raises ValueError: if a feature field cannot be parsed as a float.
    """
    label_codes = {"didntLike": 0, "smallDoses": 1}
    features = []
    label_matrix = []
    # The context manager closes the file even when a line fails to parse.
    with open(url) as fr:
        for line in fr:
            line = line.strip()  # drop surrounding whitespace such as \n, \t
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            linelist = line.split("\t")
            features.append([float(value) for value in linelist[0:3]])
            # Any label other than the two known ones is coded as 2.
            label_matrix.append(label_codes.get(linelist[-1], 2))
    # reshape keeps the (n, 3) shape even when the file holds no samples.
    data_matrix = np.array(features, dtype=float).reshape(-1, 3)
    return data_matrix, label_matrix
# Data visualization
def showdata(data_matrix, label_matrix, max_points=200):
    """Show a 3-D plot of the leading samples, coloured by class label.

    :param data_matrix: 2-D array of samples; columns 0, 1 and 2 are drawn on
        the x, y and z axes respectively.
    :param label_matrix: Sequence of integer labels (0, 1 or 2) aligned with
        the rows of ``data_matrix``; each label selects the marker colour.
    :param max_points: Number of leading samples to draw. Defaults to 200 so
        the figure is not overcrowded.
    :return: None. The figure is displayed through ``plt.show()``.
    """
    # Marker style per label: red, green and blue circles.
    mark = ["or", "og", "ob"]
    points = data_matrix[0:max_points, :]
    labels = label_matrix[0:max_points]

    ax = plt.axes(projection='3d')
    # Empty series exist only to register one legend entry per class.
    ax.plot([], [], [], 'or', label=" Don't like ", markersize=8)
    ax.plot([], [], [], 'og', label=' I like it a little bit ', markersize=8)
    ax.plot([], [], [], 'ob', label=' Partiality ', markersize=8)
    # Axis titles
    ax.set_xlabel(" Kilometers flown per year ")
    ax.set_ylabel(" Number of game hours ")
    ax.set_zlabel(" Litres of ice cream per week ")
    plt.legend()
    # Draw every sample as a single marker in the colour of its label.
    for point, label in zip(points, labels):
        ax.plot(point[0:1], point[1:2], point[2:3], mark[label], markersize=5)
    plt.show()
# Data normalization
def autoNorm(dataSet):
    """Min-max normalize every column of ``dataSet``.

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to 0
    instead of producing NaN from a 0/0 division.

    :param dataSet: 2-D NumPy array with one sample per row.
    :return: ``(normDataSet, ranges, minVals)`` — the normalized array (same
        shape as ``dataSet``), the per-column ``max - min`` and the per-column
        minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN; the
    # returned ``ranges`` still reports the true (zero) range.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safe_ranges
    return normDataSet, ranges, minVals
def KNN(inpudata, data_matrix, label_matrix, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. When several labels receive the same number of
    votes, the label whose first vote came from the nearest neighbour wins.

    :param inpudata: Feature vector of the sample to classify.
    :param data_matrix: 2-D array of reference samples, one per row.
    :param label_matrix: Labels aligned with the rows of ``data_matrix``.
    :param k: Number of nearest neighbours to consult; values larger than the
        number of reference samples are reduced to that number.
    :return: The winning label.
    :raises ValueError: if ``data_matrix`` has no rows or ``k`` is below 1.
    """
    datalen = data_matrix.shape[0]
    if datalen == 0 or k < 1:
        raise ValueError("KNN needs a non-empty data set and k >= 1")
    # Never ask for more neighbours than there are reference samples.
    k = min(k, datalen)

    # Broadcasting subtracts the query from every reference row.
    diff = np.asarray(inpudata) - data_matrix
    distance = (diff ** 2).sum(axis=1) ** 0.5  # Euclidean distance per row
    sorteddisarg = distance.argsort()  # indices ordered nearest first

    classcount = {}  # label -> number of votes
    for idx in sorteddisarg[:k]:
        vote = label_matrix[idx]
        classcount[vote] = classcount.get(vote, 0) + 1
    # max() returns the first maximal key in insertion order, so ties go to
    # the label that was seen first (i.e. from the closer neighbour).
    return max(classcount, key=classcount.get)
# Test the correctness
def classifytest(url="datingTestSet.txt", rate=0.1, k=4):
    """Measure and print the classifier's accuracy on a hold-out split.

    The data set is loaded with ``getdata`` and normalized with ``autoNorm``.
    The leading ``rate`` fraction of the rows is used as test samples and the
    remaining rows as the reference set passed to ``KNN``.

    :param url: Path of the data file (default ``"datingTestSet.txt"``).
    :param rate: Fraction of the rows used as test samples (default 0.1).
    :param k: Number of neighbours passed to ``KNN`` (default 4).
    :return: None. The accuracy is printed as a percentage.
    :raises ValueError: if the split leaves no test samples.
    """
    data_matrix, classify = getdata(url)
    # Only the normalized matrix is needed here; range and minimum are unused.
    normMat, _, _ = autoNorm(data_matrix)
    m = normMat.shape[0]
    datatestnum = int(m * rate)
    if datatestnum == 0:
        raise ValueError("no test samples: the data set is too small for the given rate")

    errornum = 0
    for i in range(datatestnum):
        # Classify test row i against the rows that were not held out.
        result = KNN(normMat[i, :], normMat[datatestnum:m, :], classify[datatestnum:m], k)
        if result != classify[i]:
            errornum += 1
    print(" The correct rate is %.2f%%" % ((1 - (errornum / datatestnum)) * 100))
if __name__ == '__main__':
    classifytest()  # Test the accuracy of the classifier
    classtype = [" Don't like ", " I like it a little bit ", " Partiality "]
    # Features per sample: kilometers flown per year, game time,
    # litres of ice cream per week.
    samples = [
        [40000, 10, 1],     # expected: likes very much
        [10000, 5, 2],      # expected: likes a little
        [70000, 15, 0.6],   # expected: does not like
    ]
    data_matrix, label_matrix = getdata("datingTestSet.txt")
    showdata(data_matrix, label_matrix)
    # Classify in normalized feature space so the large kilometre values do
    # not dominate the distance, consistent with classifytest().
    normMat, ranges, minVals = autoNorm(data_matrix)
    # Guard against a constant feature column (zero range) when scaling.
    scale = np.where(ranges == 0, 1, ranges)
    # Scale each sample with the training minimum/range before classifying.
    results = [
        KNN((np.array(sample) - minVals) / scale, normMat, label_matrix, 5)
        for sample in samples
    ]
    for result in results:
        print(" Helen may %s This man " % classtype[result])