import numpy as np from sklearn import datasets import math from pylab import * iris = datasets.load_iris() irisdata = iris.data irislabel = iris.target def calcDistance(point,target): xp=point[0] yp=point[1] xt=target[0] yt=target[1] dist = math.sqrt((xp-xt)**2+(yp-yt)**2) return dist def computeLabel(k,min_j_arr): if k==1: label = irislabel[min_j_arr[0]] elif k==3: candidates = [irislabel[min_j_arr[0]],irislabel[min_j_arr[1]],irislabel[min_j_arr[2]]] label = np.bincount(candidates).argmax() # decide by majority elif k==5: candidates = [irislabel[min_j_arr[0]],irislabel[min_j_arr[1]],irislabel[min_j_arr[2]],irislabel[min_j_arr[3]],irislabel[min_j_arr[4]]] label= np.bincount(candidates).argmax() # decide by majority return label def computeMinimumDistanceIndex(k,d,min_j_arr,min_d_arr,j): if k==1: if d < min_d_arr[0]: min_d_arr[0] = d min_j_arr[0] = j elif k==3: if d < min_d_arr[0]: min_d_arr[0] = d min_j_arr[0] = j elif d < min_d_arr[1]: min_d_arr[1] = d min_j_arr[1] = j elif d < min_d_arr[2]: min_d_arr[2] = d min_j_arr[2] = j elif k==5: if d < min_d_arr[0]: min_d_arr[0] = d min_j_arr[0] = j elif d < min_d_arr[1]: min_d_arr[1] = d min_j_arr[1] = j elif d < min_d_arr[2]: min_d_arr[2] = d min_j_arr[2] = j elif d < min_d_arr[3]: min_d_arr[3] = d min_j_arr[3] = j elif d < min_d_arr[4]: min_d_arr[4] = d min_j_arr[4] = j def kNN(k,coords): trained = [] xs = coords[:,0] ys = coords[:,1] for i in range(len(xs)): xp = xs[i] yp = ys[i] min_d_arr = [] min_j_arr = [] for i in range(k): min_d_arr.append(math.sqrt(xp**2 + yp**2)) # initialize array with arbitral large number for i in range(k): min_j_arr.append(0) # initialize array with zero for j in range(len(xs)): if i==j: continue xt = xs[j] yt = ys[j] d = calcDistance((xp,yp),(xt,yt)) computeMinimumDistanceIndex(k,d,min_j_arr,min_d_arr,j) label = computeLabel(k,min_j_arr) trained.append(label) return trained def label_to_color(label_array): cs = [] for l in label_array: if l==0: cs.append('r') elif l==1: cs.append('g') elif l==2: cs.append('b') return cs def compute_score(label_train,label_trained): success = 0 fail = 0 for i in range(len(label_train)): if (label_train[i]==label_trained[i]): success = success + 1 else: fail = fail + 1 score = success / float(success+fail) * 100 return score trained1 = kNN(1,irisdata) trained3 = kNN(3,irisdata) trained5 = kNN(5,irisdata) score1 = compute_score(irislabel,trained1) score3 = compute_score(irislabel,trained3) score5 = compute_score(irislabel,trained5) print "1-NN score: ",score1 print "3-NN score: ",score3 print "5-NN score: ",score5 t1= label_to_color(irislabel) c1 = label_to_color(trained1) c3 = label_to_color(trained3) c5 = label_to_color(trained5) xs = irisdata[:,0] ys = irisdata[:,1] subplot(221) scatter(xs,ys,c=t1) title('datasets') subplot(222) scatter(xs,ys,c=c1) title('1-NN classifier') subplot(223) scatter(xs,ys,c=c3) title('3-NN classifier') subplot(224) scatter(xs,ys,c=c5) title('5-NN classifier') show()
実行結果
テストデータに対しては1-NN法が最も精度が高い
1-NN score: 92.0 3-NN score: 85.3333333333 5-NN score: 80.0
Sepal.LengthとPetal.Lengthで試したら精度が高くなった
import numpy as np from sklearn import datasets import math from pylab import * iris = datasets.load_iris() irisdata = iris.data irislabel = iris.target def calcDistance(point,target): xp=point[0] yp=point[1] xt=target[0] yt=target[1] dist = math.sqrt((xp-xt)**2+(yp-yt)**2) return dist def computeLabel(k,min_j_arr): if k==1: label = irislabel[min_j_arr[0]] elif k==3: candidates = [irislabel[min_j_arr[0]],irislabel[min_j_arr[1]],irislabel[min_j_arr[2]]] label = np.bincount(candidates).argmax() # decide by majority elif k==5: candidates = [irislabel[min_j_arr[0]],irislabel[min_j_arr[1]],irislabel[min_j_arr[2]],irislabel[min_j_arr[3]],irislabel[min_j_arr[4]]] label= np.bincount(candidates).argmax() # decide by majority return label def computeMinimumDistanceIndex(k,d,min_j_arr,min_d_arr,j): if k==1: if d < min_d_arr[0]: min_d_arr[0] = d min_j_arr[0] = j elif k==3: if d < min_d_arr[0]: min_d_arr[0] = d min_j_arr[0] = j elif d < min_d_arr[1]: min_d_arr[1] = d min_j_arr[1] = j elif d < min_d_arr[2]: min_d_arr[2] = d min_j_arr[2] = j elif k==5: if d < min_d_arr[0]: min_d_arr[0] = d min_j_arr[0] = j elif d < min_d_arr[1]: min_d_arr[1] = d min_j_arr[1] = j elif d < min_d_arr[2]: min_d_arr[2] = d min_j_arr[2] = j elif d < min_d_arr[3]: min_d_arr[3] = d min_j_arr[3] = j elif d < min_d_arr[4]: min_d_arr[4] = d min_j_arr[4] = j def kNN(k,coords): trained = [] xs = coords[:,0] ys = coords[:,2] for i in range(len(xs)): xp = xs[i] yp = ys[i] min_d_arr = [] min_j_arr = [] for i in range(k): min_d_arr.append(math.sqrt(xp**2 + yp**2)) # initialize array with arbitral large number for i in range(k): min_j_arr.append(0) # initialize array with zero for j in range(len(xs)): if i==j: continue xt = xs[j] yt = ys[j] d = calcDistance((xp,yp),(xt,yt)) computeMinimumDistanceIndex(k,d,min_j_arr,min_d_arr,j) label = computeLabel(k,min_j_arr) trained.append(label) return trained def label_to_color(label_array): cs = [] for l in label_array: if l==0: cs.append('r') elif l==1: cs.append('g') elif l==2: cs.append('b') return cs def compute_score(label_train,label_trained): success = 0 fail = 0 for i in range(len(label_train)): if (label_train[i]==label_trained[i]): success = success + 1 else: fail = fail + 1 score = success / float(success+fail) * 100 return score trained1 = kNN(1,irisdata) trained3 = kNN(3,irisdata) trained5 = kNN(5,irisdata) score1 = compute_score(irislabel,trained1) score3 = compute_score(irislabel,trained3) score5 = compute_score(irislabel,trained5) print "1-NN score: ",score1 print "3-NN score: ",score3 print "5-NN score: ",score5 t1= label_to_color(irislabel) c1 = label_to_color(trained1) c3 = label_to_color(trained3) c5 = label_to_color(trained5) xs = irisdata[:,0] ys = irisdata[:,2] subplot(221) scatter(xs,ys,c=t1) title('datasets') subplot(222) scatter(xs,ys,c=c1) title('1-NN classifier') subplot(223) scatter(xs,ys,c=c3) title('3-NN classifier') subplot(224) scatter(xs,ys,c=c5) title('5-NN classifier') show()
1-NN score: 99.3333333333 3-NN score: 96.0 5-NN score: 95.3333333333