The clustering result depends on the initial values, but maybe not as much as k-means does?
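One way to put a number on that question: the sketch below refits scikit-learn's GaussianMixture (EM) and KMeans on iris with several different random seeds and measures how much the labelings move, via the mean pairwise adjusted Rand index (values near 1.0 mean the result barely depends on initialization). This is a minimal sketch, assuming a scikit-learn version that provides sklearn.mixture.GaussianMixture; the seed_stability helper is mine, not part of the script below.

import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score

data = datasets.load_iris().data

def seed_stability(fit_predict, n_seeds=10):
    # Mean pairwise ARI between labelings obtained from different seeds.
    runs = [fit_predict(seed) for seed in range(n_seeds)]
    scores = [adjusted_rand_score(runs[a], runs[b])
              for a in range(n_seeds) for b in range(a+1, n_seeds)]
    return np.mean(scores)

# Random initialization and a single restart, so seed sensitivity is visible.
gmm_stability = seed_stability(
    lambda s: GaussianMixture(n_components=3, init_params='random',
                              n_init=1, random_state=s).fit(data).predict(data))
kmeans_stability = seed_stability(
    lambda s: KMeans(n_clusters=3, init='random', n_init=1,
                     random_state=s).fit(data).labels_)
print("GMM mean pairwise ARI:     %.3f" % gmm_stability)
print("k-means mean pairwise ARI: %.3f" % kmeans_stability)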
#coding:utf-8
import numpy as np
from sklearn import datasets
import math
from numpy import dot
from numpy.linalg import inv
from numpy.linalg import det
import random
from numpy.matlib import repmat
from numpy import matrix
import matplotlib.pyplot as plt

iris = datasets.load_iris()
data = iris.data
target = iris.target

d = 4       # feature dimension
K = 3       # number of classes
L_last = 0  # log-likelihood at the previous step

xs = data[:,0]  # for scatter plot
ys = data[:,2]
plot_s = [30 for i in range(len(xs))]  # scatter plot point size

"""functions"""
def Gaussian(x, mu, sigma):
    """Multivariate normal density N(x | mu, sigma)."""
    # The normalizing constant is (2*pi)**(d/2), with d the dimension
    # (the original used (2*pi)**(1/K), which is wrong and, with integer
    # division in Python 2, silently evaluated to (2*pi)**0).
    coef = 1.0/(((2*math.pi)**(d/2.0))*(det(sigma))**0.5)
    exp = math.e**(-(1/2.0)*dot(dot(matrix(x-mu),inv(sigma)),matrix(x-mu).T).item())
    N = coef * exp
    return N

def makeColor(labels):
    colors = []
    for l in labels:
        if l==0:
            colors.append('r')
        elif l==1:
            colors.append('g')
        elif l==2:
            colors.append('b')
    return colors

def calcPrecision(target, results):
    # Assumes cluster indices already correspond to the true class labels;
    # this holds here only because gamma is initialized from the true labels.
    success = 0
    fail = 0
    for i in range(len(target)):
        if target[i]==results[i]:
            success = success + 1
        else:
            fail = fail + 1
    return success/float(success+fail)*100

"""initialize"""
gamma = np.zeros((len(data),K))
# for i in range(len(gamma)):
#     problist = [3.5/10.0, 3.4/10.0, 3.1/10.0]
#     rindex = int(math.floor(random.random()*3))
#     g0 = problist.pop(rindex)
#     rindex = int(math.floor(random.random()*2))
#     g1 = problist.pop(rindex)
#     g2 = problist[0]
#     gamma[i][0] = g0
#     gamma[i][1] = g1
#     gamma[i][2] = g2
# With probability 0.3 put weight 0.8 on the true class, otherwise on a random class.
for i in range(len(gamma)):
    true_label = target[i]
    luck = random.random()
    if (luck<0.3):
    # if True:  # all pass
        gamma[i][0] = 1/10.0
        gamma[i][1] = 1/10.0
        gamma[i][2] = 1/10.0
        gamma[i][true_label] = 8/10.0
    else:
        nums = [0,1,2]
        idx1 = int(math.floor(random.random()*3))
        rand_idx1 = nums.pop(idx1)
        gamma[i][rand_idx1] = 8/10.0
        idx2 = int(math.floor(random.random()*2))
        rand_idx2 = nums.pop(idx2)
        gamma[i][rand_idx2] = 1/10.0
        rand_idx3 = nums[0]
        gamma[i][rand_idx3] = 1/10.0

pi = np.array([1/3.0,1/3.0,1/3.0])  # mixing coefficients

labels = np.array([g.index(max(g)) for g in gamma.tolist()])
initial_labels = np.array([g.index(max(g)) for g in gamma.tolist()])

# Initial per-cluster means and covariances from the initial hard labels.
idx_0 = np.where(labels==0)[0]
x_0 = data[idx_0,:]
mu_0 = sum(x_0) / len(x_0)
mu_0s = repmat(mu_0,len(x_0),1)
S0 = np.zeros((d,d))
for i in range(d):
    for j in range(d):
        S0[i,j] = sum((x_0[:,i]-mu_0s[:,i])*(x_0[:,j]-mu_0s[:,j]))/len(x_0)

idx_1 = np.where(labels==1)[0]
x_1 = data[idx_1,:]
mu_1 = sum(x_1) / len(x_1)
mu_1s = repmat(mu_1,len(x_1),1)
S1 = np.zeros((d,d))
for i in range(d):
    for j in range(d):
        S1[i,j] = sum((x_1[:,i]-mu_1s[:,i])*(x_1[:,j]-mu_1s[:,j]))/len(x_1)

idx_2 = np.where(labels==2)[0]
x_2 = data[idx_2,:]
mu_2 = sum(x_2) / len(x_2)
mu_2s = repmat(mu_2,len(x_2),1)
S2 = np.zeros((d,d))
for i in range(d):
    for j in range(d):
        S2[i,j] = sum((x_2[:,i]-mu_2s[:,i])*(x_2[:,j]-mu_2s[:,j]))/len(x_2)

MAX_LOOP = 100000
success_array = []

"""EM algorithm"""
for epoch in range(MAX_LOOP):
    """E-step"""
    # Responsibilities: gamma[i][k] = pi_k N(x_i|mu_k,S_k) / sum_j pi_j N(x_i|mu_j,S_j)
    for i in range(len(data)):
        xi = data[i,:]
        Gauss0 = Gaussian(xi,mu_0,S0)
        Gauss1 = Gaussian(xi,mu_1,S1)
        Gauss2 = Gaussian(xi,mu_2,S2)
        gaussian_array = np.array([Gauss0,Gauss1,Gauss2])
        for k in range(K):
            gamma[i][k] = (pi[k]*gaussian_array[k])/sum(pi*gaussian_array)

    """M-step"""
    N0 = sum(gamma[:,0]).item()
    N1 = sum(gamma[:,1]).item()
    N2 = sum(gamma[:,2]).item()
    mu_0 = 1/N0 * sum([gamma[i,0].item()*data[i,:] for i in range(len(data))])
    mu_1 = 1/N1 * sum([gamma[i,1].item()*data[i,:] for i in range(len(data))])
    mu_2 = 1/N2 * sum([gamma[i,2].item()*data[i,:] for i in range(len(data))])
    mu = np.array([mu_0,mu_1,mu_2])

    S0 = np.zeros((d,d))
    for i in range(len(data)):
        delta = gamma[i,0]*dot(matrix(data[i,:]-mu_0).T,matrix(data[i,:]-mu_0))/N0
        S0 = S0 + delta
    S1 = np.zeros((d,d))
    for i in range(len(data)):
        delta = gamma[i,1]*dot(matrix(data[i,:]-mu_1).T,matrix(data[i,:]-mu_1))/N1
        S1 = S1 + delta
    S2 = np.zeros((d,d))
    for i in range(len(data)):
        delta = gamma[i,2]*dot(matrix(data[i,:]-mu_2).T,matrix(data[i,:]-mu_2))/N2
        S2 = S2 + delta
    S = [S0,S1,S2]

    N = N0+N1+N2
    pi_0 = N0/N
    pi_1 = N1/N
    pi_2 = N2/N
    pi = np.array([pi_0,pi_1,pi_2])

    # Hard assignment: z_ik = 1 for the component with the largest responsibility.
    zs = np.zeros((len(data),K))
    for i in range(len(gamma)):
        g = gamma[i]
        k = np.where(g==max(g))[0].tolist()[0]
        if(labels[i] != k):
            print "#",  #->update label
            labels[i] = k
        zs[i][k] = 1

    # Complete-data log-likelihood under the hard assignments zs.
    # The quadratic form is extracted with .item() so L stays a plain float.
    L = 0
    for i in range(len(zs)):
        for k in range(K):
            quad = dot(dot(matrix(data[i,:]-mu[k]),inv(S[k])),matrix(data[i,:]-mu[k]).T).item()
            L = L + zs[i][k]*(math.log(pi[k])
                              - d/2.0*math.log(2*math.pi)
                              - 1/2.0*math.log(det(S[k]))
                              - 1/2.0*quad)

    if(abs(L-L_last)<math.e**-20):
        print "converged"
        break
    L_last = L

    rate = calcPrecision(target,labels)
    print "Likelihood:",L,"rate:",rate
    success_array.append([epoch,L,rate])

print "initialized data",initial_labels
print "computed result",labels
print "correct data",target

# plt.subplot2grid((2,2),(0,0),rowspan=2)
plt.subplot(221)
plt.scatter(xs,ys,plot_s,makeColor(initial_labels))
plt.title('Initialized sample')
plt.subplot(223)
plt.scatter(xs,ys,plot_s,makeColor(labels))
plt.title('Classification using EM algorithm')

res = np.array(success_array)
ts = res[:,0]
Ls = res[:,1]
Rates = res[:,2]

# plt.subplot2grid((2,2),(0,1))
plt.subplot(222)
plt.plot(ts,Ls,'b-')
plt.title('Likelihood')
# plt.subplot2grid((2,2),(1,1))
plt.subplot(224)
plt.plot(ts,Rates,'r-')
plt.title('Precision')

plt.figure()
plt.scatter(xs,ys,plot_s,makeColor(target))
plt.title('original')
plt.show()
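One caveat about the convergence check: the L monitored above is built from the hard assignments zs, i.e. a complete-data log-likelihood, which EM does not guarantee to increase monotonically. The standard quantity to track is the incomplete-data log-likelihood L = Σ_i log Σ_k π_k N(x_i | μ_k, S_k). Below is a minimal sketch of that computation, assuming scipy is available; log_likelihood is a hypothetical helper, not part of the original script, and takes the script's pi, mu, and S = [S0, S1, S2] as-is.

import numpy as np
from scipy.stats import multivariate_normal

def log_likelihood(data, pi, mu, S):
    # pi: (K,) mixing weights; mu[k], S[k]: mean vector and covariance of component k.
    total = 0.0
    for x in data:
        # Mixture density at x: sum_k pi_k * N(x | mu_k, S_k)
        mix = sum(pi[k] * multivariate_normal.pdf(x, mean=mu[k], cov=S[k])
                  for k in range(len(pi)))
        total += np.log(mix)
    return total

# e.g. after the M-step of the script above:
# print("incomplete-data log-likelihood: %f" % log_likelihood(data, pi, mu, S))

If the convergence test is switched to this quantity, the likelihood curve in the top-right subplot becomes non-decreasing by construction, which makes it easier to spot implementation bugs.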