Bisecting K-Means Algorithm
The bisecting K-Means algorithm is an improvement on K-Means, and the main improvement lies in how centroids are chosen. Instead of picking k random initial centroids, it starts with a single cluster and repeatedly splits one cluster in two; this avoids the error caused by a poor random initialization and noticeably improves the quality of the clustering result.
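The quality criterion used throughout the code below is the sum of squared errors (SSE): every point contributes the squared distance to the centroid of its assigned cluster. In the same plain notation as the distance formula further down, for clusters C1..Ck with centroids c1..ck:

SSE = sum over i of ( sum over x in Ci of dist(x, ci)^2 )

At each step, bisecting K-Means tries a 2-way split of every existing cluster and keeps the split that leaves the lowest total SSE.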
Test data: testSet2.txt, columns separated by tabs
3.275154 2.957587
-3.344465 2.603513
0.355083 -3.376585
1.852435 3.547351
-2.078973 2.552013
-0.993756 -0.884433
2.682252 4.007573
-3.087776 2.878713
-1.565978 -1.256985
2.441611 0.444826
-0.659487 3.111284
-0.459601 -2.618005
2.177680 2.387793
-2.920969 2.917485
-0.028814 -4.168078
3.625746 2.119041
-3.912363 1.325108
-0.551694 -2.814223
2.855808 3.483301
-3.594448 2.856651
0.421993 -2.372646
1.650821 3.407572
-2.082902 3.384412
-0.718809 -2.492514
4.513623 3.841029
-4.822011 4.607049
-0.656297 -1.449872
1.919901 4.439368
-3.287749 3.918836
-1.576936 -2.977622
3.598143 1.975970
-3.977329 4.900932
-1.791080 -2.184517
3.914654 3.559303
-1.910108 4.166946
-1.226597 -3.317889
1.148946 3.345138
-2.113864 3.548172
0.845762 -3.589788
2.629062 3.535831
-1.640717 2.990517
-1.881012 -2.485405
4.606999 3.510312
-4.366462 4.023316
0.765015 -3.001270
3.121904 2.173988
-4.025139 4.652310
-0.559558 -3.840539
4.376754 4.863579
-1.874308 4.032237
-0.089337 -3.026809
3.997787 2.518662
-3.082978 2.884822
0.845235 -3.454465
1.327224 3.358778
-2.889949 3.596178
-0.966018 -2.839827
2.960769 3.079555
-3.275518 1.577068
0.639276 -3.412840
Utility functions:
from numpy import *

'''
loadDataSet(fileName) reads a text file into a list.
Each line of the file holds tab-separated floats; every line is converted
to a list of floats and appended to dataMat, so the return value is a
list containing many other lists.
'''
def loadDataSet(fileName):
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))
        dataMat.append(fltLine)
    return dataMat

'''
distEclud(vecA, vecB) computes the Euclidean distance between two vectors.
Formula: sqrt((x1-x2)^2 + (y1-y2)^2)
'''
def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))

'''
randCent(dataSet, k) builds a set of k random centroids for the given data set.
The random centroids must lie within the bounds of the data, which is achieved
by finding the minimum and maximum of every dimension and then scaling a random
number in [0, 1) by the range and adding the minimum.
'''
def randCent(dataSet, k):
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))                 # matrix that will hold the k centroids
    for j in range(n):                             # generate k random centroids within the bounds
        minJ = min(dataSet[:, j])                  # minimum of dimension j
        rangeJ = float(max(dataSet[:, j]) - minJ)  # range of dimension j
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
    return centroids
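biKmeans() below delegates the actual 2-way split to a kMeans() routine that is not listed in this section. For completeness, here is a minimal sketch of the standard K-Means loop it is assumed to follow (assign each point to its nearest centroid, then recompute each centroid as the mean of its points); the intermediate print of the centroids is only an assumption made to match the separators and centroid matrices in the run output further down.

def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))      # column 0: cluster index, column 1: squared error
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):                   # assign every point to its nearest centroid
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        print("--------------------------------------------------")
        print(centroids)                     # intermediate centroids (assumed, see run output)
        for cent in range(k):                # move each centroid to the mean of its points
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment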
Bisecting K-Means algorithm:
'''
Bisecting K-Means clustering algorithm
'''
def biKmeans(dataSet, k, distMeas=distEclud):
    m = shape(dataSet)[0]  # total number of data points
    # Matrix holding the cluster assignment of every point, with two columns:
    # column 0 stores the cluster index, column 1 stores the error.
    # The error is the squared distance from the point to its cluster centroid
    # and is used later to evaluate the quality of the clustering.
    clusterAssment = mat(zeros((m, 2)))
    # the initial centroid is the centroid of the whole data set, i.e. the mean of all points
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]  # list that initially contains only this single centroid
    # compute the squared error of every point with respect to the initial centroid
    for j in range(m):
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :])**2
    # keep splitting clusters until the requested number of clusters is reached
    while (len(centList) < k):
        lowestSSE = inf
        for i in range(len(centList)):  # try splitting every current cluster
            # treat all points of cluster i as a small data set of their own
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
            # run kMeans() with k=2 on that subset - the bisecting step - and get
            # back the two new centroids plus the per-point errors of the split
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # SSE of the points that were just split in two
            sseSplit = sum(splitClustAss[:, 1])
            # SSE of all points that did not take part in this split
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            # the smaller the total (split + not split) SSE, the better the clustering,
            # so remember the split that gives the lowest total error
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # kMeans() labels its two sub-clusters 0 and 1; relabel them to fit the global
        # numbering: sub-cluster 1 becomes a brand-new cluster index, sub-cluster 0
        # keeps the index of the cluster that was split
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is:', bestCentToSplit)
        print('the len of bestClustAss is:', len(bestClustAss))
        # update the centroid list: replace the split centroid with the first new
        # centroid and append the second new centroid at the end
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # reassign the points of the split cluster with their new indices and errors
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return mat(centList), clusterAssment
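Once biKmeans() has returned, the clustering can be judged directly from the second return value, whose error column sums to the total SSE. A minimal sketch (variable names are illustrative; it assumes the tab-separated data file from the top of this section is in the working directory):

dataMat = mat(loadDataSet('testSet2.txt'))
centroids, clustAssing = biKmeans(dataMat, 4)
print("total SSE:", sum(clustAssing[:, 1]))      # sum of squared errors of the final clustering
for i in range(4):                               # size of every cluster
    print("cluster %d: %d points" % (i, len(nonzero(clustAssing[:, 0].A == i)[0])))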
Plotting function:
# plot the clustered points and the final centroids
def show(dataSet, k, centroids, clusterAssment):
    from matplotlib import pyplot as plt
    numSamples, dim = dataSet.shape
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    for i in range(numSamples):                  # draw every point with the marker of its cluster
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):                           # draw the k centroids on top
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=7, color='orange')
    plt.show()
Driver function:
def main():
    # note: this run uses testSet.txt (80 points); testSet2.txt listed above works the same way
    dataMat = mat(loadDataSet('testSet.txt'))
    # request four centroids
    # myCentroids, clustAssing = kMeans(dataMat, 4)
    myCentroids, clustAssing = biKmeans(dataMat, 4)
    print("--------------------------------------------------")
    print("final list of centroids:")
    print(myCentroids)
    print("--------------------------------------------------")
    show(dataMat, 4, myCentroids, clustAssing)

if __name__ == '__main__':
    main()
Run output:
--------------------------------------------------
[[ 0.45583556 -2.83951859]
[ 4.53016863 0.26630517]]
--------------------------------------------------
[[-1.42524564 -0.99766158]
[ 2.80397812 2.36861932]]
--------------------------------------------------
[[-1.4194757 -1.09467459]
[ 2.62933196 2.44063554]]
--------------------------------------------------
[[-1.41575251 -1.18274747]
[ 2.47206763 2.48258193]]
--------------------------------------------------
[[-1.427618 -1.24752315]
[ 2.35525282 2.471975 ]]
--------------------------------------------------
[[-1.42255751 -1.32962551]
[ 2.21590952 2.48810334]]
--------------------------------------------------
[[-1.4083314 -1.41809174]
[ 2.0709171 2.50828943]]
--------------------------------------------------
[[-1.38556017 -1.59171252]
[ 1.81930722 2.52332178]]
--------------------------------------------------
[[-1.14971512 -2.36515883]
[ 0.99613495 2.5978361 ]]
--------------------------------------------------
[[-0.80835482 -2.75696741]
[ 0.56675076 2.72843476]]
--------------------------------------------------
[[-0.54735726 -2.93692713]
[ 0.2978695 2.76065064]]
--------------------------------------------------
[[-0.2897198 -2.83942545]
[ 0.08249337 2.94802785]]
sseSplit, and notSplit: 792.9168565373268 0.0
the bestCentToSplit is: 0
the len of bestClustAss is: 80
--------------------------------------------------
[[ 0.56946516 -2.26967164]
[-1.82366006 -2.2423357 ]]
--------------------------------------------------
[[ 2.65077367 -2.79019029]
[-3.53973889 -2.89384326]]
sseSplit, and notSplit: 84.25921395268443 326.2840752011824
--------------------------------------------------
[[ 2.64689211 2.65678291]
[-1.06102414 3.09849093]]
--------------------------------------------------
[[ 2.6265299 3.10868015]
[-2.46154315 2.78737555]]
sseSplit, and notSplit: 66.36683512000786 466.63278133614426
the bestCentToSplit is: 0
the len of bestClustAss is: 40
--------------------------------------------------
[[ 2.8424327 -2.29684331]
[ 0.58029953 -3.49421623]]
--------------------------------------------------
[[ 3.03713839 -2.62802833]
[ 0.33258533 -3.763162 ]]
sseSplit, and notSplit: 40.03362852612693 348.38731175812995
--------------------------------------------------
[[-0.97552283 2.72658352]
[ 2.17396721 1.63579194]]
--------------------------------------------------
[[-2.46154315 2.78737555]
[ 2.6265299 3.10868015]]
sseSplit, and notSplit: 66.36683512000786 84.25921395268443
--------------------------------------------------
[[-3.53396447 -2.69981673]
[-2.29716453 -2.54415119]]
--------------------------------------------------
[[-3.98132369 -2.837576 ]
[-2.58297183 -3.01575567]]
--------------------------------------------------
[[-4.12637645 -2.82110091]
[-2.73311225 -2.993864 ]]
--------------------------------------------------
[[-4.1986774 -2.8253822 ]
[-2.807585 -2.96991111]]
sseSplit, and notSplit: 12.83784706044089 388.4400525969194
the bestCentToSplit is: 1
the len of bestClustAss is: 40
--------------------------------------------------
final list of centroids:
[[ 2.65077367 -2.79019029]
[-2.46154315 2.78737555]
[-3.53973889 -2.89384326]
[ 2.6265299 3.10868015]]
--------------------------------------------------