K-Means算法和K-Means++算法的聚类

在构成圆形的30000个随机样本点上,设置7个簇,使用K-Means算法聚类

from math import pi, sin, cos
from collections import namedtuple
from random import random, choice
from copy import copy
import matplotlib.pyplot as plt
import numpy as np
import sys

try:
    import psyco
    psyco.full()
except ImportError:
    pass


FLOAT_MAX = 1e100


class Point:
    __slots__ = ["x", "y", "group"]
    def __init__(self, x=0.0, y=0.0, group=1):
        self.x, self.y, self.group = x, y, group


def generate_points(npoints, radius):
    points = [Point() for _ in range(npoints)]

    # note: this is not a uniform 2-d distribution
    for p in points:
        r = random() * radius
        ang = random() * 2 * pi
        p.x = r * cos(ang)
        p.y = r * sin(ang)

    return points


def nearest_cluster_center(point, cluster_centers):
    """Distance and index of the closest cluster center"""
    def sqr_distance_2D(a, b):
        return (a.x - b.x) ** 2  +  (a.y - b.y) ** 2

    min_index = point.group
    min_dist = FLOAT_MAX

    for i, cc in enumerate(cluster_centers):
        d = sqr_distance_2D(cc, point)
        if min_dist > d:
            min_dist = d
            min_index = i

    return (min_index, min_dist)


def kpp(points, cluster_centers):
    cluster_centers[0] = copy(choice(points))
    d = [0.0 for _ in range(len(points))]

    for i in range(1, len(cluster_centers)):
        sum = 0
        for j, p in enumerate(points):
            d[j] = nearest_cluster_center(p, cluster_centers[:i])[1]
            sum += d[j]

        sum *= random()

        for j, di in enumerate(d):
            sum -= di
            if sum > 0:
                continue
            cluster_centers[i] = copy(points[j])
            break

    for p in points:
        p.group = nearest_cluster_center(p, cluster_centers)[0]


def lloyd(points, nclusters):
    cluster_centers = [Point() for _ in range(nclusters)]

    # call k++ init
    kpp(points, cluster_centers)

    lenpts10 = len(points) >> 10

    changed = 0
    while True:
        # group element for centroids are used as counters
        for cc in cluster_centers:
            cc.x = 0
            cc.y = 0
            cc.group = 0

        for p in points:
            cluster_centers[p.group].group += 1
            cluster_centers[p.group].x += p.x
            cluster_centers[p.group].y += p.y

        for cc in cluster_centers:
            cc.x /= cc.group
            cc.y /= cc.group

        # find closest centroid of each PointPtr
        changed = 0
        for p in points:
            min_i = nearest_cluster_center(p, cluster_centers)[0]
            if min_i != p.group:
                changed += 1
                p.group = min_i

        # stop when 99.9% of points are good
        if changed <= lenpts10:
            break

    for i, cc in enumerate(cluster_centers):
        cc.group = i

    return cluster_centers

def kmeans(points, nclusters):
    cluster_centers = [copy(choice(points)) for _ in xrange(nclusters)]
    lenpts10 = len(points) >> 10
    changed = 0
    while True:
        for p in points:
            cluster_centers[p.group].group += 1
            cluster_centers[p.group].x += p.x
            cluster_centers[p.group].y += p.y

        for cc in cluster_centers:
            #print cc.x,cc.y,cc.group
            cc.x /= cc.group
            cc.y /= cc.group
        changed = 0
        for p in points:
            min_i = nearest_cluster_center(p, cluster_centers)[0]
            if min_i != p.group:
                changed += 1
                p.group = min_i

        # stop when 99.9% of points are good
        if changed <= lenpts10:
            break
    for i, cc in enumerate(cluster_centers):
        cc.group = i

    return cluster_centers

def print_eps(points, cluster_centers):

    c = np.random.rand(len(cluster_centers)+1,3)
    for i, cc in enumerate(cluster_centers):
        plot_x = []
        plot_y = []
        print (cc.x,cc.y,cc.group)
        for p in points:
            if p.group != i:
                continue
            plot_x.append(p.x)
            plot_y.append(p.y)
        plt.scatter(plot_x,plot_y,color=c[i],s=1)
        plt.scatter(cc.x,cc.y, color=c[len(c)-i-1], marker='^',s=50)
    plt.show()


def main():
    npoints = 30000
    k = 7 # # clusters

    points = generate_points(npoints, 10)
    cluster_centers = lloyd(points, k)
    #cluster_centers = kmeans(points,k)
    print_eps(points, cluster_centers)

main()

4.752567342690781 -4.334890736846155 0
-1.3573754262792683 6.986665631867061 1
-0.15969173199245487 2.506935816564994 2
-6.2690747305026315 0.7728991240509486 3
-2.713392739613833 -5.828585413438269 4
5.771515155816214 2.91539616422218 5
0.07082712668661448 -0.7504142150100154 6

K-Means算法和K-Means++算法的聚类