import math

import matplotlib.pyplot as plt
import numpy as np
def createDataSet():
    """
    Create the test data set: 30 two-dimensional samples with continuous
    attribute values (density, sugar content).
    :return: dataSet, labels, labels_full
    """
    dataSet = [
        [0.697, 0.460], [0.774, 0.376], [0.634, 0.264], [0.608, 0.318],
        [0.556, 0.215], [0.403, 0.237], [0.481, 0.149], [0.437, 0.211],
        [0.666, 0.091], [0.243, 0.267], [0.245, 0.057], [0.343, 0.099],
        [0.639, 0.161], [0.657, 0.198], [0.360, 0.370], [0.593, 0.042],
        [0.719, 0.103], [0.359, 0.188], [0.339, 0.241], [0.282, 0.257],
        [0.748, 0.232], [0.714, 0.346], [0.483, 0.312], [0.478, 0.437],
        [0.525, 0.369], [0.751, 0.489], [0.532, 0.472], [0.473, 0.376],
        [0.725, 0.445], [0.446, 0.459],
    ]

    labels = ['density', 'sugar content']

    # For each attribute, record the set of values that actually occur.
    labels_full = {}
    for i in range(len(labels)):
        labelList = [example[i] for example in dataSet]
        uniqueLabel = set(labelList)
        labels_full[labels[i]] = uniqueLabel

    return dataSet, labels, labels_full
def dist_eclud(sample, centers):
    """Return the squared Euclidean distance from one sample to every center."""
    sample = np.array(sample)
    # Repeat the sample so it lines up with each center row, then sum the
    # squared coordinate differences (no square root: the ranking is the same).
    samples = np.tile(sample, (centers.shape[0], 1))
    distances = np.power(samples - centers, 2).sum(axis=1)
    return distances
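# A quick sanity check for dist_eclud (hypothetical values, not part of the
# data set above): squared distances from the origin to two centers.
#
#     >>> dist_eclud([0.0, 0.0], np.array([[1.0, 0.0], [0.0, 2.0]]))
#     array([1., 4.])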
class Kmeans():
    """K-means clustering.

    Parameters:
    -----------
    k: int
        Number of clusters.
    max_iterations: int
        Maximum number of iterations.
    varepsilon: float
        Convergence tolerance: if every one of the k cluster centers differs
        from its value in the previous iteration by less than varepsilon,
        the algorithm is considered to have converged.
    """
    def __init__(self, k=2, max_iterations=500, varepsilon=0.0001):
        self.k = k
        self.max_iterations = max_iterations
        self.varepsilon = varepsilon
    def init_rand_center(self, data) -> np.ndarray:
        """Pick k distinct samples at random as the initial centroids."""
        centroids = np.zeros((self.k, np.shape(data)[1]))
        # replace=False guarantees k different rows are chosen.
        row = np.random.choice(np.shape(data)[0], size=self.k, replace=False)
        for i in range(self.k):
            centroids[i] = data[row[i]]
        return centroids
    def _closest_centroids(self, sample, centroids):
        """Return the index of the centroid closest to the given sample."""
        distance = dist_eclud(sample, centroids)
        closest_i = np.argmin(distance)
        return closest_i
    def create_clusters(self, centroids, X):
        """Assign every sample index to the cluster of its nearest centroid."""
        clusters = [[] for i in range(self.k)]
        for index, sample in enumerate(X):
            centroid_i = self._closest_centroids(sample, centroids)
            clusters[centroid_i].append(index)
        return clusters
    def update_centroids(self, clusters, X):
        """Recompute each centroid as the mean of the samples assigned to it
        (assumes no cluster is empty)."""
        n_features = np.shape(X)[1]
        centroids = np.zeros((self.k, n_features))
        for i, cluster in enumerate(clusters):
            centroid = np.mean(X[cluster], axis=0)
            centroids[i] = centroid
        return centroids
    def get_cluster_labels(self, clusters, X):
        """Turn the list-of-index clusters into a per-sample label vector."""
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, cluster in enumerate(clusters):
            for sample_i in cluster:
                y_pred[sample_i] = cluster_i
        return y_pred
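# Illustrative sketch (not executed): the methods above compose into one
# K-means iteration as follows.
#
#     clusters  = km.create_clusters(centroids, X)    # assignment step
#     centroids = km.update_centroids(clusters, X)    # update step
#     y_pred    = km.get_cluster_labels(clusters, X)  # per-sample labels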
if __name__ == "__main__":
    # Matplotlib font configuration (allows CJK characters in plot labels).
    plt.rcParams['font.sans-serif'] = ['YaHei Consolas Hybrid']
    plt.rcParams['axes.unicode_minus'] = False
    dataset, labels, label_full = createDataSet()
    X = np.array(dataset)
    km = Kmeans(3, 100, 0.001)
    center = km.init_rand_center(X)
    for _ in range(km.max_iterations):
        clusters = km.create_clusters(center, X)
        former_centroids = center
        center = km.update_centroids(clusters, X)
        # Converged once no centroid coordinate moved by varepsilon or more.
        if np.abs(center - former_centroids).max() < km.varepsilon:
            break
    y_pred = km.get_cluster_labels(clusters, X)
    # Plot the samples colored by predicted cluster, and the centroids as squares.
    C = np.c_[X, y_pred]
    plt.scatter(C[:, 0], C[:, 1], c=C[:, 2])
    c = [float(i) for i in range(center.shape[0])]
    plt.scatter(center[:, 0], center[:, 1], c=c, marker='s')
    plt.xlabel('density')
    plt.ylabel('sugar content')
    plt.title('k-means clustering')
    plt.show()
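    # For comparison only (assumes scikit-learn is installed; not used by the
    # script above): the same data can be clustered with sklearn's KMeans.
    #
    #     from sklearn.cluster import KMeans
    #     sk_labels = KMeans(n_clusters=3, max_iter=100, tol=0.001).fit_predict(X)
    #
    # Cluster numbering is arbitrary, so sk_labels may be a permutation of y_pred.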