K-means聚类分析应用示例

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''KMeans 聚类分析样例程序

关于样本和样本（点和点）之间距离的算法：
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html#sklearn.metrics.pairwise.pairwise_distances

用 silhouette 评估 KMeans 聚类分析，画图，确定分类数目：
http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

在 KMeans 之前先用主成分分析（PCA）降维：
http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py

'''


import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score, silhouette_samples


# 载入样例数据
from sklearn import datasets
dataset = datasets.load_iris()
X = dataset.data
# y = dataset.target  # 这是实际分类标签。由于 KMeans 是非监督（自动）分类算法，因此不用学习这个真实分类。
n_samples, n_features = X.shape
print('n_samples = {}, n_features = {}'.format(n_samples, n_features))


"""
    需要特别注意的是：这里的样例数据比较简单，每个样本的不同特征的量纲和数量级相同，但是我们要面对的问题可能不是这样：
    一天（看作一个样本、一个点）的特征有风速风向、最高最低气温、湿度、气压，及其24小时变化值等，
    不同要素的量纲和数量级不同，因此最好在进行 KMeans 聚类分析之前，先对每个要素进行标注化或归一化处理。
    归一化即 (x-min(x))/(max(x)-min(x))，把 x 转变为 [0, 1] 区间内。
    标准化即 (x-mean(x))/std(x)，结果有正有负，标准差变为 1。
"""


""" 
    以下两个 KMeans 样例程序，都是把示例数据 X 分成 3 类 
"""


def test_KMeans_default(X=X, n_clusters=3):
    # 直接采用 KMeans 进行聚类分析，默认距离计算方法是 euclidean，即欧几里得距离
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=1).fit(X)
    cluster_labels = kmeans_model.labels_
    silhouette_avg = silhouette_score(X, cluster_labels, metric='euclidean')
    sample_silhouette_values = silhouette_samples(X, cluster_labels, metric='euclidean')
    # print sample_silhouette_values
    print('Average silhouette value: {}'.format(silhouette_avg))


def test_KMeans_using_custom_distance(X=X, n_clusters=3, metric='correlation'):
    # 先按照某种距离计算方法计算距离矩阵 D，如采用 correlation 算法，即 1 减去皮尔逊相关系数，这种算法在我们需要解决的问题中更常用
    D = pairwise_distances(X, metric=metric)
    clusterer = KMeans(n_clusters=n_clusters, random_state=1, precompute_distances=True)
    cluster_labels = clusterer.fit_predict(D)
    silhouette_avg = silhouette_score(D, cluster_labels, metric='precomputed')
    sample_silhouette_values = silhouette_samples(D, cluster_labels, metric='precomputed')
    # print sample_silhouette_values
    print('Average silhouette value: {}'.format(silhouette_avg))

    """
        以下绘图程序复制粘贴自官网示例，样例数据 X 只有 4 个特征，因此右侧的图能看出分类。
        对于高维度的气象数据而言，例如，一天（看作一个样本、一个点）的特征有风速风向、最高最低气温、湿度、气压，及其24小时变化值等，
        因此，无法在简单的二维平面显示他们的分类：画出来的点，虽然属于不同类别，但是都会叠在一起，分不出来。
    """
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # # 2nd Plot showing the actual clusters formed
    # colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
    # ax2.scatter(X[:, 1], X[:, 2], marker='.', s=30, lw=0, alpha=0.7,
    #             c=colors)

    # # Labeling the clusters
    # centers = clusterer.cluster_centers_
    # # Draw white circles at cluster centers
    # ax2.scatter(centers[:, 0], centers[:, 1],
    #             marker='o', c="white", alpha=1, s=200)

    # for i, c in enumerate(centers):
    #     ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

    # ax2.set_title("The visualization of the clustered data.")
    # ax2.set_xlabel("Feature space for the 1st feature")
    # ax2.set_ylabel("Feature space for the 2nd feature")

    # plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
    #               "with n_clusters = %d" % n_clusters),
    #              fontsize=14, fontweight='bold')

    plt.show()


if __name__ == '__main__':
    # test_KMeans_using_custom_distance(metric='euclidean')
    test_KMeans_using_custom_distance()