#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Sample program for KMeans cluster analysis.
Distance metrics between samples (i.e. between points):
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html#sklearn.metrics.pairwise.pairwise_distances
Evaluating a KMeans clustering with silhouette analysis, plotting it, and choosing the number of clusters:
http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
Reducing dimensionality with principal component analysis (PCA) before KMeans:
http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py
'''
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score, silhouette_samples
# Load the sample dataset
from sklearn import datasets
dataset = datasets.load_iris()
X = dataset.data
# y = dataset.target  # The true class labels. KMeans is an unsupervised (automatic) clustering algorithm, so it does not learn from these true labels.
n_samples, n_features = X.shape
print('n_samples = {}, n_features = {}'.format(n_samples, n_features))
"""
需要特别注意的是:这里的样例数据比较简单,每个样本的不同特征的量纲和数量级相同,但是我们要面对的问题可能不是这样:
一天(看作一个样本、一个点)的特征有风速风向、最高最低气温、湿度、气压,及其24小时变化值等,
不同要素的量纲和数量级不同,因此最好在进行 KMeans 聚类分析之前,先对每个要素进行标注化或归一化处理。
归一化即 (x-min(x))/(max(x)-min(x)),把 x 转变为 [0, 1] 区间内。
标准化即 (x-mean(x))/std(x),结果有正有负,标准差变为 1。
"""
"""
以下两个 KMeans 样例程序,都是把示例数据 X 分成 3 类
"""
def test_KMeans_default(X=X, n_clusters=3):
    # Run KMeans directly on X; the default distance is 'euclidean', i.e. Euclidean distance.
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=1).fit(X)
    cluster_labels = kmeans_model.labels_
    silhouette_avg = silhouette_score(X, cluster_labels, metric='euclidean')
    sample_silhouette_values = silhouette_samples(X, cluster_labels, metric='euclidean')
    # print(sample_silhouette_values)
    print('Average silhouette value: {}'.format(silhouette_avg))
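# A minimal sketch of using the average silhouette score to choose the number of clusters,
# following the silhouette-analysis example linked in the module docstring. The helper name
# and the candidate values are illustrative assumptions, not part of the original examples.
def choose_n_clusters_by_silhouette(X=X, candidates=(2, 3, 4, 5, 6)):
    for k in candidates:
        labels = KMeans(n_clusters=k, random_state=1).fit_predict(X)
        score = silhouette_score(X, labels, metric='euclidean')
        print('n_clusters = {}: average silhouette = {:.3f}'.format(k, score))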
def test_KMeans_using_custom_distance(X=X, n_clusters=3, metric='correlation'):
    # First compute the distance matrix D with a chosen metric, e.g. 'correlation',
    # i.e. 1 minus the Pearson correlation coefficient, which is more commonly used
    # in the problems we need to solve.
    D = pairwise_distances(X, metric=metric)
    clusterer = KMeans(n_clusters=n_clusters, random_state=1, precompute_distances=True)
    cluster_labels = clusterer.fit_predict(D)
    silhouette_avg = silhouette_score(D, cluster_labels, metric='precomputed')
    sample_silhouette_values = silhouette_samples(D, cluster_labels, metric='precomputed')
    # print(sample_silhouette_values)
    print('Average silhouette value: {}'.format(silhouette_avg))
"""
以下绘图程序复制粘贴自官网示例,样例数据 X 只有 4 个特征,因此右侧的图能看出分类。
对于高维度的气象数据而言,例如,一天(看作一个样本、一个点)的特征有风速风向、最高最低气温、湿度、气压,及其24小时变化值等,
因此,无法在简单的二维平面显示他们的分类:画出来的点,虽然属于不同类别,但是都会叠在一起,分不出来。
"""
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    # # 2nd Plot showing the actual clusters formed
    # colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
    # ax2.scatter(X[:, 1], X[:, 2], marker='.', s=30, lw=0, alpha=0.7,
    #             c=colors)
    # # Labeling the clusters
    # centers = clusterer.cluster_centers_
    # # Draw white circles at cluster centers
    # ax2.scatter(centers[:, 0], centers[:, 1],
    #             marker='o', c="white", alpha=1, s=200)
    # for i, c in enumerate(centers):
    #     ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)
    # ax2.set_title("The visualization of the clustered data.")
    # ax2.set_xlabel("Feature space for the 1st feature")
    # ax2.set_ylabel("Feature space for the 2nd feature")
    # plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
    #               "with n_clusters = %d" % n_clusters),
    #              fontsize=14, fontweight='bold')
    plt.show()
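# A minimal sketch of the PCA-before-KMeans idea linked in the module docstring: reduce the
# features to a few principal components first, then cluster the reduced data. The function
# name and the choice of n_components=2 are illustrative assumptions.
def test_KMeans_with_PCA(X=X, n_clusters=3, n_components=2):
    from sklearn.decomposition import PCA
    X_reduced = PCA(n_components=n_components).fit_transform(X)
    labels = KMeans(n_clusters=n_clusters, random_state=1).fit_predict(X_reduced)
    silhouette_avg = silhouette_score(X_reduced, labels, metric='euclidean')
    print('Average silhouette value (PCA-reduced data): {}'.format(silhouette_avg))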
if __name__ == '__main__':
    # test_KMeans_using_custom_distance(metric='euclidean')
    test_KMeans_using_custom_distance()