年薪30W进阶指南-【聚类分析】王者荣耀英雄数据
# -*- coding: utf-8 -*- import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.mixture import GaussianMixture from sklearn.preprocessing import StandardScaler
In [2]:
# Load the hero attribute table. The encoding is passed explicitly so the
# Chinese column names and values are not garbled on read.
data_ori = pd.read_csv(
    '/home/kesci/input/hero9310/heros.csv',
    encoding='gb18030',
)

# Attribute columns examined in the analysis.
features = [
    u'最大生命', u'生命成长', u'初始生命',
    u'最大法力', u'法力成长', u'初始法力',
    u'最高物攻', u'物攻成长', u'初始物攻',
    u'最大物防', u'物防成长', u'初始物防',
    u'最大每5秒回血', u'每5秒回血成长', u'初始每5秒回血',
    u'最大每5秒回蓝', u'每5秒回蓝成长', u'初始每5秒回蓝',
    u'最大攻速', u'攻击范围',
]

# Keep only the attribute columns for the steps that follow.
data = data_ori[features]
In [3]:
# Visualise how the hero attributes relate to each other with a heatmap of
# the pairwise correlations between the columns listed in `features`.
#
# Correlation is only defined for numeric columns. Columns that still hold
# text at this point are dropped explicitly: older pandas releases skipped
# them silently, whereas pandas >= 2.0 raises when asked to correlate
# non-numeric data.
corr = data[features].select_dtypes(include='number').corr()

plt.figure(figsize=(14, 14))
# annot=True prints the correlation coefficient inside every cell.
sns.heatmap(corr, annot=True)
plt.show()
In [4]:
# Strongly correlated attributes are redundant, so only one attribute of each
# correlated group is kept; this reduces the dimensionality of the input.
features_remain = [
    u'最大生命', u'初始生命', u'最大法力',
    u'最高物攻', u'初始物攻',
    u'最大物防', u'初始物防',
    u'最大每5秒回血', u'最大每5秒回蓝', u'初始每5秒回蓝',
    u'最大攻速', u'攻击范围',
]

# Take an explicit copy: the column assignments below would otherwise write
# into a slice of data_ori and trigger pandas' SettingWithCopyWarning.
data = data_ori[features_remain].copy()

# The attack-speed column holds strings ending in '%'; strip the sign and
# convert the value to a fraction.
data[u'最大攻速'] = data[u'最大攻速'].apply(lambda x: float(x.strip('%')) / 100)
# Encode the attack-range label as a number: '远程' -> 1, '近战' -> 0.
data[u'攻击范围'] = data[u'攻击范围'].map({'远程': 1, '近战': 0})

# Z-score normalisation so every feature has mean 0 and variance 1.
ss = StandardScaler()
data = ss.fit_transform(data)

# Build the Gaussian mixture model and fit it on the normalised data.
gmm = GaussianMixture(n_components=30, covariance_type='full')
gmm.fit(data)

# Assign every hero to a mixture component and show the labels.
prediction = gmm.predict(data)
print(prediction)
[ 2 15 7 26 16 16 13 7 9 15 1 15 26 6 29 1 28 9 3 10 19 12 10 12 12 12 10 4 11 0 6 0 0 25 0 0 11 5 24 18 18 11 25 0 25 18 15 24 15 25 27 8 3 23 23 2 22 15 20 16 22 3 17 2 14 8 8 21 4]
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy after removing the cwd from sys.path. /opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """
In [5]:
# Writing the grouping result to a CSV file is left disabled:
# data_ori.insert(0, '分组', prediction)
# data_ori.to_csv('./hero_out.csv', index=False, sep=',')

# Print the Calinski-Harabasz score of the clustering.
# The metric was originally published under the misspelled name
# `calinski_harabaz_score`, which is deprecated and removed from
# scikit-learn 0.23 onwards. Prefer the corrected name and fall back to the
# old one only on releases that do not provide it yet.
try:
    from sklearn.metrics import calinski_harabasz_score
except ImportError:
    from sklearn.metrics import (
        calinski_harabaz_score as calinski_harabasz_score,
    )

print(calinski_harabasz_score(data, prediction))
21.5401587794009
/opt/conda/lib/python3.6/site-packages/sklearn/utils/deprecation.py:85: DeprecationWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23. warnings.warn(msg, category=DeprecationWarning)