{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Fetch the seeds measurements CSV (the \"less rows\" variant, per its file name) from GitHub.\n",
"seeds_df = pd.read_csv(\n",
"    \"https://raw.githubusercontent.com/vihar/unsupervised-learning-with-python/master/seeds-less-rows.csv\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" area | \n",
" perimeter | \n",
" compactness | \n",
" length | \n",
" width | \n",
" asymmetry_coefficient | \n",
" groove_length | \n",
" grain_variety | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 14.88 | \n",
" 14.57 | \n",
" 0.8811 | \n",
" 5.554 | \n",
" 3.333 | \n",
" 1.018 | \n",
" 4.956 | \n",
" Kama wheat | \n",
"
\n",
" \n",
" 1 | \n",
" 14.69 | \n",
" 14.49 | \n",
" 0.8799 | \n",
" 5.563 | \n",
" 3.259 | \n",
" 3.586 | \n",
" 5.219 | \n",
" Kama wheat | \n",
"
\n",
" \n",
" 2 | \n",
" 14.03 | \n",
" 14.16 | \n",
" 0.8796 | \n",
" 5.438 | \n",
" 3.201 | \n",
" 1.717 | \n",
" 5.001 | \n",
" Kama wheat | \n",
"
\n",
" \n",
" 3 | \n",
" 13.99 | \n",
" 13.83 | \n",
" 0.9183 | \n",
" 5.119 | \n",
" 3.383 | \n",
" 5.234 | \n",
" 4.781 | \n",
" Kama wheat | \n",
"
\n",
" \n",
" 4 | \n",
" 14.11 | \n",
" 14.26 | \n",
" 0.8722 | \n",
" 5.520 | \n",
" 3.168 | \n",
" 2.688 | \n",
" 5.219 | \n",
" Kama wheat | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" area perimeter compactness length width asymmetry_coefficient \\\n",
"0 14.88 14.57 0.8811 5.554 3.333 1.018 \n",
"1 14.69 14.49 0.8799 5.563 3.259 3.586 \n",
"2 14.03 14.16 0.8796 5.438 3.201 1.717 \n",
"3 13.99 13.83 0.9183 5.119 3.383 5.234 \n",
"4 14.11 14.26 0.8722 5.520 3.168 2.688 \n",
"\n",
" groove_length grain_variety \n",
"0 4.956 Kama wheat \n",
"1 5.219 Kama wheat \n",
"2 5.001 Kama wheat \n",
"3 4.781 Kama wheat \n",
"4 5.219 Kama wheat "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows (the default for head(), spelled out here).\n",
"seeds_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Rosa wheat 14\n",
"Kama wheat 14\n",
"Canadian wheat 14\n",
"Name: grain_variety, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# grain_variety is the label column; count the samples per class\n",
"# (the saved output shows three varieties).\n",
"seeds_df['grain_variety'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Split the ground-truth label off the feature table: leaving it in the\n",
"# features would turn the clustering below into a supervised problem.\n",
"# pop() removes the column from seeds_df in place, so guard the call: without\n",
"# the check, running this cell a second time raises KeyError because the\n",
"# column is already gone. On such a re-run the existing `varieties` is kept.\n",
"if 'grain_variety' in seeds_df.columns:\n",
"    varieties = list(seeds_df.pop('grain_variety'))\n",
"\n",
"# Remaining columns as a 2-D array, one row per sample.\n",
"samples = seeds_df.values"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[14.88 , 14.57 , 0.8811, 5.554 , 3.333 , 1.018 , 4.956 ],\n",
" [14.69 , 14.49 , 0.8799, 5.563 , 3.259 , 3.586 , 5.219 ],\n",
" [14.03 , 14.16 , 0.8796, 5.438 , 3.201 , 1.717 , 5.001 ],\n",
" [13.99 , 13.83 , 0.9183, 5.119 , 3.383 , 5.234 , 4.781 ],\n",
" [14.11 , 14.26 , 0.8722, 5.52 , 3.168 , 2.688 , 5.219 ],\n",
" [13.02 , 13.76 , 0.8641, 5.395 , 3.026 , 3.373 , 4.825 ],\n",
" [15.49 , 14.94 , 0.8724, 5.757 , 3.371 , 3.412 , 5.228 ],\n",
" [16.2 , 15.27 , 0.8734, 5.826 , 3.464 , 2.823 , 5.527 ],\n",
" [13.5 , 13.85 , 0.8852, 5.351 , 3.158 , 2.249 , 5.176 ],\n",
" [15.36 , 14.76 , 0.8861, 5.701 , 3.393 , 1.367 , 5.132 ],\n",
" [15.78 , 14.91 , 0.8923, 5.674 , 3.434 , 5.593 , 5.136 ],\n",
" [14.46 , 14.35 , 0.8818, 5.388 , 3.377 , 2.802 , 5.044 ],\n",
" [11.23 , 12.63 , 0.884 , 4.902 , 2.879 , 2.269 , 4.703 ],\n",
" [14.34 , 14.37 , 0.8726, 5.63 , 3.19 , 1.313 , 5.15 ],\n",
" [16.84 , 15.67 , 0.8623, 5.998 , 3.484 , 4.675 , 5.877 ],\n",
" [17.32 , 15.91 , 0.8599, 6.064 , 3.403 , 3.824 , 5.922 ],\n",
" [18.72 , 16.19 , 0.8977, 6.006 , 3.857 , 5.324 , 5.879 ],\n",
" [18.88 , 16.26 , 0.8969, 6.084 , 3.764 , 1.649 , 6.109 ],\n",
" [18.76 , 16.2 , 0.8984, 6.172 , 3.796 , 3.12 , 6.053 ],\n",
" [19.31 , 16.59 , 0.8815, 6.341 , 3.81 , 3.477 , 6.238 ],\n",
" [17.99 , 15.86 , 0.8992, 5.89 , 3.694 , 2.068 , 5.837 ],\n",
" [18.85 , 16.17 , 0.9056, 6.152 , 3.806 , 2.843 , 6.2 ],\n",
" [19.38 , 16.72 , 0.8716, 6.303 , 3.791 , 3.678 , 5.965 ],\n",
" [18.96 , 16.2 , 0.9077, 6.051 , 3.897 , 4.334 , 5.75 ],\n",
" [18.14 , 16.12 , 0.8772, 6.059 , 3.563 , 3.619 , 6.011 ],\n",
" [18.65 , 16.41 , 0.8698, 6.285 , 3.594 , 4.391 , 6.102 ],\n",
" [18.94 , 16.32 , 0.8942, 6.144 , 3.825 , 2.908 , 5.949 ],\n",
" [17.36 , 15.76 , 0.8785, 6.145 , 3.574 , 3.526 , 5.971 ],\n",
" [13.32 , 13.94 , 0.8613, 5.541 , 3.073 , 7.035 , 5.44 ],\n",
" [11.43 , 13.13 , 0.8335, 5.176 , 2.719 , 2.221 , 5.132 ],\n",
" [12.01 , 13.52 , 0.8249, 5.405 , 2.776 , 6.992 , 5.27 ],\n",
" [11.34 , 12.87 , 0.8596, 5.053 , 2.849 , 3.347 , 5.003 ],\n",
" [12.02 , 13.33 , 0.8503, 5.35 , 2.81 , 4.271 , 5.308 ],\n",
" [12.44 , 13.59 , 0.8462, 5.319 , 2.897 , 4.924 , 5.27 ],\n",
" [11.55 , 13.1 , 0.8455, 5.167 , 2.845 , 6.715 , 4.956 ],\n",
" [11.26 , 13.01 , 0.8355, 5.186 , 2.71 , 5.335 , 5.092 ],\n",
" [12.46 , 13.41 , 0.8706, 5.236 , 3.017 , 4.987 , 5.147 ],\n",
" [11.81 , 13.45 , 0.8198, 5.413 , 2.716 , 4.898 , 5.352 ],\n",
" [11.27 , 12.86 , 0.8563, 5.091 , 2.804 , 3.985 , 5.001 ],\n",
" [12.79 , 13.53 , 0.8786, 5.224 , 3.054 , 5.483 , 4.958 ],\n",
" [12.67 , 13.32 , 0.8977, 4.984 , 3.135 , 2.3 , 4.745 ],\n",
" [11.23 , 12.88 , 0.8511, 5.14 , 2.795 , 4.325 , 5.003 ]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# 距离计算\n",
"from scipy.cluster.hierarchy import linkage, dendrogram\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Hierarchical (agglomerative) clustering of the samples with complete linkage;\n",
"# metric='euclidean' is SciPy's default, written out for clarity.\n",
"mergings = linkage(samples, method='complete', metric='euclidean')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Draw the dendrogram of the merge history.\n",
"fig = plt.figure(figsize=(10, 6))\n",
"dendrogram(\n",
"    mergings,\n",
"    leaf_font_size=6,\n",
"    leaf_rotation=90,\n",
"    labels=varieties,  # true variety names label the leaves on the x-axis\n",
")\n",
"plt.show()  # read the tree from the top down"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" varieties | \n",
" Canadian wheat | \n",
" Kama wheat | \n",
" Rosa wheat | \n",
"
\n",
" \n",
" labels | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 14 | \n",
" 3 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 14 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 11 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"varieties Canadian wheat Kama wheat Rosa wheat\n",
"labels \n",
"1 14 3 0\n",
"2 0 0 14\n",
"3 0 11 0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Turn the hierarchy into flat cluster labels.\n",
"from scipy.cluster.hierarchy import fcluster\n",
"\n",
"# Cut the tree at height 6 (criterion='distance'); the crosstab below shows\n",
"# that this cut produces exactly three cluster labels.\n",
"labels = fcluster(mergings, t=6, criterion='distance')\n",
"\n",
"# Cross-tabulate the cluster labels against the true varieties.\n",
"df = pd.DataFrame(dict(labels=labels, varieties=varieties))\n",
"ct = pd.crosstab(index=df['labels'], columns=df['varieties'])\n",
"# Saved result: cluster 1 also absorbs 3 Kama wheat samples; the other two clusters are pure.\n",
"ct"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 不同距离的选择产生不同的结果\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Load the Eurovision 2016 voting table; every missing value becomes 0.\n",
"df = pd.read_csv(\n",
"    'https://s3.amazonaws.com/assets.datacamp.com/production/course_2072/datasets/eurovision-2016.csv'\n",
").fillna(0)\n",
"\n",
"# Pivot into a voter-by-recipient matrix of televote points. From/To pairs with\n",
"# no row come out as NaN and are set to 12, which the original note gives as\n",
"# the maximum score (presumably a country's own column -- verify).\n",
"scores = pd.crosstab(\n",
"    index=df['From country'],\n",
"    columns=df['To country'],\n",
"    values=df['Televote Points'],\n",
"    aggfunc='first',\n",
").fillna(12)\n",
"\n",
"samples = scores.values\n",
"country_names = scores.index.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Rescale every row (one voting country) to unit L2 norm, so rows are compared\n",
"# by the shape of their voting pattern rather than by overall magnitude.\n",
"# Note: this is NOT min-max scaling of each feature to [0, 1]; the entries land\n",
"# in [0, 1] here only because the scores are non-negative.\n",
"\n",
"from sklearn.preprocessing import normalize\n",
"samples = normalize(samples, norm='l2', axis=1)  # explicit defaults: per-row L2 norm"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.09449112, 0.56694671, 0. , ..., 0. , 0.28347335,\n",
" 0. ],\n",
" [0.49319696, 0. , 0.16439899, ..., 0. , 0.41099747,\n",
" 0. ],\n",
" [0. , 0.49319696, 0.12329924, ..., 0. , 0.32879797,\n",
" 0.16439899],\n",
" ...,\n",
" [0.32879797, 0.20549873, 0.24659848, ..., 0.49319696, 0.28769823,\n",
" 0. ],\n",
" [0.28769823, 0.16439899, 0. , ..., 0. , 0.49319696,\n",
" 0. ],\n",
" [0. , 0.24659848, 0. , ..., 0. , 0.20549873,\n",
" 0.49319696]])"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"