diff --git a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb index 54946b6..461a327 100644 --- a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb +++ b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb @@ -158,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -187,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -216,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -239,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -251,7 +251,7 @@ " dtype='\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
beautifulbluebrowndogfoxjumpslayzlazylovequickskytoday
0110000000010
1110000001010
2001111010100
3011110010100
4110000000021
5001110100100
\n", + "" + ], + "text/plain": [ + " beautiful blue brown dog fox jumps layz lazy love quick sky \\\n", + "0 1 1 0 0 0 0 0 0 0 0 1 \n", + "1 1 1 0 0 0 0 0 0 1 0 1 \n", + "2 0 0 1 1 1 1 0 1 0 1 0 \n", + "3 0 1 1 1 1 0 0 1 0 1 0 \n", + "4 1 1 0 0 0 0 0 0 0 0 2 \n", + "5 0 0 1 1 1 0 1 0 0 1 0 \n", + "\n", + " today \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 1 \n", + "5 0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocab = cv.get_feature_names()\n", + "pd.DataFrame(cv_matrix, columns=vocab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "缺点:只考虑词频,没有考虑到前后逻辑" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## N-Grams模型\n", + "一种语言模型(Language Model,LM),语言模型是一个基于概率的判别模型,它的输入是一句话(单词的顺序序列),输出是这句话的概率,即这些单词的联合概率(joint probability)。" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
beautiful skybeautiful todayblue beautifulblue dogblue skybrown foxdog layzdog lazyfox jumpsfox quickjumps lazylayz brownlazy doglove bluequick bluequick brownsky beautifulsky blue
0001000000000000001
1101000000000010000
2000001001010100100
3000101010100001000
4010010000000000011
5000001100101000000
\n", + "
" + ], + "text/plain": [ + " beautiful sky beautiful today blue beautiful blue dog blue sky \\\n", + "0 0 0 1 0 0 \n", + "1 1 0 1 0 0 \n", + "2 0 0 0 0 0 \n", + "3 0 0 0 1 0 \n", + "4 0 1 0 0 1 \n", + "5 0 0 0 0 0 \n", + "\n", + " brown fox dog layz dog lazy fox jumps fox quick jumps lazy \\\n", + "0 0 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 \n", + "2 1 0 0 1 0 1 \n", + "3 1 0 1 0 1 0 \n", + "4 0 0 0 0 0 0 \n", + "5 1 1 0 0 1 0 \n", + "\n", + " layz brown lazy dog love blue quick blue quick brown sky beautiful \\\n", + "0 0 0 0 0 0 0 \n", + "1 0 0 1 0 0 0 \n", + "2 0 1 0 0 1 0 \n", + "3 0 0 0 1 0 0 \n", + "4 0 0 0 0 0 1 \n", + "5 1 0 0 0 0 0 \n", + "\n", + " sky blue \n", + "0 1 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 1 \n", + "5 0 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bv = CountVectorizer(ngram_range=(2,2)) # ngram_range关注两个词的关系\n", + "bv_matrix = bv.fit_transform(norm_corpus)\n", + "bv_matrix = bv_matrix.toarray()\n", + "vocab = bv.get_feature_names()\n", + "pd.DataFrame(bv_matrix, columns=vocab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "关注相邻两个词的组合(bigram),如上面的beautiful sky就是两个词的组合。\n", + "\n", + "第0行(即第一句话)的beautiful sky列为0,因为在第一句话中,这两个词没有按beautiful、sky的先后顺序相邻出现。\n", + "\n", + "缺点:矩阵过大,且矩阵过于稀疏" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TF-IDF模型\n", + "TF是词频(Term Frequency),IDF是逆文本频率指数(Inverse Document Frequency)。\n", + "\n", + "即:如果词w在一篇文档d中出现的频率高,并且在其他文档中很少出现,则认为词w具有很好的区分能力,适合用来把文章d和其他文章区分开来。" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
beautifulbluebrowndogfoxjumpslayzlazylovequickskytoday
00.600.520.000.000.000.000.000.000.000.000.600.00
10.460.390.000.000.000.000.000.000.660.000.460.00
20.000.000.370.370.370.530.000.430.000.370.000.00
30.000.350.400.400.400.000.000.480.000.400.000.00
40.360.310.000.000.000.000.000.000.000.000.720.52
50.000.000.410.410.410.000.590.000.000.410.000.00
\n", + "
" + ], + "text/plain": [ + " beautiful blue brown dog fox jumps layz lazy love quick sky \\\n", + "0 0.60 0.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 \n", + "1 0.46 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.66 0.00 0.46 \n", + "2 0.00 0.00 0.37 0.37 0.37 0.53 0.00 0.43 0.00 0.37 0.00 \n", + "3 0.00 0.35 0.40 0.40 0.40 0.00 0.00 0.48 0.00 0.40 0.00 \n", + "4 0.36 0.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 \n", + "5 0.00 0.00 0.41 0.41 0.41 0.00 0.59 0.00 0.00 0.41 0.00 \n", + "\n", + " today \n", + "0 0.00 \n", + "1 0.00 \n", + "2 0.00 \n", + "3 0.00 \n", + "4 0.52 \n", + "5 0.00 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n", + "tv_matrix = tv.fit_transform(norm_corpus)\n", + "tv_matrix = tv_matrix.toarray()\n", + "\n", + "vocab = tv.get_feature_names()\n", + "pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Similarity特征\n", + "\n", + "统计文章的相似性" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345
01.0000000.7531280.0000000.1792560.8075390.000000
10.7531281.0000000.0000000.1350030.6081810.000000
20.0000000.0000001.0000000.7969320.0000000.592459
30.1792560.1350030.7969321.0000000.1059920.654475
40.8075390.6081810.0000000.1059921.0000000.000000
50.0000000.0000000.5924590.6544750.0000001.000000
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5\n", + "0 1.000000 0.753128 0.000000 0.179256 0.807539 0.000000\n", + "1 0.753128 1.000000 0.000000 0.135003 0.608181 0.000000\n", + "2 0.000000 0.000000 1.000000 0.796932 0.000000 0.592459\n", + "3 0.179256 0.135003 0.796932 1.000000 0.105992 0.654475\n", + "4 0.807539 0.608181 0.000000 0.105992 1.000000 0.000000\n", + "5 0.000000 0.000000 0.592459 0.654475 0.000000 1.000000" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "similarity_matrix = cosine_similarity(tv_matrix)\n", + "similarity_df = pd.DataFrame(similarity_matrix)\n", + "similarity_df" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/文本特征处理.ipynb b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/文本特征处理.ipynb index d722bb9..461a327 100644 --- a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/文本特征处理.ipynb +++ b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/文本特征处理.ipynb @@ -158,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -187,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -216,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -239,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -251,7 +251,7 @@ " dtype='\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
beautiful skybeautiful todayblue beautifulblue dogblue skybrown foxdog layzdog lazyfox jumpsfox quickjumps lazylayz brownlazy doglove bluequick bluequick brownsky beautifulsky blue
0001000000000000001
1101000000000010000
2000001001010100100
3000101010100001000
4010010000000000011
5000001100101000000
\n", + "" + ], + "text/plain": [ + " beautiful sky beautiful today blue beautiful blue dog blue sky \\\n", + "0 0 0 1 0 0 \n", + "1 1 0 1 0 0 \n", + "2 0 0 0 0 0 \n", + "3 0 0 0 1 0 \n", + "4 0 1 0 0 1 \n", + "5 0 0 0 0 0 \n", + "\n", + " brown fox dog layz dog lazy fox jumps fox quick jumps lazy \\\n", + "0 0 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 \n", + "2 1 0 0 1 0 1 \n", + "3 1 0 1 0 1 0 \n", + "4 0 0 0 0 0 0 \n", + "5 1 1 0 0 1 0 \n", + "\n", + " layz brown lazy dog love blue quick blue quick brown sky beautiful \\\n", + "0 0 0 0 0 0 0 \n", + "1 0 0 1 0 0 0 \n", + "2 0 1 0 0 1 0 \n", + "3 0 0 0 1 0 0 \n", + "4 0 0 0 0 0 1 \n", + "5 1 0 0 0 0 0 \n", + "\n", + " sky blue \n", + "0 1 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 1 \n", + "5 0 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bv = CountVectorizer(ngram_range=(2,2)) # ngram_range关注两个词的关系\n", + "bv_matrix = bv.fit_transform(norm_corpus)\n", + "bv_matrix = bv_matrix.toarray()\n", + "vocab = bv.get_feature_names()\n", + "pd.DataFrame(bv_matrix, columns=vocab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "关注相邻两个词的组合(bigram),如上面的beautiful sky就是两个词的组合。\n", + "\n", + "第0行(即第一句话)的beautiful sky列为0,因为在第一句话中,这两个词没有按beautiful、sky的先后顺序相邻出现。\n", + "\n", + "缺点:矩阵过大,且矩阵过于稀疏" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TF-IDF模型\n", + "TF是词频(Term Frequency),IDF是逆文本频率指数(Inverse Document Frequency)。\n", + "\n", + "即:如果词w在一篇文档d中出现的频率高,并且在其他文档中很少出现,则认为词w具有很好的区分能力,适合用来把文章d和其他文章区分开来。" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
beautifulbluebrowndogfoxjumpslayzlazylovequickskytoday
00.600.520.000.000.000.000.000.000.000.000.600.00
10.460.390.000.000.000.000.000.000.660.000.460.00
20.000.000.370.370.370.530.000.430.000.370.000.00
30.000.350.400.400.400.000.000.480.000.400.000.00
40.360.310.000.000.000.000.000.000.000.000.720.52
50.000.000.410.410.410.000.590.000.000.410.000.00
\n", + "
" + ], + "text/plain": [ + " beautiful blue brown dog fox jumps layz lazy love quick sky \\\n", + "0 0.60 0.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 \n", + "1 0.46 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.66 0.00 0.46 \n", + "2 0.00 0.00 0.37 0.37 0.37 0.53 0.00 0.43 0.00 0.37 0.00 \n", + "3 0.00 0.35 0.40 0.40 0.40 0.00 0.00 0.48 0.00 0.40 0.00 \n", + "4 0.36 0.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 \n", + "5 0.00 0.00 0.41 0.41 0.41 0.00 0.59 0.00 0.00 0.41 0.00 \n", + "\n", + " today \n", + "0 0.00 \n", + "1 0.00 \n", + "2 0.00 \n", + "3 0.00 \n", + "4 0.52 \n", + "5 0.00 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n", + "tv_matrix = tv.fit_transform(norm_corpus)\n", + "tv_matrix = tv_matrix.toarray()\n", + "\n", + "vocab = tv.get_feature_names()\n", + "pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Similarity特征\n", + "\n", + "统计文章的相似性" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345
01.0000000.7531280.0000000.1792560.8075390.000000
10.7531281.0000000.0000000.1350030.6081810.000000
20.0000000.0000001.0000000.7969320.0000000.592459
30.1792560.1350030.7969321.0000000.1059920.654475
40.8075390.6081810.0000000.1059921.0000000.000000
50.0000000.0000000.5924590.6544750.0000001.000000
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5\n", + "0 1.000000 0.753128 0.000000 0.179256 0.807539 0.000000\n", + "1 0.753128 1.000000 0.000000 0.135003 0.608181 0.000000\n", + "2 0.000000 0.000000 1.000000 0.796932 0.000000 0.592459\n", + "3 0.179256 0.135003 0.796932 1.000000 0.105992 0.654475\n", + "4 0.807539 0.608181 0.000000 0.105992 1.000000 0.000000\n", + "5 0.000000 0.000000 0.592459 0.654475 0.000000 1.000000" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "similarity_matrix = cosine_similarity(tv_matrix)\n", + "similarity_df = pd.DataFrame(similarity_matrix)\n", + "similarity_df" ] }, {