|
|
@ -158,7 +158,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 5,
|
|
|
|
"execution_count": 4,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
{
|
|
|
@ -187,7 +187,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 10,
|
|
|
|
"execution_count": 5,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
{
|
|
|
@ -216,7 +216,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 11,
|
|
|
|
"execution_count": 6,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
@ -239,7 +239,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 8,
|
|
|
|
"execution_count": 7,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
{
|
|
|
@ -251,7 +251,7 @@
|
|
|
|
" dtype='<U30')"
|
|
|
|
" dtype='<U30')"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"execution_count": 8,
|
|
|
|
"execution_count": 7,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -271,7 +271,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 12,
|
|
|
|
"execution_count": 8,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
{
|
|
|
@ -295,7 +295,7 @@
|
|
|
|
" [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)"
|
|
|
|
" [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"execution_count": 12,
|
|
|
|
"execution_count": 8,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -324,6 +324,722 @@
|
|
|
|
"如:第一句话'sky blue beautiful',在词汇表有第一个词一次,第二个词一次,导数第二个词一次,那么下面的向量表则是[1,1,...,1,0]"
|
|
|
|
"如:第一句话'sky blue beautiful',在词汇表有第一个词一次,第二个词一次,导数第二个词一次,那么下面的向量表则是[1,1,...,1,0]"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 9,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
|
|
"text/html": [
|
|
|
|
|
|
|
|
"<div>\n",
|
|
|
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"</style>\n",
|
|
|
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
|
|
|
" <th>beautiful</th>\n",
|
|
|
|
|
|
|
|
" <th>blue</th>\n",
|
|
|
|
|
|
|
|
" <th>brown</th>\n",
|
|
|
|
|
|
|
|
" <th>dog</th>\n",
|
|
|
|
|
|
|
|
" <th>fox</th>\n",
|
|
|
|
|
|
|
|
" <th>jumps</th>\n",
|
|
|
|
|
|
|
|
" <th>layz</th>\n",
|
|
|
|
|
|
|
|
" <th>lazy</th>\n",
|
|
|
|
|
|
|
|
" <th>love</th>\n",
|
|
|
|
|
|
|
|
" <th>quick</th>\n",
|
|
|
|
|
|
|
|
" <th>sky</th>\n",
|
|
|
|
|
|
|
|
" <th>today</th>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>5</th>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
|
|
|
"</table>\n",
|
|
|
|
|
|
|
|
"</div>"
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"text/plain": [
|
|
|
|
|
|
|
|
" beautiful blue brown dog fox jumps layz lazy love quick sky \\\n",
|
|
|
|
|
|
|
|
"0 1 1 0 0 0 0 0 0 0 0 1 \n",
|
|
|
|
|
|
|
|
"1 1 1 0 0 0 0 0 0 1 0 1 \n",
|
|
|
|
|
|
|
|
"2 0 0 1 1 1 1 0 1 0 1 0 \n",
|
|
|
|
|
|
|
|
"3 0 1 1 1 1 0 0 1 0 1 0 \n",
|
|
|
|
|
|
|
|
"4 1 1 0 0 0 0 0 0 0 0 2 \n",
|
|
|
|
|
|
|
|
"5 0 0 1 1 1 0 1 0 0 1 0 \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" today \n",
|
|
|
|
|
|
|
|
"0 0 \n",
|
|
|
|
|
|
|
|
"1 0 \n",
|
|
|
|
|
|
|
|
"2 0 \n",
|
|
|
|
|
|
|
|
"3 0 \n",
|
|
|
|
|
|
|
|
"4 1 \n",
|
|
|
|
|
|
|
|
"5 0 "
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"execution_count": 9,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"vocab = cv.get_feature_names()\n",
|
|
|
|
|
|
|
|
"pd.DataFrame(cv_matrix, columns=vocab)"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"缺点:只考虑词频,没有考虑到前后逻辑"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"## N-Grams模型\n",
|
|
|
|
|
|
|
|
"一种语言模型(Language Model,LM),语言模型是一个基于概率的判别模型,它的输入是一句话(单词的顺序序列),输出是这句话的概率,即这些单词的联合概率(joint probability)。"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 11,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
|
|
"text/html": [
|
|
|
|
|
|
|
|
"<div>\n",
|
|
|
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"</style>\n",
|
|
|
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
|
|
|
" <th>beautiful sky</th>\n",
|
|
|
|
|
|
|
|
" <th>beautiful today</th>\n",
|
|
|
|
|
|
|
|
" <th>blue beautiful</th>\n",
|
|
|
|
|
|
|
|
" <th>blue dog</th>\n",
|
|
|
|
|
|
|
|
" <th>blue sky</th>\n",
|
|
|
|
|
|
|
|
" <th>brown fox</th>\n",
|
|
|
|
|
|
|
|
" <th>dog layz</th>\n",
|
|
|
|
|
|
|
|
" <th>dog lazy</th>\n",
|
|
|
|
|
|
|
|
" <th>fox jumps</th>\n",
|
|
|
|
|
|
|
|
" <th>fox quick</th>\n",
|
|
|
|
|
|
|
|
" <th>jumps lazy</th>\n",
|
|
|
|
|
|
|
|
" <th>layz brown</th>\n",
|
|
|
|
|
|
|
|
" <th>lazy dog</th>\n",
|
|
|
|
|
|
|
|
" <th>love blue</th>\n",
|
|
|
|
|
|
|
|
" <th>quick blue</th>\n",
|
|
|
|
|
|
|
|
" <th>quick brown</th>\n",
|
|
|
|
|
|
|
|
" <th>sky beautiful</th>\n",
|
|
|
|
|
|
|
|
" <th>sky blue</th>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>5</th>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
|
|
|
"</table>\n",
|
|
|
|
|
|
|
|
"</div>"
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"text/plain": [
|
|
|
|
|
|
|
|
" beautiful sky beautiful today blue beautiful blue dog blue sky \\\n",
|
|
|
|
|
|
|
|
"0 0 0 1 0 0 \n",
|
|
|
|
|
|
|
|
"1 1 0 1 0 0 \n",
|
|
|
|
|
|
|
|
"2 0 0 0 0 0 \n",
|
|
|
|
|
|
|
|
"3 0 0 0 1 0 \n",
|
|
|
|
|
|
|
|
"4 0 1 0 0 1 \n",
|
|
|
|
|
|
|
|
"5 0 0 0 0 0 \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" brown fox dog layz dog lazy fox jumps fox quick jumps lazy \\\n",
|
|
|
|
|
|
|
|
"0 0 0 0 0 0 0 \n",
|
|
|
|
|
|
|
|
"1 0 0 0 0 0 0 \n",
|
|
|
|
|
|
|
|
"2 1 0 0 1 0 1 \n",
|
|
|
|
|
|
|
|
"3 1 0 1 0 1 0 \n",
|
|
|
|
|
|
|
|
"4 0 0 0 0 0 0 \n",
|
|
|
|
|
|
|
|
"5 1 1 0 0 1 0 \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" layz brown lazy dog love blue quick blue quick brown sky beautiful \\\n",
|
|
|
|
|
|
|
|
"0 0 0 0 0 0 0 \n",
|
|
|
|
|
|
|
|
"1 0 0 1 0 0 0 \n",
|
|
|
|
|
|
|
|
"2 0 1 0 0 1 0 \n",
|
|
|
|
|
|
|
|
"3 0 0 0 1 0 0 \n",
|
|
|
|
|
|
|
|
"4 0 0 0 0 0 1 \n",
|
|
|
|
|
|
|
|
"5 1 0 0 0 0 0 \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" sky blue \n",
|
|
|
|
|
|
|
|
"0 1 \n",
|
|
|
|
|
|
|
|
"1 0 \n",
|
|
|
|
|
|
|
|
"2 0 \n",
|
|
|
|
|
|
|
|
"3 0 \n",
|
|
|
|
|
|
|
|
"4 1 \n",
|
|
|
|
|
|
|
|
"5 0 "
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"execution_count": 11,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"bv = CountVectorizer(ngram_range=(2,2)) # ngram_range关注两个词的关系\n",
|
|
|
|
|
|
|
|
"bv_matrix = bv.fit_transform(norm_corpus)\n",
|
|
|
|
|
|
|
|
"bv_matrix = bv_matrix.toarray()\n",
|
|
|
|
|
|
|
|
"vocab = bv.get_feature_names()\n",
|
|
|
|
|
|
|
|
"pd.DataFrame(bv_matrix, columns=vocab)"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"关注两个两个词的组合,如上面的beautiful sky就是两个词的组合。\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"第0列的beautiful sky为0,因为上面第一句话中,两个词不是前后关系。\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"缺点:矩阵过大,且矩阵过于稀疏"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"## TF-IDF模型\n",
|
|
|
|
|
|
|
|
"TF是词频(Term Frequency),IDF是逆文本频率指数(Inverse Document Frequency)。\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"即:如果词w在一篇文档d中出现的频率高,并且在其他文档中很少出现,则认为词w具有很好的区分能力,适合用来把文章d和其他文章区分开来。"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
|
|
"text/html": [
|
|
|
|
|
|
|
|
"<div>\n",
|
|
|
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"</style>\n",
|
|
|
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
|
|
|
" <th>beautiful</th>\n",
|
|
|
|
|
|
|
|
" <th>blue</th>\n",
|
|
|
|
|
|
|
|
" <th>brown</th>\n",
|
|
|
|
|
|
|
|
" <th>dog</th>\n",
|
|
|
|
|
|
|
|
" <th>fox</th>\n",
|
|
|
|
|
|
|
|
" <th>jumps</th>\n",
|
|
|
|
|
|
|
|
" <th>layz</th>\n",
|
|
|
|
|
|
|
|
" <th>lazy</th>\n",
|
|
|
|
|
|
|
|
" <th>love</th>\n",
|
|
|
|
|
|
|
|
" <th>quick</th>\n",
|
|
|
|
|
|
|
|
" <th>sky</th>\n",
|
|
|
|
|
|
|
|
" <th>today</th>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
|
|
|
" <td>0.60</td>\n",
|
|
|
|
|
|
|
|
" <td>0.52</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.60</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
|
|
|
" <td>0.46</td>\n",
|
|
|
|
|
|
|
|
" <td>0.39</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.66</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.46</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.37</td>\n",
|
|
|
|
|
|
|
|
" <td>0.37</td>\n",
|
|
|
|
|
|
|
|
" <td>0.37</td>\n",
|
|
|
|
|
|
|
|
" <td>0.53</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.43</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.37</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.35</td>\n",
|
|
|
|
|
|
|
|
" <td>0.40</td>\n",
|
|
|
|
|
|
|
|
" <td>0.40</td>\n",
|
|
|
|
|
|
|
|
" <td>0.40</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.48</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.40</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
|
|
|
" <td>0.36</td>\n",
|
|
|
|
|
|
|
|
" <td>0.31</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.72</td>\n",
|
|
|
|
|
|
|
|
" <td>0.52</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>5</th>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.41</td>\n",
|
|
|
|
|
|
|
|
" <td>0.41</td>\n",
|
|
|
|
|
|
|
|
" <td>0.41</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.59</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.41</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" <td>0.00</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
|
|
|
"</table>\n",
|
|
|
|
|
|
|
|
"</div>"
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"text/plain": [
|
|
|
|
|
|
|
|
" beautiful blue brown dog fox jumps layz lazy love quick sky \\\n",
|
|
|
|
|
|
|
|
"0 0.60 0.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 \n",
|
|
|
|
|
|
|
|
"1 0.46 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.66 0.00 0.46 \n",
|
|
|
|
|
|
|
|
"2 0.00 0.00 0.37 0.37 0.37 0.53 0.00 0.43 0.00 0.37 0.00 \n",
|
|
|
|
|
|
|
|
"3 0.00 0.35 0.40 0.40 0.40 0.00 0.00 0.48 0.00 0.40 0.00 \n",
|
|
|
|
|
|
|
|
"4 0.36 0.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 \n",
|
|
|
|
|
|
|
|
"5 0.00 0.00 0.41 0.41 0.41 0.00 0.59 0.00 0.00 0.41 0.00 \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" today \n",
|
|
|
|
|
|
|
|
"0 0.00 \n",
|
|
|
|
|
|
|
|
"1 0.00 \n",
|
|
|
|
|
|
|
|
"2 0.00 \n",
|
|
|
|
|
|
|
|
"3 0.00 \n",
|
|
|
|
|
|
|
|
"4 0.52 \n",
|
|
|
|
|
|
|
|
"5 0.00 "
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
|
|
|
|
|
|
"tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n",
|
|
|
|
|
|
|
|
"tv_matrix = tv.fit_transform(norm_corpus)\n",
|
|
|
|
|
|
|
|
"tv_matrix = tv_matrix.toarray()\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"vocab = tv.get_feature_names()\n",
|
|
|
|
|
|
|
|
"pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"## Similarity特征\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"统计文章的相似性"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 13,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
|
|
"text/html": [
|
|
|
|
|
|
|
|
"<div>\n",
|
|
|
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"</style>\n",
|
|
|
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
|
|
|
" <th>5</th>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.753128</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.179256</td>\n",
|
|
|
|
|
|
|
|
" <td>0.807539</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
|
|
|
" <td>0.753128</td>\n",
|
|
|
|
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.135003</td>\n",
|
|
|
|
|
|
|
|
" <td>0.608181</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.796932</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.592459</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
|
|
|
" <td>0.179256</td>\n",
|
|
|
|
|
|
|
|
" <td>0.135003</td>\n",
|
|
|
|
|
|
|
|
" <td>0.796932</td>\n",
|
|
|
|
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.105992</td>\n",
|
|
|
|
|
|
|
|
" <td>0.654475</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
|
|
|
" <td>0.807539</td>\n",
|
|
|
|
|
|
|
|
" <td>0.608181</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.105992</td>\n",
|
|
|
|
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>5</th>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>0.592459</td>\n",
|
|
|
|
|
|
|
|
" <td>0.654475</td>\n",
|
|
|
|
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
|
|
|
"</table>\n",
|
|
|
|
|
|
|
|
"</div>"
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"text/plain": [
|
|
|
|
|
|
|
|
" 0 1 2 3 4 5\n",
|
|
|
|
|
|
|
|
"0 1.000000 0.753128 0.000000 0.179256 0.807539 0.000000\n",
|
|
|
|
|
|
|
|
"1 0.753128 1.000000 0.000000 0.135003 0.608181 0.000000\n",
|
|
|
|
|
|
|
|
"2 0.000000 0.000000 1.000000 0.796932 0.000000 0.592459\n",
|
|
|
|
|
|
|
|
"3 0.179256 0.135003 0.796932 1.000000 0.105992 0.654475\n",
|
|
|
|
|
|
|
|
"4 0.807539 0.608181 0.000000 0.105992 1.000000 0.000000\n",
|
|
|
|
|
|
|
|
"5 0.000000 0.000000 0.592459 0.654475 0.000000 1.000000"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"execution_count": 13,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"similarity_matrix = cosine_similarity(tv_matrix)\n",
|
|
|
|
|
|
|
|
"similarity_df = pd.DataFrame(similarity_matrix)\n",
|
|
|
|
|
|
|
|
"similarity_df"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": null,
|
|
|
|