Add 构建文本向量

pull/2/head
benjas 5 years ago
parent c42f9c8239
commit 75bb7bdd46

@ -158,7 +158,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -187,7 +187,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -216,7 +216,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -239,7 +239,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -251,7 +251,7 @@
" dtype='<U30')" " dtype='<U30')"
] ]
}, },
"execution_count": 8, "execution_count": 7,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -271,7 +271,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -295,7 +295,7 @@
" [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)" " [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)"
] ]
}, },
"execution_count": 12, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -324,6 +324,722 @@
"如:第一句话'sky blue beautiful',在词汇表有第一个词一次,第二个词一次,倒数第二个词一次,那么下面的向量表则是[1,1,...,1,0]" "如:第一句话'sky blue beautiful',在词汇表有第一个词一次,第二个词一次,倒数第二个词一次,那么下面的向量表则是[1,1,...,1,0]"
] ]
}, },
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>beautiful</th>\n",
" <th>blue</th>\n",
" <th>brown</th>\n",
" <th>dog</th>\n",
" <th>fox</th>\n",
" <th>jumps</th>\n",
" <th>layz</th>\n",
" <th>lazy</th>\n",
" <th>love</th>\n",
" <th>quick</th>\n",
" <th>sky</th>\n",
" <th>today</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" beautiful blue brown dog fox jumps layz lazy love quick sky \\\n",
"0 1 1 0 0 0 0 0 0 0 0 1 \n",
"1 1 1 0 0 0 0 0 0 1 0 1 \n",
"2 0 0 1 1 1 1 0 1 0 1 0 \n",
"3 0 1 1 1 1 0 0 1 0 1 0 \n",
"4 1 1 0 0 0 0 0 0 0 0 2 \n",
"5 0 0 1 1 1 0 1 0 0 1 0 \n",
"\n",
" today \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 1 \n",
"5 0 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab = cv.get_feature_names()\n",
"pd.DataFrame(cv_matrix, columns=vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"缺点:只考虑词频,没有考虑到前后逻辑"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## N-Grams模型\n",
"一种语言模型(Language Model,LM)。语言模型是一个基于概率的判别模型,它的输入是一句话(单词的顺序序列),输出是这句话的概率,即这些单词的联合概率(joint probability)。"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>beautiful sky</th>\n",
" <th>beautiful today</th>\n",
" <th>blue beautiful</th>\n",
" <th>blue dog</th>\n",
" <th>blue sky</th>\n",
" <th>brown fox</th>\n",
" <th>dog layz</th>\n",
" <th>dog lazy</th>\n",
" <th>fox jumps</th>\n",
" <th>fox quick</th>\n",
" <th>jumps lazy</th>\n",
" <th>layz brown</th>\n",
" <th>lazy dog</th>\n",
" <th>love blue</th>\n",
" <th>quick blue</th>\n",
" <th>quick brown</th>\n",
" <th>sky beautiful</th>\n",
" <th>sky blue</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" beautiful sky beautiful today blue beautiful blue dog blue sky \\\n",
"0 0 0 1 0 0 \n",
"1 1 0 1 0 0 \n",
"2 0 0 0 0 0 \n",
"3 0 0 0 1 0 \n",
"4 0 1 0 0 1 \n",
"5 0 0 0 0 0 \n",
"\n",
" brown fox dog layz dog lazy fox jumps fox quick jumps lazy \\\n",
"0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 \n",
"2 1 0 0 1 0 1 \n",
"3 1 0 1 0 1 0 \n",
"4 0 0 0 0 0 0 \n",
"5 1 1 0 0 1 0 \n",
"\n",
" layz brown lazy dog love blue quick blue quick brown sky beautiful \\\n",
"0 0 0 0 0 0 0 \n",
"1 0 0 1 0 0 0 \n",
"2 0 1 0 0 1 0 \n",
"3 0 0 0 1 0 0 \n",
"4 0 0 0 0 0 1 \n",
"5 1 0 0 0 0 0 \n",
"\n",
" sky blue \n",
"0 1 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 1 \n",
"5 0 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bv = CountVectorizer(ngram_range=(2,2)) # ngram_range关注两个词的关系\n",
"bv_matrix = bv.fit_transform(norm_corpus)\n",
"bv_matrix = bv_matrix.toarray()\n",
"vocab = bv.get_feature_names()\n",
"pd.DataFrame(bv_matrix, columns=vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"关注相邻两个词的组合,如上面的beautiful sky就是两个词的组合。\n",
"\n",
"第0行(第一句话)在beautiful sky这一列的值为0,因为上面第一句话中这两个词不是前后相邻的关系。\n",
"\n",
"缺点:矩阵过大,且矩阵过于稀疏"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TF-IDF模型\n",
"TF是词频(Term Frequency),IDF是逆文本频率指数(Inverse Document Frequency)。\n",
"\n",
"即:如果词w在一篇文档d中出现的频率高,并且在其他文档中很少出现,则认为词w具有很好的区分能力,适合用来把文章d和其他文章区分开来。"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>beautiful</th>\n",
" <th>blue</th>\n",
" <th>brown</th>\n",
" <th>dog</th>\n",
" <th>fox</th>\n",
" <th>jumps</th>\n",
" <th>layz</th>\n",
" <th>lazy</th>\n",
" <th>love</th>\n",
" <th>quick</th>\n",
" <th>sky</th>\n",
" <th>today</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.60</td>\n",
" <td>0.52</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.60</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.46</td>\n",
" <td>0.39</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.66</td>\n",
" <td>0.00</td>\n",
" <td>0.46</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.37</td>\n",
" <td>0.37</td>\n",
" <td>0.37</td>\n",
" <td>0.53</td>\n",
" <td>0.00</td>\n",
" <td>0.43</td>\n",
" <td>0.00</td>\n",
" <td>0.37</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.00</td>\n",
" <td>0.35</td>\n",
" <td>0.40</td>\n",
" <td>0.40</td>\n",
" <td>0.40</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.48</td>\n",
" <td>0.00</td>\n",
" <td>0.40</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.36</td>\n",
" <td>0.31</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.72</td>\n",
" <td>0.52</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.41</td>\n",
" <td>0.41</td>\n",
" <td>0.41</td>\n",
" <td>0.00</td>\n",
" <td>0.59</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.41</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" beautiful blue brown dog fox jumps layz lazy love quick sky \\\n",
"0 0.60 0.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 \n",
"1 0.46 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.66 0.00 0.46 \n",
"2 0.00 0.00 0.37 0.37 0.37 0.53 0.00 0.43 0.00 0.37 0.00 \n",
"3 0.00 0.35 0.40 0.40 0.40 0.00 0.00 0.48 0.00 0.40 0.00 \n",
"4 0.36 0.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 \n",
"5 0.00 0.00 0.41 0.41 0.41 0.00 0.59 0.00 0.00 0.41 0.00 \n",
"\n",
" today \n",
"0 0.00 \n",
"1 0.00 \n",
"2 0.00 \n",
"3 0.00 \n",
"4 0.52 \n",
"5 0.00 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n",
"tv_matrix = tv.fit_transform(norm_corpus)\n",
"tv_matrix = tv_matrix.toarray()\n",
"\n",
"vocab = tv.get_feature_names()\n",
"pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Similarity特征\n",
"\n",
"统计文章的相似性"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.000000</td>\n",
" <td>0.753128</td>\n",
" <td>0.000000</td>\n",
" <td>0.179256</td>\n",
" <td>0.807539</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.753128</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.135003</td>\n",
" <td>0.608181</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.796932</td>\n",
" <td>0.000000</td>\n",
" <td>0.592459</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.179256</td>\n",
" <td>0.135003</td>\n",
" <td>0.796932</td>\n",
" <td>1.000000</td>\n",
" <td>0.105992</td>\n",
" <td>0.654475</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.807539</td>\n",
" <td>0.608181</td>\n",
" <td>0.000000</td>\n",
" <td>0.105992</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.592459</td>\n",
" <td>0.654475</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5\n",
"0 1.000000 0.753128 0.000000 0.179256 0.807539 0.000000\n",
"1 0.753128 1.000000 0.000000 0.135003 0.608181 0.000000\n",
"2 0.000000 0.000000 1.000000 0.796932 0.000000 0.592459\n",
"3 0.179256 0.135003 0.796932 1.000000 0.105992 0.654475\n",
"4 0.807539 0.608181 0.000000 0.105992 1.000000 0.000000\n",
"5 0.000000 0.000000 0.592459 0.654475 0.000000 1.000000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"similarity_matrix = cosine_similarity(tv_matrix)\n",
"similarity_df = pd.DataFrame(similarity_matrix)\n",
"similarity_df"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,

@ -158,7 +158,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -187,7 +187,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -216,7 +216,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -239,7 +239,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -251,7 +251,7 @@
" dtype='<U30')" " dtype='<U30')"
] ]
}, },
"execution_count": 8, "execution_count": 7,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -271,7 +271,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -295,7 +295,7 @@
" [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)" " [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)"
] ]
}, },
"execution_count": 12, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -326,7 +326,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -477,7 +477,7 @@
"5 0 " "5 0 "
] ]
}, },
"execution_count": 14, "execution_count": 9,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -491,7 +491,553 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## N-Grams模型" "缺点:只考虑词频,没有考虑到前后逻辑"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## N-Grams模型\n",
"一种语言模型(Language Model,LM)。语言模型是一个基于概率的判别模型,它的输入是一句话(单词的顺序序列),输出是这句话的概率,即这些单词的联合概率(joint probability)。"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>beautiful sky</th>\n",
" <th>beautiful today</th>\n",
" <th>blue beautiful</th>\n",
" <th>blue dog</th>\n",
" <th>blue sky</th>\n",
" <th>brown fox</th>\n",
" <th>dog layz</th>\n",
" <th>dog lazy</th>\n",
" <th>fox jumps</th>\n",
" <th>fox quick</th>\n",
" <th>jumps lazy</th>\n",
" <th>layz brown</th>\n",
" <th>lazy dog</th>\n",
" <th>love blue</th>\n",
" <th>quick blue</th>\n",
" <th>quick brown</th>\n",
" <th>sky beautiful</th>\n",
" <th>sky blue</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" beautiful sky beautiful today blue beautiful blue dog blue sky \\\n",
"0 0 0 1 0 0 \n",
"1 1 0 1 0 0 \n",
"2 0 0 0 0 0 \n",
"3 0 0 0 1 0 \n",
"4 0 1 0 0 1 \n",
"5 0 0 0 0 0 \n",
"\n",
" brown fox dog layz dog lazy fox jumps fox quick jumps lazy \\\n",
"0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 \n",
"2 1 0 0 1 0 1 \n",
"3 1 0 1 0 1 0 \n",
"4 0 0 0 0 0 0 \n",
"5 1 1 0 0 1 0 \n",
"\n",
" layz brown lazy dog love blue quick blue quick brown sky beautiful \\\n",
"0 0 0 0 0 0 0 \n",
"1 0 0 1 0 0 0 \n",
"2 0 1 0 0 1 0 \n",
"3 0 0 0 1 0 0 \n",
"4 0 0 0 0 0 1 \n",
"5 1 0 0 0 0 0 \n",
"\n",
" sky blue \n",
"0 1 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 1 \n",
"5 0 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bv = CountVectorizer(ngram_range=(2,2)) # ngram_range关注两个词的关系\n",
"bv_matrix = bv.fit_transform(norm_corpus)\n",
"bv_matrix = bv_matrix.toarray()\n",
"vocab = bv.get_feature_names()\n",
"pd.DataFrame(bv_matrix, columns=vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"关注相邻两个词的组合,如上面的beautiful sky就是两个词的组合。\n",
"\n",
"第0行(第一句话)在beautiful sky这一列的值为0,因为上面第一句话中这两个词不是前后相邻的关系。\n",
"\n",
"缺点:矩阵过大,且矩阵过于稀疏"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TF-IDF模型\n",
"TF是词频(Term Frequency),IDF是逆文本频率指数(Inverse Document Frequency)。\n",
"\n",
"即:如果词w在一篇文档d中出现的频率高,并且在其他文档中很少出现,则认为词w具有很好的区分能力,适合用来把文章d和其他文章区分开来。"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>beautiful</th>\n",
" <th>blue</th>\n",
" <th>brown</th>\n",
" <th>dog</th>\n",
" <th>fox</th>\n",
" <th>jumps</th>\n",
" <th>layz</th>\n",
" <th>lazy</th>\n",
" <th>love</th>\n",
" <th>quick</th>\n",
" <th>sky</th>\n",
" <th>today</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.60</td>\n",
" <td>0.52</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.60</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.46</td>\n",
" <td>0.39</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.66</td>\n",
" <td>0.00</td>\n",
" <td>0.46</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.37</td>\n",
" <td>0.37</td>\n",
" <td>0.37</td>\n",
" <td>0.53</td>\n",
" <td>0.00</td>\n",
" <td>0.43</td>\n",
" <td>0.00</td>\n",
" <td>0.37</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.00</td>\n",
" <td>0.35</td>\n",
" <td>0.40</td>\n",
" <td>0.40</td>\n",
" <td>0.40</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.48</td>\n",
" <td>0.00</td>\n",
" <td>0.40</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.36</td>\n",
" <td>0.31</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.72</td>\n",
" <td>0.52</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.41</td>\n",
" <td>0.41</td>\n",
" <td>0.41</td>\n",
" <td>0.00</td>\n",
" <td>0.59</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.41</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" beautiful blue brown dog fox jumps layz lazy love quick sky \\\n",
"0 0.60 0.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 \n",
"1 0.46 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.66 0.00 0.46 \n",
"2 0.00 0.00 0.37 0.37 0.37 0.53 0.00 0.43 0.00 0.37 0.00 \n",
"3 0.00 0.35 0.40 0.40 0.40 0.00 0.00 0.48 0.00 0.40 0.00 \n",
"4 0.36 0.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 \n",
"5 0.00 0.00 0.41 0.41 0.41 0.00 0.59 0.00 0.00 0.41 0.00 \n",
"\n",
" today \n",
"0 0.00 \n",
"1 0.00 \n",
"2 0.00 \n",
"3 0.00 \n",
"4 0.52 \n",
"5 0.00 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n",
"tv_matrix = tv.fit_transform(norm_corpus)\n",
"tv_matrix = tv_matrix.toarray()\n",
"\n",
"vocab = tv.get_feature_names()\n",
"pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Similarity特征\n",
"\n",
"统计文章的相似性"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.000000</td>\n",
" <td>0.753128</td>\n",
" <td>0.000000</td>\n",
" <td>0.179256</td>\n",
" <td>0.807539</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.753128</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.135003</td>\n",
" <td>0.608181</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.796932</td>\n",
" <td>0.000000</td>\n",
" <td>0.592459</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.179256</td>\n",
" <td>0.135003</td>\n",
" <td>0.796932</td>\n",
" <td>1.000000</td>\n",
" <td>0.105992</td>\n",
" <td>0.654475</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.807539</td>\n",
" <td>0.608181</td>\n",
" <td>0.000000</td>\n",
" <td>0.105992</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.592459</td>\n",
" <td>0.654475</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5\n",
"0 1.000000 0.753128 0.000000 0.179256 0.807539 0.000000\n",
"1 0.753128 1.000000 0.000000 0.135003 0.608181 0.000000\n",
"2 0.000000 0.000000 1.000000 0.796932 0.000000 0.592459\n",
"3 0.179256 0.135003 0.796932 1.000000 0.105992 0.654475\n",
"4 0.807539 0.608181 0.000000 0.105992 1.000000 0.000000\n",
"5 0.000000 0.000000 0.592459 0.654475 0.000000 1.000000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"similarity_matrix = cosine_similarity(tv_matrix)\n",
"similarity_df = pd.DataFrame(similarity_matrix)\n",
"similarity_df"
] ]
}, },
{ {

Loading…
Cancel
Save