@ -158,7 +158,7 @@
},
{
"cell_type": "code",
"execution_count": 4 ,
"execution_count": 3 ,
"metadata": {},
"outputs": [
{
@ -187,7 +187,7 @@
},
{
"cell_type": "code",
"execution_count": 5 ,
"execution_count": 4 ,
"metadata": {},
"outputs": [
{
@ -216,7 +216,7 @@
},
{
"cell_type": "code",
"execution_count": 6 ,
"execution_count": 5 ,
"metadata": {},
"outputs": [],
"source": [
@ -239,7 +239,7 @@
},
{
"cell_type": "code",
"execution_count": 7 ,
"execution_count": 6 ,
"metadata": {},
"outputs": [
{
@ -251,7 +251,7 @@
" dtype='<U30')"
]
},
"execution_count": 7 ,
"execution_count": 6 ,
"metadata": {},
"output_type": "execute_result"
}
@ -271,7 +271,7 @@
},
{
"cell_type": "code",
"execution_count": 8 ,
"execution_count": 7 ,
"metadata": {},
"outputs": [
{
@ -295,7 +295,7 @@
" [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)"
]
},
"execution_count": 8 ,
"execution_count": 7 ,
"metadata": {},
"output_type": "execute_result"
}
@ -326,7 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 9 ,
"execution_count": 8 ,
"metadata": {},
"outputs": [
{
@ -477,7 +477,7 @@
"5 0 "
]
},
"execution_count": 9 ,
"execution_count": 8 ,
"metadata": {},
"output_type": "execute_result"
}
@ -504,7 +504,7 @@
},
{
"cell_type": "code",
"execution_count": 11 ,
"execution_count": 9 ,
"metadata": {},
"outputs": [
{
@ -713,7 +713,7 @@
"5 0 "
]
},
"execution_count": 11 ,
"execution_count": 9 ,
"metadata": {},
"output_type": "execute_result"
}
@ -749,7 +749,7 @@
},
{
"cell_type": "code",
"execution_count": 12 ,
"execution_count": 10 ,
"metadata": {},
"outputs": [
{
@ -900,7 +900,7 @@
"5 0.00 "
]
},
"execution_count": 12 ,
"execution_count": 10 ,
"metadata": {},
"output_type": "execute_result"
}
@ -926,7 +926,7 @@
},
{
"cell_type": "code",
"execution_count": 13 ,
"execution_count": 11 ,
"metadata": {},
"outputs": [
{
@ -1027,7 +1027,7 @@
"5 0.000000 0.000000 0.592459 0.654475 0.000000 1.000000"
]
},
"execution_count": 13 ,
"execution_count": 11 ,
"metadata": {},
"output_type": "execute_result"
}
@ -1040,6 +1040,491 @@
"similarity_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 聚类特征\n",
"根据K值聚类, 不常用"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Document</th>\n",
" <th>Category</th>\n",
" <th>ClusterLabel</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>The sky is blue and beautiful.</td>\n",
" <td>weather</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Love this blue and beautiful sky!</td>\n",
" <td>weather</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The quick brown fox jumps over the lazy dog.</td>\n",
" <td>animals</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>The brown fox is quick and the blue dog is lazy!</td>\n",
" <td>animals</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>The sky is very blue and the sky is very beaut...</td>\n",
" <td>weather</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>The dog is layz but the brown fox is quick!</td>\n",
" <td>animals</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Document Category ClusterLabel\n",
"0 The sky is blue and beautiful. weather 0\n",
"1 Love this blue and beautiful sky! weather 0\n",
"2 The quick brown fox jumps over the lazy dog. animals 1\n",
"3 The brown fox is quick and the blue dog is lazy! animals 1\n",
"4 The sky is very blue and the sky is very beaut... weather 0\n",
"5 The dog is layz but the brown fox is quick! animals 1"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.cluster import KMeans\n",
"\n",
"km = KMeans(n_clusters=2) # 聚成两个类别\n",
"km.fit_transform(similarity_df)\n",
"cluster_labels = km.labels_\n",
"cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])\n",
"pd.concat([corpus_df, cluster_labels], axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 主题模型\n",
"不常用"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>T1</th>\n",
" <th>T2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.190518</td>\n",
" <td>0.809482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.176822</td>\n",
" <td>0.823178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.845623</td>\n",
" <td>0.154377</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.813959</td>\n",
" <td>0.186041</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.180546</td>\n",
" <td>0.819454</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.835616</td>\n",
" <td>0.164384</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" T1 T2\n",
"0 0.190518 0.809482\n",
"1 0.176822 0.823178\n",
"2 0.845623 0.154377\n",
"3 0.813959 0.186041\n",
"4 0.180546 0.819454\n",
"5 0.835616 0.164384"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.decomposition import LatentDirichletAllocation\n",
"\n",
"lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)\n",
"dt_matrix = lda.fit_transform(tv_matrix)\n",
"features = pd.DataFrame(dt_matrix, columns=['T1','T2'])\n",
"features # 得到每句话在两个分类的不同概率"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 主题和词的权重\n",
"不常用"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('brown', 1.661141029696565), ('dog', 1.661141029696565), ('fox', 1.661141029696565), ('quick', 1.661141029696565), ('lazy', 1.3970326617199404), ('layz', 1.0746375777072972), ('jumps', 1.0180791773370004), ('blue', 0.7626278092631464)]\n",
"[('sky', 2.2642769588598863), ('beautiful', 1.906718528224391), ('blue', 1.7982110631451238), ('love', 1.1480290369567938), ('today', 1.00672575634655)]\n"
]
}
],
"source": [
"# 得到每个词的权重\n",
"tt_matrix = lda.components_\n",
"for topic_weights in tt_matrix:\n",
" topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]\n",
" topic = sorted(topic, key=lambda x: -x[1])\n",
" topic = [item for item in topic if item[1] > 0.6]\n",
" print(topic)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 词嵌入模型 word2vec\n",
"目前常用的模型,解决了上面的全部问题,如:上下文关系、将相关的词,在高维中,赋予一定的关系。"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models import word2vec # pip install gensim\n",
"\n",
"wpt = nltk.WordPunctTokenizer()\n",
"tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]\n",
"\n",
"# Set values for various parameters\n",
"feature_size = 10 # Word vector dimensionality编码的纬度\n",
"window_context = 10 # Context window size前面滑动窗口的大小\n",
"min_word_count = 1 # Minimum word count过滤词的大小\n",
"sample = 1e-3 # Downsample setting for frequent words\n",
"\n",
"w2v_model = word2vec.Word2Vec(tokenized_corpus,size=feature_size,\n",
" window=window_context,min_count=min_word_count,\n",
" sample=sample)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.02585954, 0.04979984, -0.00273573, -0.04431831, 0.02668079,\n",
" -0.04765006, -0.00984736, 0.02903971, -0.00389679, 0.01388443],\n",
" dtype=float32)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"w2v_model.wv['sky'] # 把sky编程10维向量"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# 获取一句话中,所有词的维度数据,并做平均值\n",
"# 如: 一句话有3个次, 则3个词10维度数据各种相加并均值, 用平均值向量表示这句话\n",
"def averge_word_vectors(words,model,vocabulary,num_features):\n",
" feature_vector = np.zeros((num_features,),dtype=\"float64\")\n",
" nwords = 0.\n",
" \n",
" for word in words:\n",
" if word in vocabulary:\n",
" nwords = nwords+1.\n",
" feature_vector = np.add(feature_vector, model[word])\n",
" \n",
" if nwords:\n",
" feature_vector = np.divide(feature_vector, nwords)\n",
" \n",
" return feature_vector\n",
"\n",
"\n",
"def averge_word_vectorizer(corpus, model, num_features):\n",
" vocabulary = set(model.wv.index2word)\n",
" features = [averge_word_vectors(tokenized_sentence,model,\n",
" vocabulary,num_features) \n",
" for tokenized_sentence in corpus]\n",
" \n",
" return np.array(features)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:10: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" # Remove the CWD from sys.path while we load stuff.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-0.021780</td>\n",
" <td>0.026497</td>\n",
" <td>0.003405</td>\n",
" <td>-0.025112</td>\n",
" <td>-0.003608</td>\n",
" <td>-0.019199</td>\n",
" <td>-0.008155</td>\n",
" <td>0.017946</td>\n",
" <td>0.011823</td>\n",
" <td>0.001250</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-0.018664</td>\n",
" <td>0.017657</td>\n",
" <td>0.006898</td>\n",
" <td>-0.009205</td>\n",
" <td>0.002988</td>\n",
" <td>-0.008704</td>\n",
" <td>-0.011054</td>\n",
" <td>0.015843</td>\n",
" <td>-0.001813</td>\n",
" <td>-0.009935</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.005042</td>\n",
" <td>0.006801</td>\n",
" <td>0.004798</td>\n",
" <td>-0.006350</td>\n",
" <td>0.004121</td>\n",
" <td>-0.008453</td>\n",
" <td>0.006522</td>\n",
" <td>-0.018066</td>\n",
" <td>-0.008232</td>\n",
" <td>-0.008274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.005361</td>\n",
" <td>0.010790</td>\n",
" <td>0.004984</td>\n",
" <td>-0.015889</td>\n",
" <td>0.003737</td>\n",
" <td>-0.017226</td>\n",
" <td>0.004497</td>\n",
" <td>-0.016209</td>\n",
" <td>-0.002678</td>\n",
" <td>-0.006484</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.023125</td>\n",
" <td>0.034736</td>\n",
" <td>0.001525</td>\n",
" <td>-0.030098</td>\n",
" <td>0.000194</td>\n",
" <td>-0.029992</td>\n",
" <td>-0.000846</td>\n",
" <td>0.016776</td>\n",
" <td>-0.002937</td>\n",
" <td>0.010821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.009492</td>\n",
" <td>0.005832</td>\n",
" <td>0.000876</td>\n",
" <td>-0.009213</td>\n",
" <td>0.002501</td>\n",
" <td>-0.009656</td>\n",
" <td>0.002072</td>\n",
" <td>-0.005229</td>\n",
" <td>-0.004966</td>\n",
" <td>0.008581</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 -0.021780 0.026497 0.003405 -0.025112 -0.003608 -0.019199 -0.008155 \n",
"1 -0.018664 0.017657 0.006898 -0.009205 0.002988 -0.008704 -0.011054 \n",
"2 -0.005042 0.006801 0.004798 -0.006350 0.004121 -0.008453 0.006522 \n",
"3 -0.005361 0.010790 0.004984 -0.015889 0.003737 -0.017226 0.004497 \n",
"4 -0.023125 0.034736 0.001525 -0.030098 0.000194 -0.029992 -0.000846 \n",
"5 0.009492 0.005832 0.000876 -0.009213 0.002501 -0.009656 0.002072 \n",
"\n",
" 7 8 9 \n",
"0 0.017946 0.011823 0.001250 \n",
"1 0.015843 -0.001813 -0.009935 \n",
"2 -0.018066 -0.008232 -0.008274 \n",
"3 -0.016209 -0.002678 -0.006484 \n",
"4 0.016776 -0.002937 0.010821 \n",
"5 -0.005229 -0.004966 0.008581 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"w2v_feature_array = averge_word_vectorizer(corpus=tokenized_corpus,\n",
" model=w2v_model,\n",
" num_features=feature_size)\n",
"\n",
"pd.DataFrame(w2v_feature_array)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"单独用平均有些问题,即有的词重要性可能更强,后面会再用到"
]
},
{
"cell_type": "code",
"execution_count": null,