Add word2vec

4 years ago · d2763f264e
parent 75bb7bdd46
commit d2763f264e
2 changed files with 1000 additions and 30 deletions
--- a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb
@ -158,7 +158,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@ -187,7 +187,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@ -216,7 +216,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -239,7 +239,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
@ -251,7 +251,7 @@
       "      dtype='<U30')"
      ]
     },
-     "execution_count": 7,
+     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -271,7 +271,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@ -295,7 +295,7 @@
       "       [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)"
      ]
     },
-     "execution_count": 8,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -326,7 +326,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@ -477,7 +477,7 @@
       "5      0  "
      ]
     },
-     "execution_count": 9,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -504,7 +504,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@ -713,7 +713,7 @@
       "5         0  "
      ]
     },
-     "execution_count": 11,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -749,7 +749,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
@ -900,7 +900,7 @@
       "5   0.00  "
      ]
     },
-     "execution_count": 12,
+     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -926,7 +926,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@ -1027,7 +1027,7 @@
       "5  0.000000  0.000000  0.592459  0.654475  0.000000  1.000000"
      ]
     },
-     "execution_count": 13,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -1040,6 +1040,491 @@
    "similarity_df"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 聚类特征\n",
+    "根据K值聚类，不常用"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Document</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>ClusterLabel</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>The sky is blue and beautiful.</td>\n",
+       "      <td>weather</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Love this blue and beautiful sky!</td>\n",
+       "      <td>weather</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>The quick brown fox jumps over the lazy dog.</td>\n",
+       "      <td>animals</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>The brown fox is quick and the blue dog is lazy!</td>\n",
+       "      <td>animals</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>The sky is very blue and the sky is very beaut...</td>\n",
+       "      <td>weather</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>The dog is layz but the brown fox is quick!</td>\n",
+       "      <td>animals</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            Document Category  ClusterLabel\n",
+       "0                     The sky is blue and beautiful.  weather             0\n",
+       "1                  Love this blue and beautiful sky!  weather             0\n",
+       "2       The quick brown fox jumps over the lazy dog.  animals             1\n",
+       "3   The brown fox is quick and the blue dog is lazy!  animals             1\n",
+       "4  The sky is very blue and the sky is very beaut...  weather             0\n",
+       "5        The dog is layz but the brown fox is quick!  animals             1"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.cluster import KMeans\n",
+    "\n",
+    "# Group the documents into two clusters, using the rows of similarity_df as features.\n",
+    "# A fixed random_state keeps the cluster ids stable across kernel restarts.\n",
+    "km = KMeans(n_clusters=2, random_state=42)\n",
+    "km.fit(similarity_df)  # only the fitted labels are used, so fit() is enough\n",
+    "cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])\n",
+    "pd.concat([corpus_df, cluster_labels], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 主题模型\n",
+    "不常用"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>T1</th>\n",
+       "      <th>T2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.190518</td>\n",
+       "      <td>0.809482</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.176822</td>\n",
+       "      <td>0.823178</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.845623</td>\n",
+       "      <td>0.154377</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.813959</td>\n",
+       "      <td>0.186041</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.180546</td>\n",
+       "      <td>0.819454</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>0.835616</td>\n",
+       "      <td>0.164384</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         T1        T2\n",
+       "0  0.190518  0.809482\n",
+       "1  0.176822  0.823178\n",
+       "2  0.845623  0.154377\n",
+       "3  0.813959  0.186041\n",
+       "4  0.180546  0.819454\n",
+       "5  0.835616  0.164384"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.decomposition import LatentDirichletAllocation\n",
+    "\n",
+    "lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)\n",
+    "dt_matrix = lda.fit_transform(tv_matrix)\n",
+    "features = pd.DataFrame(dt_matrix, columns=['T1','T2'])\n",
+    "features  # 得到每句话在两个分类的不同概率"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 主题和词的权重\n",
+    "不常用"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('brown', 1.661141029696565), ('dog', 1.661141029696565), ('fox', 1.661141029696565), ('quick', 1.661141029696565), ('lazy', 1.3970326617199404), ('layz', 1.0746375777072972), ('jumps', 1.0180791773370004), ('blue', 0.7626278092631464)]\n",
+      "[('sky', 2.2642769588598863), ('beautiful', 1.906718528224391), ('blue', 1.7982110631451238), ('love', 1.1480290369567938), ('today', 1.00672575634655)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 得到每个词的权重\n",
+    "tt_matrix = lda.components_\n",
+    "for topic_weights in tt_matrix:\n",
+    "    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]\n",
+    "    topic = sorted(topic, key=lambda x: -x[1])\n",
+    "    topic = [item for item in topic if item[1] > 0.6]\n",
+    "    print(topic)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 词嵌入模型 word2vec\n",
+    "目前常用的模型，解决了上面各方法存在的问题，例如：能够利用上下文关系，并让相关的词在高维向量空间中彼此接近。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gensim.models import word2vec  # pip install gensim\n",
+    "\n",
+    "wpt = nltk.WordPunctTokenizer()\n",
+    "tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]\n",
+    "\n",
+    "# Set values for various parameters\n",
+    "feature_size = 10  # Word vector dimensionality 词向量的维度\n",
+    "window_context = 10  # Context window size 上下文滑动窗口的大小\n",
+    "min_word_count = 1  # Minimum word count 最小词频，出现次数低于该值的词会被过滤\n",
+    "sample = 1e-3  # Downsample setting for frequent words\n",
+    "\n",
+    "w2v_model = word2vec.Word2Vec(tokenized_corpus,size=feature_size,\n",
+    "                             window=window_context,min_count=min_word_count,\n",
+    "                             sample=sample)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([-0.02585954,  0.04979984, -0.00273573, -0.04431831,  0.02668079,\n",
+       "       -0.04765006, -0.00984736,  0.02903971, -0.00389679,  0.01388443],\n",
+       "      dtype=float32)"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "w2v_model.wv['sky']  # 把sky编成10维向量"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 获取一句话中所有词的向量，并求平均值\n",
+    "# 如：一句话有3个词，则把这3个词的10维向量逐维相加后取均值，用该平均向量表示这句话\n",
+    "def averge_word_vectors(words,model,vocabulary,num_features):\n",
+    "    feature_vector = np.zeros((num_features,),dtype=\"float64\")\n",
+    "    nwords = 0.\n",
+    "    \n",
+    "    for word in words:\n",
+    "        if word in vocabulary:\n",
+    "            nwords = nwords+1.\n",
+    "            feature_vector = np.add(feature_vector, model[word])\n",
+    "            \n",
+    "    if nwords:\n",
+    "        feature_vector = np.divide(feature_vector, nwords)\n",
+    "        \n",
+    "    return feature_vector\n",
+    "\n",
+    "\n",
+    "def averge_word_vectorizer(corpus, model, num_features):\n",
+    "    \"\"\"Build one averaged word-vector row per tokenized sentence.\n",
+    "\n",
+    "    Args:\n",
+    "        corpus: iterable of tokenized sentences (each an iterable of tokens).\n",
+    "        model: trained Word2Vec model exposing its vectors as `model.wv`.\n",
+    "        num_features: length of each word vector.\n",
+    "\n",
+    "    Returns:\n",
+    "        np.ndarray holding one averaged vector per sentence in `corpus`.\n",
+    "    \"\"\"\n",
+    "    keyed_vectors = model.wv\n",
+    "    # gensim 4.0 renamed `index2word` to `index_to_key`; accept either.\n",
+    "    known_tokens = getattr(keyed_vectors, \"index_to_key\", None)\n",
+    "    if known_tokens is None:\n",
+    "        known_tokens = keyed_vectors.index2word\n",
+    "    vocabulary = set(known_tokens)\n",
+    "\n",
+    "    features = [averge_word_vectors(tokenized_sentence, model,\n",
+    "                                    vocabulary, num_features)\n",
+    "                for tokenized_sentence in corpus]\n",
+    "\n",
+    "    return np.array(features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:10: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
+      "  # Remove the CWD from sys.path while we load stuff.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>9</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-0.021780</td>\n",
+       "      <td>0.026497</td>\n",
+       "      <td>0.003405</td>\n",
+       "      <td>-0.025112</td>\n",
+       "      <td>-0.003608</td>\n",
+       "      <td>-0.019199</td>\n",
+       "      <td>-0.008155</td>\n",
+       "      <td>0.017946</td>\n",
+       "      <td>0.011823</td>\n",
+       "      <td>0.001250</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>-0.018664</td>\n",
+       "      <td>0.017657</td>\n",
+       "      <td>0.006898</td>\n",
+       "      <td>-0.009205</td>\n",
+       "      <td>0.002988</td>\n",
+       "      <td>-0.008704</td>\n",
+       "      <td>-0.011054</td>\n",
+       "      <td>0.015843</td>\n",
+       "      <td>-0.001813</td>\n",
+       "      <td>-0.009935</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>-0.005042</td>\n",
+       "      <td>0.006801</td>\n",
+       "      <td>0.004798</td>\n",
+       "      <td>-0.006350</td>\n",
+       "      <td>0.004121</td>\n",
+       "      <td>-0.008453</td>\n",
+       "      <td>0.006522</td>\n",
+       "      <td>-0.018066</td>\n",
+       "      <td>-0.008232</td>\n",
+       "      <td>-0.008274</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>-0.005361</td>\n",
+       "      <td>0.010790</td>\n",
+       "      <td>0.004984</td>\n",
+       "      <td>-0.015889</td>\n",
+       "      <td>0.003737</td>\n",
+       "      <td>-0.017226</td>\n",
+       "      <td>0.004497</td>\n",
+       "      <td>-0.016209</td>\n",
+       "      <td>-0.002678</td>\n",
+       "      <td>-0.006484</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>-0.023125</td>\n",
+       "      <td>0.034736</td>\n",
+       "      <td>0.001525</td>\n",
+       "      <td>-0.030098</td>\n",
+       "      <td>0.000194</td>\n",
+       "      <td>-0.029992</td>\n",
+       "      <td>-0.000846</td>\n",
+       "      <td>0.016776</td>\n",
+       "      <td>-0.002937</td>\n",
+       "      <td>0.010821</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>0.009492</td>\n",
+       "      <td>0.005832</td>\n",
+       "      <td>0.000876</td>\n",
+       "      <td>-0.009213</td>\n",
+       "      <td>0.002501</td>\n",
+       "      <td>-0.009656</td>\n",
+       "      <td>0.002072</td>\n",
+       "      <td>-0.005229</td>\n",
+       "      <td>-0.004966</td>\n",
+       "      <td>0.008581</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          0         1         2         3         4         5         6  \\\n",
+       "0 -0.021780  0.026497  0.003405 -0.025112 -0.003608 -0.019199 -0.008155   \n",
+       "1 -0.018664  0.017657  0.006898 -0.009205  0.002988 -0.008704 -0.011054   \n",
+       "2 -0.005042  0.006801  0.004798 -0.006350  0.004121 -0.008453  0.006522   \n",
+       "3 -0.005361  0.010790  0.004984 -0.015889  0.003737 -0.017226  0.004497   \n",
+       "4 -0.023125  0.034736  0.001525 -0.030098  0.000194 -0.029992 -0.000846   \n",
+       "5  0.009492  0.005832  0.000876 -0.009213  0.002501 -0.009656  0.002072   \n",
+       "\n",
+       "          7         8         9  \n",
+       "0  0.017946  0.011823  0.001250  \n",
+       "1  0.015843 -0.001813 -0.009935  \n",
+       "2 -0.018066 -0.008232 -0.008274  \n",
+       "3 -0.016209 -0.002678 -0.006484  \n",
+       "4  0.016776 -0.002937  0.010821  \n",
+       "5 -0.005229 -0.004966  0.008581  "
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "w2v_feature_array = averge_word_vectorizer(corpus=tokenized_corpus,\n",
+    "                                          model=w2v_model,\n",
+    "                                          num_features=feature_size)\n",
+    "\n",
+    "pd.DataFrame(w2v_feature_array)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "单独用平均有些问题，即有的词重要性可能更强，后面会再用到"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/文本特征处理.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/文本特征处理.ipynb
@ -158,7 +158,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@ -187,7 +187,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@ -216,7 +216,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -239,7 +239,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
@ -251,7 +251,7 @@
       "      dtype='<U30')"
      ]
     },
-     "execution_count": 7,
+     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -271,7 +271,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@ -295,7 +295,7 @@
       "       [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)"
      ]
     },
-     "execution_count": 8,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -326,7 +326,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@ -477,7 +477,7 @@
       "5      0  "
      ]
     },
-     "execution_count": 9,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -504,7 +504,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@ -713,7 +713,7 @@
       "5         0  "
      ]
     },
-     "execution_count": 11,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -749,7 +749,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
@ -900,7 +900,7 @@
       "5   0.00  "
      ]
     },
-     "execution_count": 12,
+     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -926,7 +926,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@ -1027,7 +1027,7 @@
       "5  0.000000  0.000000  0.592459  0.654475  0.000000  1.000000"
      ]
     },
-     "execution_count": 13,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -1040,6 +1040,491 @@
    "similarity_df"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 聚类特征\n",
+    "根据K值聚类，不常用"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Document</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>ClusterLabel</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>The sky is blue and beautiful.</td>\n",
+       "      <td>weather</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Love this blue and beautiful sky!</td>\n",
+       "      <td>weather</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>The quick brown fox jumps over the lazy dog.</td>\n",
+       "      <td>animals</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>The brown fox is quick and the blue dog is lazy!</td>\n",
+       "      <td>animals</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>The sky is very blue and the sky is very beaut...</td>\n",
+       "      <td>weather</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>The dog is layz but the brown fox is quick!</td>\n",
+       "      <td>animals</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            Document Category  ClusterLabel\n",
+       "0                     The sky is blue and beautiful.  weather             0\n",
+       "1                  Love this blue and beautiful sky!  weather             0\n",
+       "2       The quick brown fox jumps over the lazy dog.  animals             1\n",
+       "3   The brown fox is quick and the blue dog is lazy!  animals             1\n",
+       "4  The sky is very blue and the sky is very beaut...  weather             0\n",
+       "5        The dog is layz but the brown fox is quick!  animals             1"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.cluster import KMeans\n",
+    "\n",
+    "# Group the documents into two clusters, using the rows of similarity_df as features.\n",
+    "# A fixed random_state keeps the cluster ids stable across kernel restarts.\n",
+    "km = KMeans(n_clusters=2, random_state=42)\n",
+    "km.fit(similarity_df)  # only the fitted labels are used, so fit() is enough\n",
+    "cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])\n",
+    "pd.concat([corpus_df, cluster_labels], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 主题模型\n",
+    "不常用"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>T1</th>\n",
+       "      <th>T2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.190518</td>\n",
+       "      <td>0.809482</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.176822</td>\n",
+       "      <td>0.823178</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.845623</td>\n",
+       "      <td>0.154377</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.813959</td>\n",
+       "      <td>0.186041</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.180546</td>\n",
+       "      <td>0.819454</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>0.835616</td>\n",
+       "      <td>0.164384</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         T1        T2\n",
+       "0  0.190518  0.809482\n",
+       "1  0.176822  0.823178\n",
+       "2  0.845623  0.154377\n",
+       "3  0.813959  0.186041\n",
+       "4  0.180546  0.819454\n",
+       "5  0.835616  0.164384"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.decomposition import LatentDirichletAllocation\n",
+    "\n",
+    "lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)\n",
+    "dt_matrix = lda.fit_transform(tv_matrix)\n",
+    "features = pd.DataFrame(dt_matrix, columns=['T1','T2'])\n",
+    "features  # 得到每句话在两个分类的不同概率"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 主题和词的权重\n",
+    "不常用"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('brown', 1.661141029696565), ('dog', 1.661141029696565), ('fox', 1.661141029696565), ('quick', 1.661141029696565), ('lazy', 1.3970326617199404), ('layz', 1.0746375777072972), ('jumps', 1.0180791773370004), ('blue', 0.7626278092631464)]\n",
+      "[('sky', 2.2642769588598863), ('beautiful', 1.906718528224391), ('blue', 1.7982110631451238), ('love', 1.1480290369567938), ('today', 1.00672575634655)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 得到每个词的权重\n",
+    "tt_matrix = lda.components_\n",
+    "for topic_weights in tt_matrix:\n",
+    "    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]\n",
+    "    topic = sorted(topic, key=lambda x: -x[1])\n",
+    "    topic = [item for item in topic if item[1] > 0.6]\n",
+    "    print(topic)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 词嵌入模型 word2vec\n",
+    "目前常用的模型，解决了上面各方法存在的问题，例如：能够利用上下文关系，并让相关的词在高维向量空间中彼此接近。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gensim.models import word2vec  # pip install gensim\n",
+    "\n",
+    "wpt = nltk.WordPunctTokenizer()\n",
+    "tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]\n",
+    "\n",
+    "# Set values for various parameters\n",
+    "feature_size = 10  # Word vector dimensionality 词向量的维度\n",
+    "window_context = 10  # Context window size 上下文滑动窗口的大小\n",
+    "min_word_count = 1  # Minimum word count 最小词频，出现次数低于该值的词会被过滤\n",
+    "sample = 1e-3  # Downsample setting for frequent words\n",
+    "\n",
+    "w2v_model = word2vec.Word2Vec(tokenized_corpus,size=feature_size,\n",
+    "                             window=window_context,min_count=min_word_count,\n",
+    "                             sample=sample)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([-0.02585954,  0.04979984, -0.00273573, -0.04431831,  0.02668079,\n",
+       "       -0.04765006, -0.00984736,  0.02903971, -0.00389679,  0.01388443],\n",
+       "      dtype=float32)"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "w2v_model.wv['sky']  # 把sky编成10维向量"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 获取一句话中所有词的向量，并求平均值\n",
+    "# 如：一句话有3个词，则把这3个词的10维向量逐维相加后取均值，用该平均向量表示这句话\n",
+    "def averge_word_vectors(words,model,vocabulary,num_features):\n",
+    "    feature_vector = np.zeros((num_features,),dtype=\"float64\")\n",
+    "    nwords = 0.\n",
+    "    \n",
+    "    for word in words:\n",
+    "        if word in vocabulary:\n",
+    "            nwords = nwords+1.\n",
+    "            feature_vector = np.add(feature_vector, model[word])\n",
+    "            \n",
+    "    if nwords:\n",
+    "        feature_vector = np.divide(feature_vector, nwords)\n",
+    "        \n",
+    "    return feature_vector\n",
+    "\n",
+    "\n",
+    "def averge_word_vectorizer(corpus, model, num_features):\n",
+    "    \"\"\"Build one averaged word-vector row per tokenized sentence.\n",
+    "\n",
+    "    Args:\n",
+    "        corpus: iterable of tokenized sentences (each an iterable of tokens).\n",
+    "        model: trained Word2Vec model exposing its vectors as `model.wv`.\n",
+    "        num_features: length of each word vector.\n",
+    "\n",
+    "    Returns:\n",
+    "        np.ndarray holding one averaged vector per sentence in `corpus`.\n",
+    "    \"\"\"\n",
+    "    keyed_vectors = model.wv\n",
+    "    # gensim 4.0 renamed `index2word` to `index_to_key`; accept either.\n",
+    "    known_tokens = getattr(keyed_vectors, \"index_to_key\", None)\n",
+    "    if known_tokens is None:\n",
+    "        known_tokens = keyed_vectors.index2word\n",
+    "    vocabulary = set(known_tokens)\n",
+    "\n",
+    "    features = [averge_word_vectors(tokenized_sentence, model,\n",
+    "                                    vocabulary, num_features)\n",
+    "                for tokenized_sentence in corpus]\n",
+    "\n",
+    "    return np.array(features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:10: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
+      "  # Remove the CWD from sys.path while we load stuff.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>9</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-0.021780</td>\n",
+       "      <td>0.026497</td>\n",
+       "      <td>0.003405</td>\n",
+       "      <td>-0.025112</td>\n",
+       "      <td>-0.003608</td>\n",
+       "      <td>-0.019199</td>\n",
+       "      <td>-0.008155</td>\n",
+       "      <td>0.017946</td>\n",
+       "      <td>0.011823</td>\n",
+       "      <td>0.001250</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>-0.018664</td>\n",
+       "      <td>0.017657</td>\n",
+       "      <td>0.006898</td>\n",
+       "      <td>-0.009205</td>\n",
+       "      <td>0.002988</td>\n",
+       "      <td>-0.008704</td>\n",
+       "      <td>-0.011054</td>\n",
+       "      <td>0.015843</td>\n",
+       "      <td>-0.001813</td>\n",
+       "      <td>-0.009935</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>-0.005042</td>\n",
+       "      <td>0.006801</td>\n",
+       "      <td>0.004798</td>\n",
+       "      <td>-0.006350</td>\n",
+       "      <td>0.004121</td>\n",
+       "      <td>-0.008453</td>\n",
+       "      <td>0.006522</td>\n",
+       "      <td>-0.018066</td>\n",
+       "      <td>-0.008232</td>\n",
+       "      <td>-0.008274</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>-0.005361</td>\n",
+       "      <td>0.010790</td>\n",
+       "      <td>0.004984</td>\n",
+       "      <td>-0.015889</td>\n",
+       "      <td>0.003737</td>\n",
+       "      <td>-0.017226</td>\n",
+       "      <td>0.004497</td>\n",
+       "      <td>-0.016209</td>\n",
+       "      <td>-0.002678</td>\n",
+       "      <td>-0.006484</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>-0.023125</td>\n",
+       "      <td>0.034736</td>\n",
+       "      <td>0.001525</td>\n",
+       "      <td>-0.030098</td>\n",
+       "      <td>0.000194</td>\n",
+       "      <td>-0.029992</td>\n",
+       "      <td>-0.000846</td>\n",
+       "      <td>0.016776</td>\n",
+       "      <td>-0.002937</td>\n",
+       "      <td>0.010821</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>0.009492</td>\n",
+       "      <td>0.005832</td>\n",
+       "      <td>0.000876</td>\n",
+       "      <td>-0.009213</td>\n",
+       "      <td>0.002501</td>\n",
+       "      <td>-0.009656</td>\n",
+       "      <td>0.002072</td>\n",
+       "      <td>-0.005229</td>\n",
+       "      <td>-0.004966</td>\n",
+       "      <td>0.008581</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          0         1         2         3         4         5         6  \\\n",
+       "0 -0.021780  0.026497  0.003405 -0.025112 -0.003608 -0.019199 -0.008155   \n",
+       "1 -0.018664  0.017657  0.006898 -0.009205  0.002988 -0.008704 -0.011054   \n",
+       "2 -0.005042  0.006801  0.004798 -0.006350  0.004121 -0.008453  0.006522   \n",
+       "3 -0.005361  0.010790  0.004984 -0.015889  0.003737 -0.017226  0.004497   \n",
+       "4 -0.023125  0.034736  0.001525 -0.030098  0.000194 -0.029992 -0.000846   \n",
+       "5  0.009492  0.005832  0.000876 -0.009213  0.002501 -0.009656  0.002072   \n",
+       "\n",
+       "          7         8         9  \n",
+       "0  0.017946  0.011823  0.001250  \n",
+       "1  0.015843 -0.001813 -0.009935  \n",
+       "2 -0.018066 -0.008232 -0.008274  \n",
+       "3 -0.016209 -0.002678 -0.006484  \n",
+       "4  0.016776 -0.002937  0.010821  \n",
+       "5 -0.005229 -0.004966  0.008581  "
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Turn every tokenized sentence into one averaged word-vector row.\n",
+    "w2v_feature_array = averge_word_vectorizer(\n",
+    "    corpus=tokenized_corpus,\n",
+    "    model=w2v_model,\n",
+    "    num_features=feature_size,\n",
+    ")\n",
+    "\n",
+    "# Display the features as a table: one row per sentence.\n",
+    "pd.DataFrame(w2v_feature_array)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "单独使用平均值存在一个问题：不同词的重要性可能并不相同，而简单平均对所有词一视同仁；后面会再用到。"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,