{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import re\n", "import nltk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 构造一个文本数据集" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DocumentCategory
0The sky is blue and beautiful.weather
1Love this blue and beautiful sky!weather
2The quick brown fox jumps over the lazy dog.animals
3The brown fox is quick and the blue dog is lazy!animals
4The sky is very blue and the sky is very beaut...weather
5The dog is layz but the brown fox is quick!animals
\n", "
" ], "text/plain": [ " Document Category\n", "0 The sky is blue and beautiful. weather\n", "1 Love this blue and beautiful sky! weather\n", "2 The quick brown fox jumps over the lazy dog. animals\n", "3 The brown fox is quick and the blue dog is lazy! animals\n", "4 The sky is very blue and the sky is very beaut... weather\n", "5 The dog is layz but the brown fox is quick! animals" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corpus = ['The sky is blue and beautiful.',\n", " 'Love this blue and beautiful sky!',\n", " 'The quick brown fox jumps over the lazy dog.',\n", " 'The brown fox is quick and the blue dog is lazy!',\n", " 'The sky is very blue and the sky is very beautiful today',\n", " 'The dog is layz but the brown fox is quick!']\n", "\n", "labels = ['weather','weather','animals','animals','weather','animals',]\n", "corpus = np.array(corpus)\n", "corpus_df = pd.DataFrame({'Document': corpus,\n", " 'Category': labels})\n", "corpus_df = corpus_df[['Document','Category']]\n", "corpus_df # 有标签,如每句话的主题" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "任务:分类任务,基于一句话分类成相应的标签" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 基本预处理" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download() # 下载失败的用这个方法https://blog.csdn.net/qq_37891889/article/details/104418106" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "*** Introductory Examples for the NLTK Book ***\n", "Loading text1, ..., text9 and sent1, ..., sent9\n", "Type the name of the text or sentence to view it.\n", "Type: 'texts()' or 'sents()' to list the materials.\n", "text1: Moby Dick by Herman Melville 1851\n", "text2: Sense and Sensibility by Jane Austen 1811\n", "text3: The Book of Genesis\n", "text4: Inaugural Address Corpus\n", "text5: Chat Corpus\n", "text6: Monty Python and the Holy Grail\n", "text7: Wall Street Journal\n", "text8: Personals Corpus\n", "text9: The Man Who Was Thursday by G . K . 
Chesterton 1908\n" ] } ], "source": [ "from nltk.book import *" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']\n" ] } ], "source": [ "# 词频与停用词\n", "wpt = nltk.WordPunctTokenizer()\n", "stop_words = nltk.corpus.stopwords.words('english')\n", "print(stop_words)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "停用词:\n", "\n", "这里面除了天气和动物信息,其它都基本没用,如i me my等等这些词,这些相当于停用词" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def normalize_document(doc):\n", " # 预处理\n", " doc = re.sub(r'[^a-zA-Z0-9\\s]', '', doc, re.I) # 去掉多余字符\n", " doc = doc.lower() # 统一转小写\n", " doc = doc.strip() # 去空格\n", " # 分词,切分提取全部词\n", " tokens = wpt.tokenize(doc) \n", " # 查找停用词,并过滤\n", " filtered_tokens = [token for token in tokens if token not in stop_words]\n", " # 拼接所有的词\n", " doc = ' '.join(filtered_tokens)\n", " return doc\n", "\n", "\n", "normalize_corpus = np.vectorize(normalize_document)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['sky blue beautiful', 'love blue beautiful sky',\n", " 'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',\n", " 'sky blue sky beautiful today', 'dog layz brown fox quick'],\n", " dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "" ], "text/plain": [ " beautiful blue brown dog fox jumps layz lazy love quick sky \\\n", "0 1 1 0 0 0 0 0 0 0 0 1 \n", "1 1 1 0 0 0 0 0 0 1 0 1 \n", "2 0 0 1 1 1 1 0 1 0 1 0 \n", "3 0 1 1 1 1 0 0 1 0 1 0 \n", "4 1 1 0 0 0 0 0 0 0 0 2 \n", "5 0 0 1 1 1 0 1 0 0 1 0 \n", "\n", " today \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 1 \n", "5 0 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab = cv.get_feature_names()\n", "pd.DataFrame(cv_matrix, columns=vocab)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "缺点:只考虑词频,没有考虑到前后逻辑" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## N-Grams模型\n", "一种语言模型(Language Model,LM),语言模型是一个基于概率的判别模型,它的输入是一句话(单词的顺序序列),输出是这句话的概率,即这些单词的联合概率(joint probability)。" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
beautiful skybeautiful todayblue beautifulblue dogblue skybrown foxdog layzdog lazyfox jumpsfox quickjumps lazylayz brownlazy doglove bluequick bluequick brownsky beautifulsky blue
0001000000000000001
1101000000000010000
2000001001010100100
3000101010100001000
4010010000000000011
5000001100101000000
\n", "
" ], "text/plain": [ " beautiful sky beautiful today blue beautiful blue dog blue sky \\\n", "0 0 0 1 0 0 \n", "1 1 0 1 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 1 0 \n", "4 0 1 0 0 1 \n", "5 0 0 0 0 0 \n", "\n", " brown fox dog layz dog lazy fox jumps fox quick jumps lazy \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 1 0 0 1 0 1 \n", "3 1 0 1 0 1 0 \n", "4 0 0 0 0 0 0 \n", "5 1 1 0 0 1 0 \n", "\n", " layz brown lazy dog love blue quick blue quick brown sky beautiful \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 1 0 0 0 \n", "2 0 1 0 0 1 0 \n", "3 0 0 0 1 0 0 \n", "4 0 0 0 0 0 1 \n", "5 1 0 0 0 0 0 \n", "\n", " sky blue \n", "0 1 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 1 \n", "5 0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bv = CountVectorizer(ngram_range=(2,2)) # ngram_range关注两个词的关系\n", "bv_matrix = bv.fit_transform(norm_corpus)\n", "bv_matrix = bv_matrix.toarray()\n", "vocab = bv.get_feature_names()\n", "pd.DataFrame(bv_matrix, columns=vocab)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "关注两个两个词的组合,如上面的beautiful sky就是两个词的组合。\n", "\n", "第0列的beautiful sky为0,因为上面第一句话中,两个词不是前后关系。\n", "\n", "缺点:矩阵过大,且矩阵过于稀疏" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TF-IDF模型\n", "TF是词频(Term Frequency),IDF是逆文本频率指数(Inverse Document Frequency)。\n", "\n", "即:如果词w在一篇文档d中出现的频率高,并且在其他文档中很少出现,则认为词w具有很好的区分能力,适合用来把文章d和其他文章区分开来。" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
beautifulbluebrowndogfoxjumpslayzlazylovequickskytoday
00.600.520.000.000.000.000.000.000.000.000.600.00
10.460.390.000.000.000.000.000.000.660.000.460.00
20.000.000.370.370.370.530.000.430.000.370.000.00
30.000.350.400.400.400.000.000.480.000.400.000.00
40.360.310.000.000.000.000.000.000.000.000.720.52
50.000.000.410.410.410.000.590.000.000.410.000.00
\n", "
" ], "text/plain": [ " beautiful blue brown dog fox jumps layz lazy love quick sky \\\n", "0 0.60 0.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 \n", "1 0.46 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.66 0.00 0.46 \n", "2 0.00 0.00 0.37 0.37 0.37 0.53 0.00 0.43 0.00 0.37 0.00 \n", "3 0.00 0.35 0.40 0.40 0.40 0.00 0.00 0.48 0.00 0.40 0.00 \n", "4 0.36 0.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 \n", "5 0.00 0.00 0.41 0.41 0.41 0.00 0.59 0.00 0.00 0.41 0.00 \n", "\n", " today \n", "0 0.00 \n", "1 0.00 \n", "2 0.00 \n", "3 0.00 \n", "4 0.52 \n", "5 0.00 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n", "tv_matrix = tv.fit_transform(norm_corpus)\n", "tv_matrix = tv_matrix.toarray()\n", "\n", "vocab = tv.get_feature_names()\n", "pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Similarity特征\n", "\n", "统计文章的相似性" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345
01.0000000.7531280.0000000.1792560.8075390.000000
10.7531281.0000000.0000000.1350030.6081810.000000
20.0000000.0000001.0000000.7969320.0000000.592459
30.1792560.1350030.7969321.0000000.1059920.654475
40.8075390.6081810.0000000.1059921.0000000.000000
50.0000000.0000000.5924590.6544750.0000001.000000
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5\n", "0 1.000000 0.753128 0.000000 0.179256 0.807539 0.000000\n", "1 0.753128 1.000000 0.000000 0.135003 0.608181 0.000000\n", "2 0.000000 0.000000 1.000000 0.796932 0.000000 0.592459\n", "3 0.179256 0.135003 0.796932 1.000000 0.105992 0.654475\n", "4 0.807539 0.608181 0.000000 0.105992 1.000000 0.000000\n", "5 0.000000 0.000000 0.592459 0.654475 0.000000 1.000000" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "similarity_matrix = cosine_similarity(tv_matrix)\n", "similarity_df = pd.DataFrame(similarity_matrix)\n", "similarity_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 聚类特征\n", "根据K值聚类,不常用" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DocumentCategoryClusterLabel
0The sky is blue and beautiful.weather0
1Love this blue and beautiful sky!weather0
2The quick brown fox jumps over the lazy dog.animals1
3The brown fox is quick and the blue dog is lazy!animals1
4The sky is very blue and the sky is very beaut...weather0
5The dog is layz but the brown fox is quick!animals1
\n", "
" ], "text/plain": [ " Document Category ClusterLabel\n", "0 The sky is blue and beautiful. weather 0\n", "1 Love this blue and beautiful sky! weather 0\n", "2 The quick brown fox jumps over the lazy dog. animals 1\n", "3 The brown fox is quick and the blue dog is lazy! animals 1\n", "4 The sky is very blue and the sky is very beaut... weather 0\n", "5 The dog is layz but the brown fox is quick! animals 1" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.cluster import KMeans\n", "\n", "km = KMeans(n_clusters=2) # 聚成两个类别\n", "km.fit_transform(similarity_df)\n", "cluster_labels = km.labels_\n", "cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])\n", "pd.concat([corpus_df, cluster_labels], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 主题模型\n", "不常用" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
T1T2
00.1905180.809482
10.1768220.823178
20.8456230.154377
30.8139590.186041
40.1805460.819454
50.8356160.164384
\n", "
" ], "text/plain": [ " T1 T2\n", "0 0.190518 0.809482\n", "1 0.176822 0.823178\n", "2 0.845623 0.154377\n", "3 0.813959 0.186041\n", "4 0.180546 0.819454\n", "5 0.835616 0.164384" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.decomposition import LatentDirichletAllocation\n", "\n", "lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)\n", "dt_matrix = lda.fit_transform(tv_matrix)\n", "features = pd.DataFrame(dt_matrix, columns=['T1','T2'])\n", "features # 得到每句话在两个分类的不同概率" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 主题和词的权重\n", "不常用" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('brown', 1.661141029696565), ('dog', 1.661141029696565), ('fox', 1.661141029696565), ('quick', 1.661141029696565), ('lazy', 1.3970326617199404), ('layz', 1.0746375777072972), ('jumps', 1.0180791773370004), ('blue', 0.7626278092631464)]\n", "[('sky', 2.2642769588598863), ('beautiful', 1.906718528224391), ('blue', 1.7982110631451238), ('love', 1.1480290369567938), ('today', 1.00672575634655)]\n" ] } ], "source": [ "# 得到每个词的权重\n", "tt_matrix = lda.components_\n", "for topic_weights in tt_matrix:\n", " topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]\n", " topic = sorted(topic, key=lambda x: -x[1])\n", " topic = [item for item in topic if item[1] > 0.6]\n", " print(topic)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 词嵌入模型 word2vec\n", "目前常用的模型,解决了上面的全部问题,如:上下文关系、将相关的词,在高维中,赋予一定的关系。" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "from gensim.models import word2vec # pip install gensim\n", "\n", "wpt = nltk.WordPunctTokenizer()\n", "tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]\n", "\n", "# Set values for various parameters\n", "feature_size = 10 # Word vector dimensionality编码的纬度\n", "window_context = 10 # Context window size前面滑动窗口的大小\n", "min_word_count = 1 # Minimum word count过滤词的大小\n", "sample = 1e-3 # Downsample setting for frequent words\n", "\n", "w2v_model = word2vec.Word2Vec(tokenized_corpus,size=feature_size,\n", " window=window_context,min_count=min_word_count,\n", " sample=sample)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-0.02585954, 0.04979984, -0.00273573, -0.04431831, 0.02668079,\n", " -0.04765006, -0.00984736, 0.02903971, -0.00389679, 0.01388443],\n", " dtype=float32)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w2v_model.wv['sky'] # 把sky编程10维向量" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# 获取一句话中,所有词的维度数据,并做平均值\n", "# 如:一句话有3个次,则3个词10维度数据各种相加并均值,用平均值向量表示这句话\n", "def averge_word_vectors(words,model,vocabulary,num_features):\n", " feature_vector = np.zeros((num_features,),dtype=\"float64\")\n", " nwords = 0.\n", " \n", " for word in words:\n", " if word in vocabulary:\n", " nwords = nwords+1.\n", " feature_vector = np.add(feature_vector, model[word])\n", " \n", " if nwords:\n", " feature_vector = np.divide(feature_vector, nwords)\n", " \n", " return feature_vector\n", "\n", "\n", "def averge_word_vectorizer(corpus, model, num_features):\n", " vocabulary = set(model.wv.index2word)\n", " features = [averge_word_vectors(tokenized_sentence,model,\n", " vocabulary,num_features) \n", " for tokenized_sentence in corpus]\n", " 
\n", " return np.array(features)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:10: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n", " # Remove the CWD from sys.path while we load stuff.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
0-0.0217800.0264970.003405-0.025112-0.003608-0.019199-0.0081550.0179460.0118230.001250
1-0.0186640.0176570.006898-0.0092050.002988-0.008704-0.0110540.015843-0.001813-0.009935
2-0.0050420.0068010.004798-0.0063500.004121-0.0084530.006522-0.018066-0.008232-0.008274
3-0.0053610.0107900.004984-0.0158890.003737-0.0172260.004497-0.016209-0.002678-0.006484
4-0.0231250.0347360.001525-0.0300980.000194-0.029992-0.0008460.016776-0.0029370.010821
50.0094920.0058320.000876-0.0092130.002501-0.0096560.002072-0.005229-0.0049660.008581
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 -0.021780 0.026497 0.003405 -0.025112 -0.003608 -0.019199 -0.008155 \n", "1 -0.018664 0.017657 0.006898 -0.009205 0.002988 -0.008704 -0.011054 \n", "2 -0.005042 0.006801 0.004798 -0.006350 0.004121 -0.008453 0.006522 \n", "3 -0.005361 0.010790 0.004984 -0.015889 0.003737 -0.017226 0.004497 \n", "4 -0.023125 0.034736 0.001525 -0.030098 0.000194 -0.029992 -0.000846 \n", "5 0.009492 0.005832 0.000876 -0.009213 0.002501 -0.009656 0.002072 \n", "\n", " 7 8 9 \n", "0 0.017946 0.011823 0.001250 \n", "1 0.015843 -0.001813 -0.009935 \n", "2 -0.018066 -0.008232 -0.008274 \n", "3 -0.016209 -0.002678 -0.006484 \n", "4 0.016776 -0.002937 0.010821 \n", "5 -0.005229 -0.004966 0.008581 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w2v_feature_array = averge_word_vectorizer(corpus=tokenized_corpus,\n", " model=w2v_model,\n", " num_features=feature_size)\n", "\n", "pd.DataFrame(w2v_feature_array)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "单独用平均有些问题,即有的词重要性可能更强,后面会再用到" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }