{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import re\n", "import nltk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 构造一个文本数据集" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DocumentCategory
0The sky is blue and beautiful.weather
1Love this blue and beautiful sky!weather
2The quick brown fox jumps over the lazy dog.animals
3The brown fox is quick and the blue dog is lazy!animals
4The sky is very blue and the sky is very beaut...weather
5The dog is layz but the brown fox is quick!animals
\n", "
" ], "text/plain": [ " Document Category\n", "0 The sky is blue and beautiful. weather\n", "1 Love this blue and beautiful sky! weather\n", "2 The quick brown fox jumps over the lazy dog. animals\n", "3 The brown fox is quick and the blue dog is lazy! animals\n", "4 The sky is very blue and the sky is very beaut... weather\n", "5 The dog is layz but the brown fox is quick! animals" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corpus = ['The sky is blue and beautiful.',\n", " 'Love this blue and beautiful sky!',\n", " 'The quick brown fox jumps over the lazy dog.',\n", " 'The brown fox is quick and the blue dog is lazy!',\n", " 'The sky is very blue and the sky is very beautiful today',\n", " 'The dog is layz but the brown fox is quick!']\n", "\n", "labels = ['weather','weather','animals','animals','weather','animals',]\n", "corpus = np.array(corpus)\n", "corpus_df = pd.DataFrame({'Document': corpus,\n", " 'Category': labels})\n", "corpus_df = corpus_df[['Document','Category']]\n", "corpus_df # 有标签,如每句话的主题" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "任务:分类任务,基于一句话分类成相应的标签" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 基本预处理" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download() # 下载失败的用这个方法https://blog.csdn.net/qq_37891889/article/details/104418106" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "*** Introductory Examples for the NLTK Book ***\n", "Loading text1, ..., text9 and sent1, ..., sent9\n", "Type the name of the text or sentence to view it.\n", "Type: 'texts()' or 'sents()' to list the materials.\n", "text1: Moby Dick by Herman Melville 1851\n", "text2: Sense and Sensibility by Jane Austen 1811\n", "text3: The Book of Genesis\n", "text4: Inaugural Address Corpus\n", "text5: Chat Corpus\n", "text6: Monty Python and the Holy Grail\n", "text7: Wall Street Journal\n", "text8: Personals Corpus\n", "text9: The Man Who Was Thursday by G . K . 
Chesterton 1908\n" ] } ], "source": [ "from nltk.book import *" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']\n" ] } ], "source": [ "# 词频与停用词\n", "wpt = nltk.WordPunctTokenizer()\n", "stop_words = nltk.corpus.stopwords.words('english')\n", "print(stop_words)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "停用词:\n", "\n", "这里面除了天气和动物信息,其它都基本没用,如i me my等等这些词,这些相当于停用词" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def normalize_document(doc):\n", " # 预处理\n", " doc = re.sub(r'[^a-zA-Z0-9\\s]', '', doc, re.I) # 去掉多余字符\n", " doc = doc.lower() # 统一转小写\n", " doc = doc.strip() # 去空格\n", " # 分词,切分提取全部词\n", " tokens = wpt.tokenize(doc) \n", " # 查找停用词,并过滤\n", " filtered_tokens = [token for token in tokens if token not in stop_words]\n", " # 拼接所有的词\n", " doc = ' '.join(filtered_tokens)\n", " return doc\n", "\n", "\n", "normalize_corpus = np.vectorize(normalize_document)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['sky blue beautiful', 'love blue beautiful sky',\n", " 'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',\n", " 'sky blue sky beautiful today', 'dog layz brown fox quick'],\n", " dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "" ], "text/plain": [ " beautiful blue brown dog fox jumps layz lazy love quick sky \\\n", "0 1 1 0 0 0 0 0 0 0 0 1 \n", "1 1 1 0 0 0 0 0 0 1 0 1 \n", "2 0 0 1 1 1 1 0 1 0 1 0 \n", "3 0 1 1 1 1 0 0 1 0 1 0 \n", "4 1 1 0 0 0 0 0 0 0 0 2 \n", "5 0 0 1 1 1 0 1 0 0 1 0 \n", "\n", " today \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 1 \n", "5 0 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab = cv.get_feature_names()\n", "pd.DataFrame(cv_matrix, columns=vocab)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "缺点:只考虑词频,没有考虑到前后逻辑" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## N-Grams模型\n", "一种语言模型(Language Model,LM),语言模型是一个基于概率的判别模型,它的输入是一句话(单词的顺序序列),输出是这句话的概率,即这些单词的联合概率(joint probability)。" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
beautiful skybeautiful todayblue beautifulblue dogblue skybrown foxdog layzdog lazyfox jumpsfox quickjumps lazylayz brownlazy doglove bluequick bluequick brownsky beautifulsky blue
0001000000000000001
1101000000000010000
2000001001010100100
3000101010100001000
4010010000000000011
5000001100101000000
\n", "
" ], "text/plain": [ " beautiful sky beautiful today blue beautiful blue dog blue sky \\\n", "0 0 0 1 0 0 \n", "1 1 0 1 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 1 0 \n", "4 0 1 0 0 1 \n", "5 0 0 0 0 0 \n", "\n", " brown fox dog layz dog lazy fox jumps fox quick jumps lazy \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 \n", "2 1 0 0 1 0 1 \n", "3 1 0 1 0 1 0 \n", "4 0 0 0 0 0 0 \n", "5 1 1 0 0 1 0 \n", "\n", " layz brown lazy dog love blue quick blue quick brown sky beautiful \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 1 0 0 0 \n", "2 0 1 0 0 1 0 \n", "3 0 0 0 1 0 0 \n", "4 0 0 0 0 0 1 \n", "5 1 0 0 0 0 0 \n", "\n", " sky blue \n", "0 1 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 1 \n", "5 0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bv = CountVectorizer(ngram_range=(2,2)) # ngram_range关注两个词的关系\n", "bv_matrix = bv.fit_transform(norm_corpus)\n", "bv_matrix = bv_matrix.toarray()\n", "vocab = bv.get_feature_names()\n", "pd.DataFrame(bv_matrix, columns=vocab)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "关注两个两个词的组合,如上面的beautiful sky就是两个词的组合。\n", "\n", "第0列的beautiful sky为0,因为上面第一句话中,两个词不是前后关系。\n", "\n", "缺点:矩阵过大,且矩阵过于稀疏" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TF-IDF模型\n", "TF是词频(Term Frequency),IDF是逆文本频率指数(Inverse Document Frequency)。\n", "\n", "即:如果词w在一篇文档d中出现的频率高,并且在其他文档中很少出现,则认为词w具有很好的区分能力,适合用来把文章d和其他文章区分开来。" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
beautifulbluebrowndogfoxjumpslayzlazylovequickskytoday
00.600.520.000.000.000.000.000.000.000.000.600.00
10.460.390.000.000.000.000.000.000.660.000.460.00
20.000.000.370.370.370.530.000.430.000.370.000.00
30.000.350.400.400.400.000.000.480.000.400.000.00
40.360.310.000.000.000.000.000.000.000.000.720.52
50.000.000.410.410.410.000.590.000.000.410.000.00
\n", "
" ], "text/plain": [ " beautiful blue brown dog fox jumps layz lazy love quick sky \\\n", "0 0.60 0.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 \n", "1 0.46 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.66 0.00 0.46 \n", "2 0.00 0.00 0.37 0.37 0.37 0.53 0.00 0.43 0.00 0.37 0.00 \n", "3 0.00 0.35 0.40 0.40 0.40 0.00 0.00 0.48 0.00 0.40 0.00 \n", "4 0.36 0.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 \n", "5 0.00 0.00 0.41 0.41 0.41 0.00 0.59 0.00 0.00 0.41 0.00 \n", "\n", " today \n", "0 0.00 \n", "1 0.00 \n", "2 0.00 \n", "3 0.00 \n", "4 0.52 \n", "5 0.00 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n", "tv_matrix = tv.fit_transform(norm_corpus)\n", "tv_matrix = tv_matrix.toarray()\n", "\n", "vocab = tv.get_feature_names()\n", "pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Similarity特征\n", "\n", "统计文章的相似性" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345
01.0000000.7531280.0000000.1792560.8075390.000000
10.7531281.0000000.0000000.1350030.6081810.000000
20.0000000.0000001.0000000.7969320.0000000.592459
30.1792560.1350030.7969321.0000000.1059920.654475
40.8075390.6081810.0000000.1059921.0000000.000000
50.0000000.0000000.5924590.6544750.0000001.000000
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5\n", "0 1.000000 0.753128 0.000000 0.179256 0.807539 0.000000\n", "1 0.753128 1.000000 0.000000 0.135003 0.608181 0.000000\n", "2 0.000000 0.000000 1.000000 0.796932 0.000000 0.592459\n", "3 0.179256 0.135003 0.796932 1.000000 0.105992 0.654475\n", "4 0.807539 0.608181 0.000000 0.105992 1.000000 0.000000\n", "5 0.000000 0.000000 0.592459 0.654475 0.000000 1.000000" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "similarity_matrix = cosine_similarity(tv_matrix)\n", "similarity_df = pd.DataFrame(similarity_matrix)\n", "similarity_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 聚类特征\n", "根据K值聚类,不常用" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DocumentCategoryClusterLabel
0The sky is blue and beautiful.weather0
1Love this blue and beautiful sky!weather0
2The quick brown fox jumps over the lazy dog.animals1
3The brown fox is quick and the blue dog is lazy!animals1
4The sky is very blue and the sky is very beaut...weather0
5The dog is layz but the brown fox is quick!animals1
\n", "
" ], "text/plain": [ " Document Category ClusterLabel\n", "0 The sky is blue and beautiful. weather 0\n", "1 Love this blue and beautiful sky! weather 0\n", "2 The quick brown fox jumps over the lazy dog. animals 1\n", "3 The brown fox is quick and the blue dog is lazy! animals 1\n", "4 The sky is very blue and the sky is very beaut... weather 0\n", "5 The dog is layz but the brown fox is quick! animals 1" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.cluster import KMeans\n", "\n", "km = KMeans(n_clusters=2) # 聚成两个类别\n", "km.fit_transform(similarity_df)\n", "cluster_labels = km.labels_\n", "cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])\n", "pd.concat([corpus_df, cluster_labels], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 主题模型\n", "不常用" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
T1T2
00.1905180.809482
10.1768220.823178
20.8456230.154377
30.8139590.186041
40.1805460.819454
50.8356160.164384
\n", "
" ], "text/plain": [ " T1 T2\n", "0 0.190518 0.809482\n", "1 0.176822 0.823178\n", "2 0.845623 0.154377\n", "3 0.813959 0.186041\n", "4 0.180546 0.819454\n", "5 0.835616 0.164384" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.decomposition import LatentDirichletAllocation\n", "\n", "lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)\n", "dt_matrix = lda.fit_transform(tv_matrix)\n", "features = pd.DataFrame(dt_matrix, columns=['T1','T2'])\n", "features # 得到每句话在两个分类的不同概率" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 主题和词的权重\n", "不常用" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('brown', 1.661141029696565), ('dog', 1.661141029696565), ('fox', 1.661141029696565), ('quick', 1.661141029696565), ('lazy', 1.3970326617199404), ('layz', 1.0746375777072972), ('jumps', 1.0180791773370004), ('blue', 0.7626278092631464)]\n", "[('sky', 2.2642769588598863), ('beautiful', 1.906718528224391), ('blue', 1.7982110631451238), ('love', 1.1480290369567938), ('today', 1.00672575634655)]\n" ] } ], "source": [ "# 得到每个词的权重\n", "tt_matrix = lda.components_\n", "for topic_weights in tt_matrix:\n", " topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]\n", " topic = sorted(topic, key=lambda x: -x[1])\n", " topic = [item for item in topic if item[1] > 0.6]\n", " print(topic)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 词嵌入模型 word2vec\n", "目前常用的模型,解决了上面的全部问题,如:上下文关系、将相关的词,在高维中,赋予一定的关系。" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "from gensim.models import word2vec # pip install gensim\n", "\n", "wpt = nltk.WordPunctTokenizer()\n", "tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]\n", "\n", "# Set values for various parameters\n", "feature_size = 10 # Word vector dimensionality编码的纬度\n", "window_context = 10 # Context window size前面滑动窗口的大小\n", "min_word_count = 1 # Minimum word count过滤词的大小\n", "sample = 1e-3 # Downsample setting for frequent words\n", "\n", "w2v_model = word2vec.Word2Vec(tokenized_corpus,size=feature_size,\n", " window=window_context,min_count=min_word_count,\n", " sample=sample)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-0.02585954, 0.04979984, -0.00273573, -0.04431831, 0.02668079,\n", " -0.04765006, -0.00984736, 0.02903971, -0.00389679, 0.01388443],\n", " dtype=float32)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w2v_model.wv['sky'] # 把sky编程10维向量" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# 获取一句话中,所有词的维度数据,并做平均值\n", "# 如:一句话有3个次,则3个词10维度数据各种相加并均值,用平均值向量表示这句话\n", "def averge_word_vectors(words,model,vocabulary,num_features):\n", " feature_vector = np.zeros((num_features,),dtype=\"float64\")\n", " nwords = 0.\n", " \n", " for word in words:\n", " if word in vocabulary:\n", " nwords = nwords+1.\n", " feature_vector = np.add(feature_vector, model[word])\n", " \n", " if nwords:\n", " feature_vector = np.divide(feature_vector, nwords)\n", " \n", " return feature_vector\n", "\n", "\n", "def averge_word_vectorizer(corpus, model, num_features):\n", " vocabulary = set(model.wv.index2word)\n", " features = [averge_word_vectors(tokenized_sentence,model,\n", " vocabulary,num_features) \n", " for tokenized_sentence in corpus]\n", " 
\n", " return np.array(features)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:10: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n", " # Remove the CWD from sys.path while we load stuff.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
0-0.0217800.0264970.003405-0.025112-0.003608-0.019199-0.0081550.0179460.0118230.001250
1-0.0186640.0176570.006898-0.0092050.002988-0.008704-0.0110540.015843-0.001813-0.009935
2-0.0050420.0068010.004798-0.0063500.004121-0.0084530.006522-0.018066-0.008232-0.008274
3-0.0053610.0107900.004984-0.0158890.003737-0.0172260.004497-0.016209-0.002678-0.006484
4-0.0231250.0347360.001525-0.0300980.000194-0.029992-0.0008460.016776-0.0029370.010821
50.0094920.0058320.000876-0.0092130.002501-0.0096560.002072-0.005229-0.0049660.008581
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 -0.021780 0.026497 0.003405 -0.025112 -0.003608 -0.019199 -0.008155 \n", "1 -0.018664 0.017657 0.006898 -0.009205 0.002988 -0.008704 -0.011054 \n", "2 -0.005042 0.006801 0.004798 -0.006350 0.004121 -0.008453 0.006522 \n", "3 -0.005361 0.010790 0.004984 -0.015889 0.003737 -0.017226 0.004497 \n", "4 -0.023125 0.034736 0.001525 -0.030098 0.000194 -0.029992 -0.000846 \n", "5 0.009492 0.005832 0.000876 -0.009213 0.002501 -0.009656 0.002072 \n", "\n", " 7 8 9 \n", "0 0.017946 0.011823 0.001250 \n", "1 0.015843 -0.001813 -0.009935 \n", "2 -0.018066 -0.008232 -0.008274 \n", "3 -0.016209 -0.002678 -0.006484 \n", "4 0.016776 -0.002937 0.010821 \n", "5 -0.005229 -0.004966 0.008581 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w2v_feature_array = averge_word_vectorizer(corpus=tokenized_corpus,\n", " model=w2v_model,\n", " num_features=feature_size)\n", "\n", "pd.DataFrame(w2v_feature_array)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "单独用平均有些问题,即有的词重要性可能更强,后面会再用到" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }