In [1]:
import pandas as pd
import numpy as np
import re
import nltk

## 构造一个文本数据集

In [2]:
# Toy corpus: six short sentences, each tagged with the topic it talks about.
# NOTE(review): 'layz' in the last sentence looks like a typo for 'lazy'; it is
# kept verbatim because the outputs further down contain a separate 'layz' token.
corpus = np.array([
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is layz but the brown fox is quick!',
])
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']

# `columns=` fixes the column order at construction time.
corpus_df = pd.DataFrame(
    {'Document': corpus, 'Category': labels},
    columns=['Document', 'Category'],
)
corpus_df  # one row per sentence together with its topic label

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is layz but the brown fox is quick!,animals


任务：分类任务，基于一句话分类成相应的标签

## 基本预处理

In [3]:
# Fetch the NLTK resources this notebook uses by name. A bare nltk.download()
# opens the interactive downloader, which stalls an unattended "Run All".
# If the download fails, see https://blog.csdn.net/qq_37891889/article/details/104418106
nltk.download('stopwords')  # stop-word lists used by the preprocessing step
nltk.download('book')  # corpora loaded by `from nltk.book import *`

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# NOTE(review): wildcard import -- it binds text1..text9 and sent1..sent9 (listed in
# the output below) plus whatever else nltk.book exports, so later cells cannot
# tell where those names come from. Prefer importing only the names actually needed.
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [4]:
# Stop words and the tokenizer used by the preprocessing function below.
stop_words = nltk.corpus.stopwords.words('english')  # NLTK's English stop-word list
wpt = nltk.WordPunctTokenizer()
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

停用词：

语料中除了与天气、动物相关的词以外，其余的词（如 i、me、my 等）对区分主题基本没有帮助，这类词称为停用词，预处理时会被过滤掉。

In [5]:
def normalize_document(doc):
    """Clean one raw sentence and return its content words as a single string.

    Steps: remove every character that is not an ASCII letter, a digit or
    whitespace; lower-case; trim surrounding whitespace; tokenize with the
    module-level ``wpt`` tokenizer; drop tokens found in the module-level
    ``stop_words`` collection.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional re.I (== 2) silently capped the
    # clean-up at two substitutions per document and applied no flag at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I)
    doc = doc.lower().strip()
    # Split into word tokens, then filter out the stop words.
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)


# Element-wise wrapper so a whole array of documents can be normalized in one call.
normalize_corpus = np.vectorize(normalize_document)

In [6]:
norm_corpus = normalize_corpus(corpus)
norm_corpus  # the corpus after preprocessing

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog layz brown fox quick'],
      dtype='<U30')

## 词袋模型
将所有词语装进一个袋子里,不考虑其词法和语序的问题,即每个词语都是独立的。

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

print(norm_corpus)
# Build a vocabulary from the words that occur in the normalized sentences.
# min_df / max_df drop terms whose document frequency is below / above the
# given threshold; 0.0 and 1.0 keep every term.
# API reference: https://scikit-learn.org/stable/modules/classes.html (search for CountVectorizer)
cv = CountVectorizer(min_df=0., max_df=1.)
# fit_transform() learns the vocabulary and builds the count matrix in one
# pass, so the separate fit() call that used to precede it is not needed.
cv_matrix = cv.fit_transform(norm_corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
feature_names_fn = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
print([str(name) for name in feature_names_fn()])
# The document-term count matrix (one row per sentence, one column per term).
cv_matrix

['sky blue beautiful' 'love blue beautiful sky'
 'quick brown fox jumps lazy dog' 'brown fox quick blue dog lazy'
 'sky blue sky beautiful today' 'dog layz brown fox quick']
['beautiful', 'blue', 'brown', 'dog', 'fox', 'jumps', 'layz', 'lazy', 'love', 'quick', 'sky', 'today']


array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

如上数据中，在词汇表中，是否出现过，有则在相应位置标记为1，有两个则标记为2。

如：第一句话'sky blue beautiful'，词汇表中的第一个词（beautiful）出现一次，第二个词（blue）出现一次，倒数第二个词（sky）出现一次，因此对应的向量是[1,1,...,1,0]。

In [8]:
# Label the count-matrix columns with their vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
feature_names_fn = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
vocab = [str(name) for name in feature_names_fn()]
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,layz,lazy,love,quick,sky,today
0,1,1,0,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,0,1,0,1,0
2,0,0,1,1,1,1,0,1,0,1,0,0
3,0,1,1,1,1,0,0,1,0,1,0,0
4,1,1,0,0,0,0,0,0,0,0,2,1
5,0,0,1,1,1,0,1,0,0,1,0,0


缺点：只考虑词频，没有考虑到前后逻辑

## N-Grams模型
一种语言模型（Language Model，LM），语言模型是一个基于概率的判别模型，它的输入是一句话（单词的顺序序列），输出是这句话的概率，即这些单词的联合概率（joint probability）。

In [9]:
# ngram_range=(2, 2) makes every feature a pair of adjacent words (a bigram).
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
feature_names_fn = getattr(bv, 'get_feature_names_out', None) or bv.get_feature_names
vocab = [str(name) for name in feature_names_fn()]
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog layz,dog lazy,fox jumps,fox quick,jumps lazy,layz brown,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0
3,0,0,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
5,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0


关注相邻两个词的组合（bigram），如上面的beautiful sky就是由前后相邻的两个词组成的。

第0行的beautiful sky列为0，因为在第一句话'sky blue beautiful'中，beautiful后面并没有紧跟着sky，两个词不是这种前后相邻关系。

缺点：矩阵过大，且矩阵过于稀疏

## TF-IDF模型
TF是词频(Term Frequency)，IDF是逆文本频率指数(Inverse Document Frequency)。

即：如果词w在一篇文档d中出现的频率高，并且在其他文档中很少出现，则认为词w具有很好的区分能力，适合用来把文章d和其他文章区分开来。

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted document-term matrix over the same normalized corpus.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
feature_names_fn = getattr(tv, 'get_feature_names_out', None) or tv.get_feature_names
vocab = [str(name) for name in feature_names_fn()]
# Rounded to two decimals for display only; tv_matrix keeps full precision.
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,layz,lazy,love,quick,sky,today
0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0
1,0.46,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.46,0.0
2,0.0,0.0,0.37,0.37,0.37,0.53,0.0,0.43,0.0,0.37,0.0,0.0
3,0.0,0.35,0.4,0.4,0.4,0.0,0.0,0.48,0.0,0.4,0.0,0.0
4,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.52
5,0.0,0.0,0.41,0.41,0.41,0.0,0.59,0.0,0.0,0.41,0.0,0.0


## Similarity特征

统计文章的相似性

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the TF-IDF matrix;
# entry (i, j) compares document i with document j.
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.753128,0.0,0.179256,0.807539,0.0
1,0.753128,1.0,0.0,0.135003,0.608181,0.0
2,0.0,0.0,1.0,0.796932,0.0,0.592459
3,0.179256,0.135003,0.796932,1.0,0.105992,0.654475
4,0.807539,0.608181,0.0,0.105992,1.0,0.0
5,0.0,0.0,0.592459,0.654475,0.0,1.0


## 聚类特征
根据K值聚类，不常用

In [12]:
from sklearn.cluster import KMeans

# Split the documents into two clusters, using each document's row of
# similarities as its feature vector.
# random_state pins the centroid initialisation so the labels are reproducible
# (the LDA cell already fixes its seed the same way). n_init is spelled out
# because its default changed from 10 to 'auto' in scikit-learn 1.4.
km = KMeans(n_clusters=2, n_init=10, random_state=42)
# fit() is enough here: the distances returned by fit_transform() were discarded.
km.fit(similarity_df)
cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is layz but the brown fox is quick!,animals,1


## 主题模型
不常用

In [16]:
from sklearn.decomposition import LatentDirichletAllocation

# Two-topic LDA fitted on the TF-IDF matrix; the seed is fixed for repeatability.
lda = LatentDirichletAllocation(
    n_components=2,
    max_iter=100,
    random_state=42,
)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(data=dt_matrix, columns=['T1', 'T2'])
features  # per-sentence weight of each of the two topics

Unnamed: 0,T1,T2
0,0.190518,0.809482
1,0.176822,0.823178
2,0.845623,0.154377
3,0.813959,0.186041
4,0.180546,0.819454
5,0.835616,0.164384


## 主题和词的权重
不常用

In [17]:
# Print, for each topic, the terms whose weight exceeds the cut-off, heaviest first.
# The term list is read from `tv`, the vectorizer that produced `tv_matrix`
# (the matrix the LDA model was fitted on), instead of the shared `vocab`
# variable, which several other cells rebind to different vocabularies.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
feature_names_fn = getattr(tv, 'get_feature_names_out', None) or tv.get_feature_names
topic_terms = [str(name) for name in feature_names_fn()]
min_weight = 0.6  # terms at or below this weight are not printed

tt_matrix = lda.components_
for topic_weights in tt_matrix:
    # float() keeps the printed tuples as plain Python numbers.
    topic = [(token, float(weight)) for token, weight in zip(topic_terms, topic_weights)]
    topic = sorted(topic, key=lambda item: item[1], reverse=True)
    topic = [item for item in topic if item[1] > min_weight]
    print(topic)

[('brown', 1.661141029696565), ('dog', 1.661141029696565), ('fox', 1.661141029696565), ('quick', 1.661141029696565), ('lazy', 1.3970326617199404), ('layz', 1.0746375777072972), ('jumps', 1.0180791773370004), ('blue', 0.7626278092631464)]
[('sky', 2.2642769588598863), ('beautiful', 1.906718528224391), ('blue', 1.7982110631451238), ('love', 1.1480290369567938), ('today', 1.00672575634655)]


## 词嵌入模型 word2vec
目前常用的模型，解决了上面的全部问题，如：上下文关系、将相关的词，在高维中，赋予一定的关系。

In [19]:
import gensim
from gensim.models import word2vec  # pip install gensim

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Set values for various parameters
feature_size = 10  # Word vector dimensionality
window_context = 10  # Context window size
min_word_count = 1  # Minimum word count
sample = 1e-3  # Downsample setting for frequent words

# gensim 4 renamed the `size` keyword to `vector_size`; choose the spelling
# that the installed release understands so the cell runs on either line.
size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
w2v_model = word2vec.Word2Vec(tokenized_corpus,
                              window=window_context,
                              min_count=min_word_count,
                              sample=sample,
                              **{size_kwarg: feature_size})

In [20]:
w2v_model.wv['sky']  # the 10-dimensional vector learned for 'sky'

array([-0.02585954,  0.04979984, -0.00273573, -0.04431831,  0.02668079,
       -0.04765006, -0.00984736,  0.02903971, -0.00389679,  0.01388443],
      dtype=float32)

In [23]:
def averge_word_vectors(words, model, vocabulary, num_features):
    """Represent one tokenized sentence by the mean of its word vectors.

    Example: for a sentence of three known words, their three vectors are
    summed element-wise and divided by three.

    Parameters
    ----------
    words : iterable of str
        Tokens of one sentence.
    model : object
        Either a trained model exposing its vectors as ``model.wv[word]`` or a
        plain mapping ``model[word]``; ``.wv`` is used whenever it exists.
    vocabulary : container of str
        Words that have a vector; tokens outside it are skipped.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``; all zeros when none of the
        words is in ``vocabulary``.
    """
    # Indexing the model itself (`model[word]`) was deprecated in gensim 3 and
    # removed in gensim 4; the vectors live on the `.wv` attribute.
    vectors = getattr(model, 'wv', model)
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for word in words:
        if word in vocabulary:
            feature_vector += vectors[word]
            nwords += 1

    # Guard against dividing by zero when no word was recognised.
    if nwords:
        feature_vector /= nwords

    return feature_vector


def averge_word_vectorizer(corpus, model, num_features):
    """Turn every tokenized sentence of ``corpus`` into one averaged word vector.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenized sentences.
    model : object
        Trained model whose ``wv`` attribute lists the known words.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        One row per sentence, as produced by ``averge_word_vectors``.
    """
    keyed_vectors = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`; support both spellings.
    known_words = getattr(keyed_vectors, 'index_to_key', None)
    if known_words is None:
        known_words = keyed_vectors.index2word
    vocabulary = set(known_words)

    features = [averge_word_vectors(tokenized_sentence, model,
                                    vocabulary, num_features)
                for tokenized_sentence in corpus]

    return np.array(features)

In [22]:
# Average the word vectors of every tokenized sentence: one row per sentence.
w2v_feature_array = averge_word_vectorizer(
    corpus=tokenized_corpus,
    model=w2v_model,
    num_features=feature_size,
)

pd.DataFrame(w2v_feature_array)

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.02178,0.026497,0.003405,-0.025112,-0.003608,-0.019199,-0.008155,0.017946,0.011823,0.00125
1,-0.018664,0.017657,0.006898,-0.009205,0.002988,-0.008704,-0.011054,0.015843,-0.001813,-0.009935
2,-0.005042,0.006801,0.004798,-0.00635,0.004121,-0.008453,0.006522,-0.018066,-0.008232,-0.008274
3,-0.005361,0.01079,0.004984,-0.015889,0.003737,-0.017226,0.004497,-0.016209,-0.002678,-0.006484
4,-0.023125,0.034736,0.001525,-0.030098,0.000194,-0.029992,-0.000846,0.016776,-0.002937,0.010821
5,0.009492,0.005832,0.000876,-0.009213,0.002501,-0.009656,0.002072,-0.005229,-0.004966,0.008581


单独用平均有些问题，即有的词重要性可能更强，后面会再用到