parent
22af266e65
commit
611f1ec2d3
@ -0,0 +1,180 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import re\n",
|
||||
"import nltk"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 构造一个文本数据集"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Document</th>\n",
|
||||
" <th>Category</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>The sky is blue and beautiful.</td>\n",
|
||||
" <td>weather</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>Love this blue and beautiful sky!</td>\n",
|
||||
" <td>weather</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>The quick brown fox jumps over the lazy dog.</td>\n",
|
||||
" <td>animals</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>The brown fox is quick and the blue dog is lazy!</td>\n",
|
||||
" <td>animals</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>The sky is very blue and the sky is very beaut...</td>\n",
|
||||
" <td>weather</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>The dog is layz but the brown fox is quick!</td>\n",
|
||||
" <td>animals</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Document Category\n",
|
||||
"0 The sky is blue and beautiful. weather\n",
|
||||
"1 Love this blue and beautiful sky! weather\n",
|
||||
"2 The quick brown fox jumps over the lazy dog. animals\n",
|
||||
"3 The brown fox is quick and the blue dog is lazy! animals\n",
|
||||
"4 The sky is very blue and the sky is very beaut... weather\n",
|
||||
"5 The dog is layz but the brown fox is quick! animals"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Toy corpus: six short sentences, each about the weather or about animals.\n",
|
||||
"corpus = np.array([\n",
|
||||
"    'The sky is blue and beautiful.',\n",
|
||||
"    'Love this blue and beautiful sky!',\n",
|
||||
"    'The quick brown fox jumps over the lazy dog.',\n",
|
||||
"    'The brown fox is quick and the blue dog is lazy!',\n",
|
||||
"    'The sky is very blue and the sky is very beautiful today',\n",
|
||||
"    # 'lazy' must be spelled as in the sentences above so it maps to the same token.\n",
|
||||
"    'The dog is lazy but the brown fox is quick!',\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"# One topic label per sentence, in the same order as `corpus`.\n",
|
||||
"labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']\n",
|
||||
"\n",
|
||||
"# `columns=` pins the column order explicitly: Document first, then Category.\n",
|
||||
"corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels},\n",
|
||||
"                         columns=['Document', 'Category'])\n",
|
||||
"corpus_df  # cell result: one row per sentence together with its topic label"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"任务:文本分类——根据每句话的内容,预测其对应的类别标签(weather 或 animals)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 基本预处理"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Fetch only the stop-word list. A bare nltk.download() opens the interactive\n",
|
||||
"# downloader and waits for user input, which stalls Restart & Run All.\n",
|
||||
"nltk.download('stopwords')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 词频与停用词"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Loading…
Reference in new issue