Create 文本特征处理-checkpoint.ipynb

pull/2/head
benjas 5 years ago
parent 22af266e65
commit 611f1ec2d3

@ -0,0 +1,180 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"import nltk"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 构造一个文本数据集"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Document</th>\n",
" <th>Category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>The sky is blue and beautiful.</td>\n",
" <td>weather</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Love this blue and beautiful sky!</td>\n",
" <td>weather</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The quick brown fox jumps over the lazy dog.</td>\n",
" <td>animals</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>The brown fox is quick and the blue dog is lazy!</td>\n",
" <td>animals</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>The sky is very blue and the sky is very beaut...</td>\n",
" <td>weather</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>The dog is lazy but the brown fox is quick!</td>\n",
" <td>animals</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Document Category\n",
"0 The sky is blue and beautiful. weather\n",
"1 Love this blue and beautiful sky! weather\n",
"2 The quick brown fox jumps over the lazy dog. animals\n",
"3 The brown fox is quick and the blue dog is lazy! animals\n",
"4 The sky is very blue and the sky is very beaut... weather\n",
"5 The dog is lazy but the brown fox is quick! animals"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Toy corpus: six short sentences, each paired with a topic label.\n",
"corpus = np.array([\n",
"    'The sky is blue and beautiful.',\n",
"    'Love this blue and beautiful sky!',\n",
"    'The quick brown fox jumps over the lazy dog.',\n",
"    'The brown fox is quick and the blue dog is lazy!',\n",
"    'The sky is very blue and the sky is very beautiful today',\n",
"    'The dog is lazy but the brown fox is quick!',\n",
"])\n",
"labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']\n",
"\n",
"# Passing `columns` fixes the column order at construction time, so no\n",
"# separate re-selection step is needed afterwards.\n",
"corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels},\n",
"                         columns=['Document', 'Category'])\n",
"corpus_df  # each document carries a label: the topic of the sentence"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"任务:文本分类——根据每句话的内容,预测其对应的类别标签(weather 或 animals)。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 基本预处理"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download only the stop-word corpus, by name. Calling nltk.download() with no\n",
"# arguments launches the interactive downloader, which stalls Restart & Run All.\n",
"# NOTE(review): 'stopwords' is chosen because the next section covers stop words;\n",
"# add further resource ids here if later cells need them.\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 词频与停用词"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading…
Cancel
Save