Create 文本特征处理-checkpoint.ipynb

5 years ago · 611f1ec2d3
parent 22af266e65
commit 611f1ec2d3
1 changed files with 180 additions and 0 deletions
--- a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb
+++ b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb
@ -0,0 +1,180 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import nltk"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 构造一个文本数据集"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Document</th>\n",
+       "      <th>Category</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>The sky is blue and beautiful.</td>\n",
+       "      <td>weather</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Love this blue and beautiful sky!</td>\n",
+       "      <td>weather</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>The quick brown fox jumps over the lazy dog.</td>\n",
+       "      <td>animals</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>The brown fox is quick and the blue dog is lazy!</td>\n",
+       "      <td>animals</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>The sky is very blue and the sky is very beaut...</td>\n",
+       "      <td>weather</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>The dog is lazy but the brown fox is quick!</td>\n",
+       "      <td>animals</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            Document Category\n",
+       "0                     The sky is blue and beautiful.  weather\n",
+       "1                  Love this blue and beautiful sky!  weather\n",
+       "2       The quick brown fox jumps over the lazy dog.  animals\n",
+       "3   The brown fox is quick and the blue dog is lazy!  animals\n",
+       "4  The sky is very blue and the sky is very beaut...  weather\n",
+       "5        The dog is lazy but the brown fox is quick!  animals"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Toy corpus: six short English sentences, one document per entry.\n",
+    "corpus = ['The sky is blue and beautiful.',\n",
+    "         'Love this blue and beautiful sky!',\n",
+    "         'The quick brown fox jumps over the lazy dog.',\n",
+    "         'The brown fox is quick and the blue dog is lazy!',\n",
+    "         'The sky is very blue and the sky is very beautiful today',\n",
+    "         'The dog is lazy but the brown fox is quick!']\n",
+    "\n",
+    "# One topic label per document, in the same order as `corpus`.\n",
+    "labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']\n",
+    "corpus = np.array(corpus)\n",
+    "corpus_df = pd.DataFrame({'Document': corpus,\n",
+    "                         'Category': labels})\n",
+    "# Select the columns explicitly so the display order is Document, Category.\n",
+    "corpus_df = corpus_df[['Document','Category']]\n",
+    "corpus_df  # labelled data: each sentence carries its topic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "任务：文本分类——根据每句话（Document）的内容，预测其对应的主题标签（Category）。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 基本预处理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch only the stop-word corpus, non-interactively. A bare nltk.download()\n",
+    "# opens the interactive downloader and stalls Restart & Run All.\n",
+    "# NOTE(review): 'stopwords' is chosen because the next section covers stop words;\n",
+    "# add further package ids here if later cells need them.\n",
+    "nltk.download('stopwords')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Word frequency and stop words"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}