diff --git a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb
new file mode 100644
index 0000000..9292cd8
--- /dev/null
+++ b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb
@@ -0,0 +1,180 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import re\n",
+ "import nltk"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 构造一个文本数据集"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Document | \n",
+ " Category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " The sky is blue and beautiful. | \n",
+ " weather | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Love this blue and beautiful sky! | \n",
+ " weather | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " The quick brown fox jumps over the lazy dog. | \n",
+ " animals | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " The brown fox is quick and the blue dog is lazy! | \n",
+ " animals | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " The sky is very blue and the sky is very beaut... | \n",
+ " weather | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " The dog is layz but the brown fox is quick! | \n",
+ " animals | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Document Category\n",
+ "0 The sky is blue and beautiful. weather\n",
+ "1 Love this blue and beautiful sky! weather\n",
+ "2 The quick brown fox jumps over the lazy dog. animals\n",
+ "3 The brown fox is quick and the blue dog is lazy! animals\n",
+ "4 The sky is very blue and the sky is very beaut... weather\n",
+ "5 The dog is layz but the brown fox is quick! animals"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Toy corpus of six sentences. NOTE(review): 'layz' below looks like a typo for 'lazy'; left unchanged so the saved outputs still match.\n",
+ "corpus = np.array(['The sky is blue and beautiful.',\n",
+ "                   'Love this blue and beautiful sky!',\n",
+ "                   'The quick brown fox jumps over the lazy dog.',\n",
+ "                   'The brown fox is quick and the blue dog is lazy!',\n",
+ "                   'The sky is very blue and the sky is very beautiful today',\n",
+ "                   'The dog is layz but the brown fox is quick!'])\n",
+ "labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']\n",
+ "\n",
+ "corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels},\n",
+ "                         columns=['Document', 'Category'])\n",
+ "# Each document carries a topic label in the Category column.\n",
+ "corpus_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "任务:文本分类——根据每句话的内容,预测它对应的主题标签(weather 或 animals)。"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 基本预处理"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n"
+ ]
+ }
+ ],
+ "source": [
+ "nltk.download('stopwords')  # fetch only the stop-word list; a bare nltk.download() opens the interactive downloader and blocks Run All"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Word frequency and stop words"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}