From 611f1ec2d327a18e72b371d26b317197409a0129 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Fri, 18 Dec 2020 10:34:04 +0800 Subject: [PATCH] =?UTF-8?q?Create=20=E6=96=87=E6=9C=AC=E7=89=B9=E5=BE=81?= =?UTF-8?q?=E5=A4=84=E7=90=86-checkpoint.ipynb?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../文本特征处理-checkpoint.ipynb | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb diff --git a/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb new file mode 100644 index 0000000..9292cd8 --- /dev/null +++ b/机器学习竞赛实战_优胜解决方案/常用特征构建方法/.ipynb_checkpoints/文本特征处理-checkpoint.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import re\n", + "import nltk" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 构造一个文本数据集" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DocumentCategory
0The sky is blue and beautiful.weather
1Love this blue and beautiful sky!weather
2The quick brown fox jumps over the lazy dog.animals
3The brown fox is quick and the blue dog is lazy!animals
4The sky is very blue and the sky is very beaut...weather
5The dog is lazy but the brown fox is quick!animals
\n", + "
" + ], + "text/plain": [ + " Document Category\n", + "0 The sky is blue and beautiful. weather\n", + "1 Love this blue and beautiful sky! weather\n", + "2 The quick brown fox jumps over the lazy dog. animals\n", + "3 The brown fox is quick and the blue dog is lazy! animals\n", + "4 The sky is very blue and the sky is very beaut... weather\n", + "5 The dog is lazy but the brown fox is quick! animals" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Toy corpus: six short sentences, each paired below with a topic label.\n", + "corpus = ['The sky is blue and beautiful.',\n", + " 'Love this blue and beautiful sky!',\n", + " 'The quick brown fox jumps over the lazy dog.',\n", + " 'The brown fox is quick and the blue dog is lazy!',\n", + " 'The sky is very blue and the sky is very beautiful today',\n", + " 'The dog is lazy but the brown fox is quick!']\n", + "\n", + "labels = ['weather','weather','animals','animals','weather','animals']\n", + "corpus = np.array(corpus)\n", + "corpus_df = pd.DataFrame({'Document': corpus,\n", + " 'Category': labels})\n", + "# Select the columns explicitly so the display order is Document, Category.\n", + "corpus_df = corpus_df[['Document','Category']]\n", + "corpus_df # labeled data: each sentence carries its topic category" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "任务:分类任务,基于一句话分类成相应的标签" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 基本预处理" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download only the stop-word list (used by the stop-words section below).\n", + "# A bare nltk.download() opens the interactive downloader, which blocks\n", + "# Restart & Run All until it is closed by hand.\n", + "nltk.download('stopwords')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Word frequency and stop words" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", 
+ "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}