"positiveFiles = ['./training_data/positiveReviews/' + f for f in listdir('./training_data/positiveReviews/') if isfile(join('./training_data/positiveReviews/', f))]\n",
"negativeFiles = ['./training_data/negativeReviews/' + f for f in listdir('./training_data/negativeReviews/') if isfile(join('./training_data/negativeReviews/', f))]\n",
"numWords = []\n",
"for pf in positiveFiles:\n",
" with open(pf, \"r\", encoding='utf-8') as f:\n",
" line=f.readline()\n",
" counter = len(line.split())\n",
" numWords.append(counter) \n",
"print('Positive files finished')\n",
"\n",
"for nf in negativeFiles:\n",
" with open(nf, \"r\", encoding='utf-8') as f:\n",
" line=f.readline()\n",
" counter = len(line.split())\n",
" numWords.append(counter) \n",
"print('Negative files finished')\n",
"\n",
"numFiles = len(numWords)\n",
"print('The total number of files is', numFiles)\n",
"print('The total number of words in the files is', sum(numWords))\n",
"print('The average number of words in the files is', sum(numWords)/len(numWords))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"plt.hist(numWords, 50)\n",
"plt.xlabel('Sequence Length')\n",
"plt.ylabel('Frequency')\n",
"plt.axis([0, 1200, 0, 8000])\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"从直方图和句子的平均单词数,我们认为将句子最大长度设置为绝大多数的长度 250 是可行的。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"maxSeqLength = 250"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 查看其中一条评论\n",
"fname = positiveFiles[3] #Can use any valid index (not just 3)\n",