You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
124 lines
3.6 KiB
124 lines
3.6 KiB
4 years ago
|
{
|
||
|
"metadata": {
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
4 years ago
|
"version": "3.7.0"
|
||
4 years ago
|
},
|
||
4 years ago
|
"orig_nbformat": 4,
|
||
|
"kernelspec": {
|
||
|
"name": "python3",
|
||
|
"display_name": "Python 3.7.0 64-bit ('3.7')"
|
||
|
},
|
||
|
"interpreter": {
|
||
|
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
|
||
|
}
|
||
4 years ago
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2,
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 4,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Load the filtered hotel reviews from CSV\n",
|
||
4 years ago
|
"import pandas as pd \n",
|
||
|
"\n",
|
||
4 years ago
|
"df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 5,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# We want to find the most useful tags to keep\n",
|
||
|
"# Remove the opening and closing brackets (and any quotes next to them)\n",
|
||
|
"df.Tags = df.Tags.str.strip(\"[']\")\n",
|
||
|
"# Replace the quoted separators between tags with a plain comma\n",
|
||
|
"df.Tags = df.Tags.str.replace(\" ', '\", \",\", regex = False)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 6,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# A step was removed here to take advantage of the fact that each tag in the dataset is already a phrase\n",
|
||
|
"# Now split each tag string on commas into separate columns\n",
|
||
|
"tag_list_df = df.Tags.str.split(',', expand = True)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 7,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Remove leading and trailing spaces\n",
|
||
|
"df[\"Tag_1\"] = tag_list_df[0].str.strip()\n",
|
||
|
"df[\"Tag_2\"] = tag_list_df[1].str.strip()\n",
|
||
|
"df[\"Tag_3\"] = tag_list_df[2].str.strip()\n",
|
||
|
"df[\"Tag_4\"] = tag_list_df[3].str.strip()\n",
|
||
|
"df[\"Tag_5\"] = tag_list_df[4].str.strip()\n",
|
||
|
"df[\"Tag_6\"] = tag_list_df[5].str.strip()\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 8,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Merge the 6 columns into one with melt\n",
|
||
|
"df_tags = df.melt(value_vars=[\"Tag_1\", \"Tag_2\", \"Tag_3\", \"Tag_4\", \"Tag_5\", \"Tag_6\"])\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 9,
|
||
4 years ago
|
"metadata": {},
|
||
4 years ago
|
"outputs": [
|
||
|
{
|
||
|
"output_type": "stream",
|
||
|
"name": "stdout",
|
||
|
"text": [
|
||
|
"The shape of the tags with no filtering: (2514684, 2)\n",
|
||
|
" index count\n",
|
||
|
"0 Leisure trip 338423\n",
|
||
|
"1 Couple 205305\n",
|
||
|
"2 Solo traveler 89779\n",
|
||
|
"3 Business trip 68176\n",
|
||
|
"4 Group 51593\n",
|
||
|
"5 Family with young children 49318\n",
|
||
|
"6 Family with older children 21509\n",
|
||
|
"7 Travelers with friends 1610\n",
|
||
|
"8 With a pet 1078\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
4 years ago
|
"source": [
|
||
|
"# Get the value counts\n",
|
||
|
"tag_vc = df_tags.value.value_counts()\n",
|
||
|
"# print(tag_vc)\n",
|
||
|
"print(\"The shape of the tags with no filtering:\", str(df_tags.shape))\n",
|
||
|
"# Drop tags about rooms, suites, length of stay and mobile devices, and any tag with a count of 1000 or less\n",
|
||
|
"df_tags = df_tags[~df_tags.value.str.contains(\"Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double\", na=False, case=False)]\n",
|
||
|
"tag_vc = df_tags.value.value_counts().reset_index(name=\"count\").query(\"count > 1000\")\n",
|
||
|
"# Print the top 10 (there should only be 9 and we'll use these in the filtering section)\n",
|
||
|
"print(tag_vc[:10])"
|
||
|
]
|
||
|
}
|
||
|
]
|
||
|
}
|