You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
137 lines
4.5 KiB
137 lines
4.5 KiB
{
|
|
"metadata": {
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.0"
|
|
},
|
|
"orig_nbformat": 4,
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3.7.0 64-bit ('3.7')"
|
|
},
|
|
"interpreter": {
|
|
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
|
|
},
|
|
"coopTranslator": {
|
|
"original_hash": "341efc86325ec2a214f682f57a189dfd",
|
|
"translation_date": "2025-09-04T03:10:22+00:00",
|
|
"source_file": "6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb",
|
|
"language_code": "de"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2,
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load the filtered hotel reviews from CSV\n",
|
|
"import pandas as pd \n",
|
|
"\n",
|
|
"df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# We want to find the most useful tags to keep\n",
|
|
"# Strip the enclosing square brackets and the outermost quotes\n",
|
|
"df.Tags = df.Tags.str.strip(\"[']\")\n",
|
|
"# Collapse the quoted separators between tags into plain commas\n",
|
|
"df.Tags = df.Tags.str.replace(\" ', '\", \",\", regex = False)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# No further tokenizing: each tag in this dataset is already a phrase, so keep it whole\n",
|
|
"# Now split each string on commas into one column per tag\n",
|
|
"tag_list_df = df.Tags.str.split(',', expand = True)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Remove leading and trailing spaces\n",
|
|
"df[\"Tag_1\"] = tag_list_df[0].str.strip()\n",
|
|
"df[\"Tag_2\"] = tag_list_df[1].str.strip()\n",
|
|
"df[\"Tag_3\"] = tag_list_df[2].str.strip()\n",
|
|
"df[\"Tag_4\"] = tag_list_df[3].str.strip()\n",
|
|
"df[\"Tag_5\"] = tag_list_df[4].str.strip()\n",
|
|
"df[\"Tag_6\"] = tag_list_df[5].str.strip()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Merge the 6 columns into one with melt\n",
|
|
"df_tags = df.melt(value_vars=[\"Tag_1\", \"Tag_2\", \"Tag_3\", \"Tag_4\", \"Tag_5\", \"Tag_6\"])\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"The shape of the tags with no filtering: (2514684, 2)\n",
|
|
" index count\n",
|
|
"0 Leisure trip 338423\n",
|
|
"1 Couple 205305\n",
|
|
"2 Solo traveler 89779\n",
|
|
"3 Business trip 68176\n",
|
|
"4 Group 51593\n",
|
|
"5 Family with young children 49318\n",
|
|
"6 Family with older children 21509\n",
|
|
"7 Travelers with friends 1610\n",
|
|
"8 With a pet 1078\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Get the value counts\n",
|
|
"tag_vc = df_tags.value.value_counts()\n",
|
|
"# print(tag_vc)\n",
|
|
"print(\"The shape of the tags with no filtering:\", str(df_tags.shape))\n",
|
|
"# Drop tags about room type, suites, length of stay and mobile device, then keep only tags that occur more than 1000 times\n",
|
|
"df_tags = df_tags[~df_tags.value.str.contains(\"Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double\", na=False, case=False)]\n",
|
|
"tag_vc = df_tags.value.value_counts().reset_index(name=\"count\").query(\"count > 1000\")\n",
|
|
"# Print up to 10 rows (only 9 tags should remain; we'll use these in the filtering section)\n",
|
|
"print(tag_vc[:10])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"\n---\n\n**Haftungsausschluss**: \nDieses Dokument wurde mit dem KI-Übersetzungsdienst [Co-op Translator](https://github.com/Azure/co-op-translator) übersetzt. Obwohl wir uns um Genauigkeit bemühen, beachten Sie bitte, dass automatisierte Übersetzungen Fehler oder Ungenauigkeiten enthalten können. Das Originaldokument in seiner ursprünglichen Sprache sollte als maßgebliche Quelle betrachtet werden. Für kritische Informationen wird eine professionelle menschliche Übersetzung empfohlen. Wir übernehmen keine Haftung für Missverständnisse oder Fehlinterpretationen, die sich aus der Nutzung dieser Übersetzung ergeben.\n"
|
|
]
|
|
}
|
|
]
|
|
} |