ML-For-Beginners/6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb

{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "orig_nbformat": 4,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.0 64-bit ('3.7')"
  },
  "interpreter": {
   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import time\n",
    "import ast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_address(row):\n",
    "    \"\"\"Shorten a hotel's full address to a 'City, Country' label.\n",
    "\n",
    "    Args:\n",
    "        row: A mapping-like record (for example a DataFrame row) that has a\n",
    "            \"Hotel_Address\" entry.\n",
    "\n",
    "    Returns:\n",
    "        The label paired with the first keyword found in the address, with\n",
    "        keywords checked in the order listed below, or the address itself\n",
    "        unchanged when none of the keywords is found.\n",
    "    \"\"\"\n",
    "    # Read the address once, by key: attribute access (row.Hotel_Address)\n",
    "    # does not work for plain mappings such as dicts.\n",
    "    address = row[\"Hotel_Address\"]\n",
    "    # (keyword searched for in the address, label returned on a match)\n",
    "    keyword_labels = (\n",
    "        (\"Netherlands\", \"Amsterdam, Netherlands\"),\n",
    "        (\"Barcelona\", \"Barcelona, Spain\"),\n",
    "        (\"United Kingdom\", \"London, United Kingdom\"),\n",
    "        (\"Milan\", \"Milan, Italy\"),\n",
    "        (\"France\", \"Paris, France\"),\n",
    "        (\"Vienna\", \"Vienna, Austria\"),\n",
    "    )\n",
    "    for keyword, label in keyword_labels:\n",
    "        if keyword in address:\n",
    "            return label\n",
    "    return address"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start the timer first so that reading the file counts toward the elapsed time\n",
    "start = time.time()\n",
    "# Read the hotel reviews CSV into a DataFrame\n",
    "df = pd.read_csv(\"../../data/Hotel_Reviews.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the `lat` and `lng` columns, which are not used in this notebook\n",
    "df.drop(columns=[\"lat\", \"lng\"], inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrite each full address with the value replace_address returns for its row\n",
    "df[\"Hotel_Address\"] = df.apply(replace_address, axis=\"columns\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop `Additional_Number_of_Scoring`\n",
    "df.drop(columns=[\"Additional_Number_of_Scoring\"], inplace=True)\n",
    "# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values.\n",
    "# Select a single column before calling `transform`: transforming the whole groupby\n",
    "# returns a DataFrame with one count column per remaining column, and recent pandas\n",
    "# versions refuse to assign that to one column (\"Columns must be same length as key\").\n",
    "# Counting `Hotel_Name` within its own groups gives the number of rows per hotel.\n",
    "df[\"Total_Number_of_Reviews\"] = df.groupby(\"Hotel_Name\")[\"Hotel_Name\"].transform(\"count\")\n",
    "# Mean reviewer score per hotel, rounded to one decimal place\n",
    "df[\"Average_Score\"] = df.groupby(\"Hotel_Name\")[\"Reviewer_Score\"].transform(\"mean\").round(1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process the Tags into new 0/1 indicator columns.\n",
    "# The file Hotel_Reviews_Tags.py identifies the most important tags:\n",
    "# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends,\n",
    "# Family with young children, Family with older children, With a pet\n",
    "# Each entry is (new column name, phrases that set the column to 1).\n",
    "tag_columns = (\n",
    "    (\"Leisure_trip\", (\"Leisure trip\",)),\n",
    "    (\"Couple\", (\"Couple\",)),\n",
    "    (\"Solo_traveler\", (\"Solo traveler\",)),\n",
    "    (\"Business_trip\", (\"Business trip\",)),\n",
    "    (\"Group\", (\"Group\", \"Travelers with friends\")),\n",
    "    (\"Family_with_young_children\", (\"Family with young children\",)),\n",
    "    (\"Family_with_older_children\", (\"Family with older children\",)),\n",
    "    (\"With_a_pet\", (\"With a pet\",)),\n",
    ")\n",
    "for column, phrases in tag_columns:\n",
    "    # 1 when any of the phrases occurs in the row's Tags value, otherwise 0\n",
    "    df[column] = df.Tags.apply(lambda tag: int(any(phrase in tag for phrase in phrases)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# None of these columns are needed from here on, so remove them\n",
    "df.drop(\n",
    "    columns=[\n",
    "        \"Review_Date\",\n",
    "        \"Review_Total_Negative_Word_Counts\",\n",
    "        \"Review_Total_Positive_Word_Counts\",\n",
    "        \"days_since_review\",\n",
    "        \"Total_Number_of_Reviews_Reviewer_Has_Given\",\n",
    "    ],\n",
    "    inplace=True,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Saving results to Hotel_Reviews_Filtered.csv\n",
      "Filtering took 23.74 seconds\n"
     ]
    }
   ],
   "source": [
    "# Write the processed reviews, including the calculated columns, to a new CSV file\n",
    "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
    "df.to_csv(\"../../data/Hotel_Reviews_Filtered.csv\", index=False)\n",
    "# Report the time elapsed since `start` was recorded, rounded to two decimals\n",
    "end = time.time()\n",
    "print(f\"Filtering took {round(end - start, 2)} seconds\")\n"
   ]
  }
 ]
}
moving files from py to notebook 4 years ago			`{`
			`"metadata": {`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
notebook edits 4 years ago			`"version": "3.7.0"`
moving files from py to notebook 4 years ago			`},`
notebook edits 4 years ago			`"orig_nbformat": 4,`
			`"kernelspec": {`
			`"name": "python3",`
			`"display_name": "Python 3.7.0 64-bit ('3.7')"`
			`},`
			`"interpreter": {`
			`"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"`
			`}`
moving files from py to notebook 4 years ago			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2,`
			`"cells": [`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 1,`
moving files from py to notebook 4 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import pandas as pd\n",`
			`"import time\n",`
			`"import ast"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 2,`
moving files from py to notebook 4 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def replace_address(row):\n",`
			`" if \"Netherlands\" in row[\"Hotel_Address\"]:\n",`
			`" return \"Amsterdam, Netherlands\"\n",`
			`" elif \"Barcelona\" in row[\"Hotel_Address\"]:\n",`
			`" return \"Barcelona, Spain\"\n",`
			`" elif \"United Kingdom\" in row[\"Hotel_Address\"]:\n",`
			`" return \"London, United Kingdom\"\n",`
			`" elif \"Milan\" in row[\"Hotel_Address\"]: \n",`
			`" return \"Milan, Italy\"\n",`
			`" elif \"France\" in row[\"Hotel_Address\"]:\n",`
			`" return \"Paris, France\"\n",`
			`" elif \"Vienna\" in row[\"Hotel_Address\"]:\n",`
			`" return \"Vienna, Austria\" \n",`
			`" else:\n",`
			`" return row.Hotel_Address\n",`
			`" "`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 3,`
moving files from py to notebook 4 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Load the hotel reviews from CSV\n",`
			`"start = time.time()\n",`
			`"df = pd.read_csv('../../data/Hotel_Reviews.csv')\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 4,`
moving files from py to notebook 4 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# dropping columns we will not use:\n",`
			`"df.drop([\"lat\", \"lng\"], axis = 1, inplace=True)\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 5,`
moving files from py to notebook 4 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Replace all the addresses with a shortened, more useful form\n",`
			`"df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 6,`
moving files from py to notebook 4 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			"# Drop `Additional_Number_of_Scoring`\n",
			`"df.drop([\"Additional_Number_of_Scoring\"], axis = 1, inplace=True)\n",`
			"# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values\n",
			`"df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')\n",`
			`"df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 7,`
moving files from py to notebook 4 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Process the Tags into new columns\n",`
			`"# The file Hotel_Reviews_Tags.py, identifies the most important tags\n",`
			`"# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, \n",`
			`"# Family with young children, Family with older children, With a pet\n",`
			`"df[\"Leisure_trip\"] = df.Tags.apply(lambda tag: 1 if \"Leisure trip\" in tag else 0)\n",`
			`"df[\"Couple\"] = df.Tags.apply(lambda tag: 1 if \"Couple\" in tag else 0)\n",`
			`"df[\"Solo_traveler\"] = df.Tags.apply(lambda tag: 1 if \"Solo traveler\" in tag else 0)\n",`
			`"df[\"Business_trip\"] = df.Tags.apply(lambda tag: 1 if \"Business trip\" in tag else 0)\n",`
			`"df[\"Group\"] = df.Tags.apply(lambda tag: 1 if \"Group\" in tag or \"Travelers with friends\" in tag else 0)\n",`
			`"df[\"Family_with_young_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with young children\" in tag else 0)\n",`
			`"df[\"Family_with_older_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with older children\" in tag else 0)\n",`
			`"df[\"With_a_pet\"] = df.Tags.apply(lambda tag: 1 if \"With a pet\" in tag else 0)\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 8,`
moving files from py to notebook 4 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# No longer need any of these columns\n",`
notebook edits 4 years ago			`"df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"`
moving files from py to notebook 4 years ago			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 4 years ago			`"execution_count": 9,`
moving files from py to notebook 4 years ago			`"metadata": {},`
notebook edits 4 years ago			`"outputs": [`
			`{`
			`"output_type": "stream",`
			`"name": "stdout",`
			`"text": [`
			`"Saving results to Hotel_Reviews_Filtered.csv\n",`
			`"Filtering took 23.74 seconds\n"`
			`]`
			`}`
			`],`
moving files from py to notebook 4 years ago			`"source": [`
			`"# Saving new data file with calculated columns\n",`
			`"print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",`
notebook edits 4 years ago			`"df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",`
moving files from py to notebook 4 years ago			`"end = time.time()\n",`
			`"print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"`
			`]`
			`}`
			`]`
			`}`