diff --git a/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb b/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb index 87c33b5e..4592429f 100644 --- a/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb +++ b/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb @@ -103,8 +103,8 @@ "cell_type": "code", "execution_count": null, "source": [ - "suppressWarnings(if (!require(\"pacman\"))install.packages(\"pacman\"))\n", - "\n", + "suppressWarnings(if (!require(\"pacman\"))install.packages(\"pacman\"))\r\n", + "\r\n", "pacman::p_load(tidyverse, tidymodels, DataExplorer, themis, here)" ], "outputs": [], @@ -138,12 +138,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Import data\n", - "df <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/4-Classification/data/cuisines.csv\")\n", - "\n", - "# View the first 5 rows\n", - "df %>% \n", - " slice_head(n = 5)\n" + "# Import data\r\n", + "df <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/4-Classification/data/cuisines.csv\")\r\n", + "\r\n", + "# View the first 5 rows\r\n", + "df %>% \r\n", + " slice_head(n = 5)\r\n" ], "outputs": [], "metadata": { @@ -218,7 +218,7 @@ "\r\n", "Next, let's assign each cuisine into its individual tibble and find out how much data is available (rows, columns) per cuisine.\r\n", "\r\n", - "> A tibble is a modern reimagining of the data frame, keeping what time has proven to be effective, and throwing out what is not.\r\n", + "> A [tibble](https://tibble.tidyverse.org/) is a modern data frame.\r\n", "\r\n", "
\r\n", " % select(-1) %>% \n", - " # Transpose data to a long format\n", - " pivot_longer(!cuisine, names_to = \"ingredients\", values_to = \"count\") %>% \n", - " # Find the top most ingredients for a particular cuisine\n", - " group_by(ingredients) %>% \n", - " summarise(n_instances = sum(count)) %>% \n", - " filter(n_instances != 0) %>% \n", - " # Arrange by descending order\n", - " arrange(desc(n_instances)) %>% \n", - " mutate(ingredients = factor(ingredients) %>% fct_inorder())\n", - " \n", - " \n", - " return(ingredient_df)\n", + "# Creates a functions that returns the top ingredients by class\r\n", + "\r\n", + "create_ingredient <- function(df){\r\n", + " \r\n", + " # Drop the id column which is the first colum\r\n", + " ingredient_df = df %>% select(-1) %>% \r\n", + " # Transpose data to a long format\r\n", + " pivot_longer(!cuisine, names_to = \"ingredients\", values_to = \"count\") %>% \r\n", + " # Find the top most ingredients for a particular cuisine\r\n", + " group_by(ingredients) %>% \r\n", + " summarise(n_instances = sum(count)) %>% \r\n", + " filter(n_instances != 0) %>% \r\n", + " # Arrange by descending order\r\n", + " arrange(desc(n_instances)) %>% \r\n", + " mutate(ingredients = factor(ingredients) %>% fct_inorder())\r\n", + " \r\n", + " \r\n", + " return(ingredient_df)\r\n", "} # End of function" ], "outputs": [], @@ -343,10 +343,10 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Call create_ingredient and display popular ingredients\n", - "thai_ingredient_df <- create_ingredient(df = thai_df)\n", - "\n", - "thai_ingredient_df %>% \n", + "# Call create_ingredient and display popular ingredients\r\n", + "thai_ingredient_df <- create_ingredient(df = thai_df)\r\n", + "\r\n", + "thai_ingredient_df %>% \r\n", " slice_head(n = 10)" ], "outputs": [], @@ -367,11 +367,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Make a bar chart for popular thai cuisines\n", - "thai_ingredient_df %>% \n", - " slice_head(n = 10) %>% \n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"steelblue\") +\n", + "# Make a bar chart for popular thai cuisines\r\n", + "thai_ingredient_df %>% \r\n", + " slice_head(n = 10) %>% \r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"steelblue\") +\r\n", " xlab(\"\") + ylab(\"\")" ], "outputs": [], @@ -392,12 +392,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Get popular ingredients for Japanese cuisines and make bar chart\n", - "create_ingredient(df = japanese_df) %>% \n", - " slice_head(n = 10) %>%\n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"darkorange\", alpha = 0.8) +\n", - " xlab(\"\") + ylab(\"\")\n" + "# Get popular ingredients for Japanese cuisines and make bar chart\r\n", + "create_ingredient(df = japanese_df) %>% \r\n", + " slice_head(n = 10) %>%\r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"darkorange\", alpha = 0.8) +\r\n", + " xlab(\"\") + ylab(\"\")\r\n" ], "outputs": [], "metadata": { @@ -417,11 +417,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Get popular ingredients for Chinese cuisines and make bar chart\n", - "create_ingredient(df = chinese_df) %>% \n", - " slice_head(n = 10) %>%\n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"cyan4\", alpha = 0.8) +\n", + "# Get popular ingredients for Chinese cuisines and make bar chart\r\n", + "create_ingredient(df = chinese_df) %>% \r\n", + " slice_head(n = 10) %>%\r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"cyan4\", alpha = 0.8) +\r\n", " xlab(\"\") + ylab(\"\")" ], "outputs": [], @@ -442,11 +442,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Get popular ingredients for Indian cuisines and make bar chart\n", - "create_ingredient(df = indian_df) %>% \n", - " slice_head(n = 10) %>%\n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"#041E42FF\", alpha = 0.8) +\n", + "# Get popular ingredients for Indian cuisines and make bar chart\r\n", + "create_ingredient(df = indian_df) %>% \r\n", + " slice_head(n = 10) %>%\r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"#041E42FF\", alpha = 0.8) +\r\n", " xlab(\"\") + ylab(\"\")" ], "outputs": [], @@ -467,11 +467,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Get popular ingredients for Korean cuisines and make bar chart\n", - "create_ingredient(df = korean_df) %>% \n", - " slice_head(n = 10) %>%\n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"#852419FF\", alpha = 0.8) +\n", + "# Get popular ingredients for Korean cuisines and make bar chart\r\n", + "create_ingredient(df = korean_df) %>% \r\n", + " slice_head(n = 10) %>%\r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"#852419FF\", alpha = 0.8) +\r\n", " xlab(\"\") + ylab(\"\")" ], "outputs": [], @@ -494,12 +494,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Drop id column, rice, garlic and ginger from our original data set\n", - "df_select <- df %>% \n", - " select(-c(1, rice, garlic, ginger))\n", - "\n", - "# Display new data set\n", - "df_select %>% \n", + "# Drop id column, rice, garlic and ginger from our original data set\r\n", + "df_select <- df %>% \r\n", + " select(-c(1, rice, garlic, ginger))\r\n", + "\r\n", + "# Display new data set\r\n", + "df_select %>% \r\n", " slice_head(n = 5)" ], "outputs": [], @@ -510,16 +510,16 @@ { "cell_type": "markdown", "source": [ - "## Preprocessing data using recipes 👩🍳👨🍳 - Dealing with imbalanced data ⚖️\n", - "\n", - "
\n",
- " \n",
- "
\r\n",
+ " \r\n",
+ "
\n",
- " \n",
- "
\r\n",
+ " \r\n",
+ "