diff --git a/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb b/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb index 87c33b5e..4592429f 100644 --- a/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb +++ b/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb @@ -103,8 +103,8 @@ "cell_type": "code", "execution_count": null, "source": [ - "suppressWarnings(if (!require(\"pacman\"))install.packages(\"pacman\"))\n", - "\n", + "suppressWarnings(if (!require(\"pacman\"))install.packages(\"pacman\"))\r\n", + "\r\n", "pacman::p_load(tidyverse, tidymodels, DataExplorer, themis, here)" ], "outputs": [], @@ -138,12 +138,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Import data\n", - "df <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/4-Classification/data/cuisines.csv\")\n", - "\n", - "# View the first 5 rows\n", - "df %>% \n", - " slice_head(n = 5)\n" + "# Import data\r\n", + "df <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/4-Classification/data/cuisines.csv\")\r\n", + "\r\n", + "# View the first 5 rows\r\n", + "df %>% \r\n", + " slice_head(n = 5)\r\n" ], "outputs": [], "metadata": { @@ -218,7 +218,7 @@ "\r\n", "Next, let's assign each cuisine into its individual tibble and find out how much data is available (rows, columns) per cuisine.\r\n", "\r\n", - "> A tibble is a modern reimagining of the data frame, keeping what time has proven to be effective, and throwing out what is not.\r\n", + "> A [tibble](https://tibble.tidyverse.org/) is a modern data frame.\r\n", "\r\n", "

\r\n", " % select(-1) %>% \n", - " # Transpose data to a long format\n", - " pivot_longer(!cuisine, names_to = \"ingredients\", values_to = \"count\") %>% \n", - " # Find the top most ingredients for a particular cuisine\n", - " group_by(ingredients) %>% \n", - " summarise(n_instances = sum(count)) %>% \n", - " filter(n_instances != 0) %>% \n", - " # Arrange by descending order\n", - " arrange(desc(n_instances)) %>% \n", - " mutate(ingredients = factor(ingredients) %>% fct_inorder())\n", - " \n", - " \n", - " return(ingredient_df)\n", + "# Creates a functions that returns the top ingredients by class\r\n", + "\r\n", + "create_ingredient <- function(df){\r\n", + " \r\n", + " # Drop the id column which is the first colum\r\n", + " ingredient_df = df %>% select(-1) %>% \r\n", + " # Transpose data to a long format\r\n", + " pivot_longer(!cuisine, names_to = \"ingredients\", values_to = \"count\") %>% \r\n", + " # Find the top most ingredients for a particular cuisine\r\n", + " group_by(ingredients) %>% \r\n", + " summarise(n_instances = sum(count)) %>% \r\n", + " filter(n_instances != 0) %>% \r\n", + " # Arrange by descending order\r\n", + " arrange(desc(n_instances)) %>% \r\n", + " mutate(ingredients = factor(ingredients) %>% fct_inorder())\r\n", + " \r\n", + " \r\n", + " return(ingredient_df)\r\n", "} # End of function" ], "outputs": [], @@ -343,10 +343,10 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Call create_ingredient and display popular ingredients\n", - "thai_ingredient_df <- create_ingredient(df = thai_df)\n", - "\n", - "thai_ingredient_df %>% \n", + "# Call create_ingredient and display popular ingredients\r\n", + "thai_ingredient_df <- create_ingredient(df = thai_df)\r\n", + "\r\n", + "thai_ingredient_df %>% \r\n", " slice_head(n = 10)" ], "outputs": [], @@ -367,11 +367,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Make a bar chart for popular thai cuisines\n", - "thai_ingredient_df %>% \n", - " slice_head(n = 10) %>% \n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"steelblue\") +\n", + "# Make a bar chart for popular thai cuisines\r\n", + "thai_ingredient_df %>% \r\n", + " slice_head(n = 10) %>% \r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"steelblue\") +\r\n", " xlab(\"\") + ylab(\"\")" ], "outputs": [], @@ -392,12 +392,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Get popular ingredients for Japanese cuisines and make bar chart\n", - "create_ingredient(df = japanese_df) %>% \n", - " slice_head(n = 10) %>%\n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"darkorange\", alpha = 0.8) +\n", - " xlab(\"\") + ylab(\"\")\n" + "# Get popular ingredients for Japanese cuisines and make bar chart\r\n", + "create_ingredient(df = japanese_df) %>% \r\n", + " slice_head(n = 10) %>%\r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"darkorange\", alpha = 0.8) +\r\n", + " xlab(\"\") + ylab(\"\")\r\n" ], "outputs": [], "metadata": { @@ -417,11 +417,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Get popular ingredients for Chinese cuisines and make bar chart\n", - "create_ingredient(df = chinese_df) %>% \n", - " slice_head(n = 10) %>%\n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"cyan4\", alpha = 0.8) +\n", + "# Get popular ingredients for Chinese cuisines and make bar chart\r\n", + "create_ingredient(df = chinese_df) %>% \r\n", + " slice_head(n = 10) %>%\r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"cyan4\", alpha = 0.8) +\r\n", " xlab(\"\") + ylab(\"\")" ], "outputs": [], @@ -442,11 +442,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Get popular ingredients for Indian cuisines and make bar chart\n", - "create_ingredient(df = indian_df) %>% \n", - " slice_head(n = 10) %>%\n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"#041E42FF\", alpha = 0.8) +\n", + "# Get popular ingredients for Indian cuisines and make bar chart\r\n", + "create_ingredient(df = indian_df) %>% \r\n", + " slice_head(n = 10) %>%\r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"#041E42FF\", alpha = 0.8) +\r\n", " xlab(\"\") + ylab(\"\")" ], "outputs": [], @@ -467,11 +467,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Get popular ingredients for Korean cuisines and make bar chart\n", - "create_ingredient(df = korean_df) %>% \n", - " slice_head(n = 10) %>%\n", - " ggplot(aes(x = n_instances, y = ingredients)) +\n", - " geom_bar(stat = \"identity\", width = 0.5, fill = \"#852419FF\", alpha = 0.8) +\n", + "# Get popular ingredients for Korean cuisines and make bar chart\r\n", + "create_ingredient(df = korean_df) %>% \r\n", + " slice_head(n = 10) %>%\r\n", + " ggplot(aes(x = n_instances, y = ingredients)) +\r\n", + " geom_bar(stat = \"identity\", width = 0.5, fill = \"#852419FF\", alpha = 0.8) +\r\n", " xlab(\"\") + ylab(\"\")" ], "outputs": [], @@ -494,12 +494,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Drop id column, rice, garlic and ginger from our original data set\n", - "df_select <- df %>% \n", - " select(-c(1, rice, garlic, ginger))\n", - "\n", - "# Display new data set\n", - "df_select %>% \n", + "# Drop id column, rice, garlic and ginger from our original data set\r\n", + "df_select <- df %>% \r\n", + " select(-c(1, rice, garlic, ginger))\r\n", + "\r\n", + "# Display new data set\r\n", + "df_select %>% \r\n", " slice_head(n = 5)" ], "outputs": [], @@ -510,16 +510,16 @@ { "cell_type": "markdown", "source": [ - "## Preprocessing data using recipes 👩‍🍳👨‍🍳 - Dealing with imbalanced data ⚖️\n", - "\n", - "

\n", - " \n", - "

Artwork by @allison_horst
\n", - "\n", - "Given that this lesson is about cuisines, we have to put `recipes` into context .\n", - "\n", - "Tidymodels provides yet another neat package: `recipes`- a package for preprocessing data.\n" + "## Preprocessing data using recipes 👩‍🍳👨‍🍳 - Dealing with imbalanced data ⚖️\r\n", + "\r\n", + "

\r\n", + " \r\n", + "

Artwork by @allison_horst
\r\n", + "\r\n", + "Given that this lesson is about cuisines, we have to put `recipes` into context .\r\n", + "\r\n", + "Tidymodels provides yet another neat package: `recipes`- a package for preprocessing data.\r\n" ], "metadata": { "id": "kkFd-JxdIaL6" @@ -538,11 +538,11 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Distribution of cuisines\n", - "old_label_count <- df_select %>% \n", - " count(cuisine) %>% \n", - " arrange(desc(n))\n", - "\n", + "# Distribution of cuisines\r\n", + "old_label_count <- df_select %>% \r\n", + " count(cuisine) %>% \r\n", + " arrange(desc(n))\r\n", + "\r\n", "old_label_count" ], "outputs": [], @@ -572,13 +572,13 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Load themis package for dealing with imbalanced data\n", - "library(themis)\n", - "\n", - "# Create a recipe for preprocessing data\n", - "cuisines_recipe <- recipe(cuisine ~ ., data = df_select) %>% \n", - " step_smote(cuisine)\n", - "\n", + "# Load themis package for dealing with imbalanced data\r\n", + "library(themis)\r\n", + "\r\n", + "# Create a recipe for preprocessing data\r\n", + "cuisines_recipe <- recipe(cuisine ~ ., data = df_select) %>% \r\n", + " step_smote(cuisine)\r\n", + "\r\n", "cuisines_recipe" ], "outputs": [], @@ -609,18 +609,18 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Prep and bake the recipe\n", - "preprocessed_df <- cuisines_recipe %>% \n", - " prep() %>% \n", - " bake(new_data = NULL) %>% \n", - " relocate(cuisine)\n", - "\n", - "# Display data\n", - "preprocessed_df %>% \n", - " slice_head(n = 5)\n", - "\n", - "# Quick summary stats\n", - "preprocessed_df %>% \n", + "# Prep and bake the recipe\r\n", + "preprocessed_df <- cuisines_recipe %>% \r\n", + " prep() %>% \r\n", + " bake(new_data = NULL) %>% \r\n", + " relocate(cuisine)\r\n", + "\r\n", + "# Display data\r\n", + "preprocessed_df %>% \r\n", + " slice_head(n = 5)\r\n", + "\r\n", + "# Quick summary stats\r\n", + "preprocessed_df %>% \r\n", " introduce()" ], "outputs": [], @@ -641,12 +641,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Distribution of cuisines\n", - "new_label_count <- preprocessed_df %>% \n", - " count(cuisine) %>% \n", - " arrange(desc(n))\n", - "\n", - "list(new_label_count = new_label_count,\n", + "# Distribution of cuisines\r\n", + "new_label_count <- preprocessed_df %>% \r\n", + " count(cuisine) %>% \r\n", + " arrange(desc(n))\r\n", + "\r\n", + "list(new_label_count = new_label_count,\r\n", " old_label_count = old_label_count)" ], "outputs": [], @@ -675,7 +675,7 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Save preprocessed data\n", + "# Save preprocessed data\r\n", "write_csv(preprocessed_df, \"../../../data/cleaned_cuisines_R.csv\")" ], "outputs": [], @@ -686,32 +686,32 @@ { "cell_type": "markdown", "source": [ - "This fresh CSV can now be found in the root data folder.\n", - "\n", - "**🚀Challenge**\n", - "\n", - "This curriculum contains several interesting datasets. Dig through the `data` folders and see if any contain datasets that would be appropriate for binary or multi-class classification? What questions would you ask of this dataset?\n", - "\n", - "## [**Post-lecture quiz**](https://white-water-09ec41f0f.azurestaticapps.net/quiz/20/)\n", - "\n", - "## **Review & Self Study**\n", - "\n", - "- Check out [package themis](https://github.com/tidymodels/themis). What other techniques could we use to deal with imbalanced data?\n", - "\n", - "- Tidy models [reference website](https://www.tidymodels.org/start/).\n", - "\n", - "- H. Wickham and G. Grolemund, [*R for Data Science: Visualize, Model, Transform, Tidy, and Import Data*](https://r4ds.had.co.nz/).\n", - "\n", - "#### THANK YOU TO:\n", - "\n", - "[`Allison Horst`](https://twitter.com/allison_horst/) for creating the amazing illustrations that make R more welcoming and engaging. Find more illustrations at her [gallery](https://www.google.com/url?q=https://github.com/allisonhorst/stats-illustrations&sa=D&source=editors&ust=1626380772530000&usg=AOvVaw3zcfyCizFQZpkSLzxiiQEM).\n", - "\n", - "[Cassie Breviu](https://www.twitter.com/cassieview) and [Jen Looper](https://www.twitter.com/jenlooper) for creating the original Python version of this module ♥️\n", - "\n", - "

\n", - " \n", - "

Artwork by @allison_horst
\n" + "This fresh CSV can now be found in the root data folder.\r\n", + "\r\n", + "**🚀Challenge**\r\n", + "\r\n", + "This curriculum contains several interesting datasets. Dig through the `data` folders and see if any contain datasets that would be appropriate for binary or multi-class classification? What questions would you ask of this dataset?\r\n", + "\r\n", + "## [**Post-lecture quiz**](https://white-water-09ec41f0f.azurestaticapps.net/quiz/20/)\r\n", + "\r\n", + "## **Review & Self Study**\r\n", + "\r\n", + "- Check out [package themis](https://github.com/tidymodels/themis). What other techniques could we use to deal with imbalanced data?\r\n", + "\r\n", + "- Tidy models [reference website](https://www.tidymodels.org/start/).\r\n", + "\r\n", + "- H. Wickham and G. Grolemund, [*R for Data Science: Visualize, Model, Transform, Tidy, and Import Data*](https://r4ds.had.co.nz/).\r\n", + "\r\n", + "#### THANK YOU TO:\r\n", + "\r\n", + "[`Allison Horst`](https://twitter.com/allison_horst/) for creating the amazing illustrations that make R more welcoming and engaging. Find more illustrations at her [gallery](https://www.google.com/url?q=https://github.com/allisonhorst/stats-illustrations&sa=D&source=editors&ust=1626380772530000&usg=AOvVaw3zcfyCizFQZpkSLzxiiQEM).\r\n", + "\r\n", + "[Cassie Breviu](https://www.twitter.com/cassieview) and [Jen Looper](https://www.twitter.com/jenlooper) for creating the original Python version of this module ♥️\r\n", + "\r\n", + "

\r\n", + " \r\n", + "

Artwork by @allison_horst
\r\n" ], "metadata": { "id": "WQs5621pMGwf"