Update lesson_10-R.ipynb

4 years ago · 6dded01820
parent 74bf75f37f
commit 6dded01820
1 changed files with 130 additions and 130 deletions
--- a/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb
+++ b/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb
@ -103,8 +103,8 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "suppressWarnings(if (!require(\"pacman\"))install.packages(\"pacman\"))\n",
+        "suppressWarnings(if (!require(\"pacman\"))install.packages(\"pacman\"))\r\n",
-        "\n",
+        "\r\n",
        "pacman::p_load(tidyverse, tidymodels, DataExplorer, themis, here)"
      ],
      "outputs": [],
@ -138,12 +138,12 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Import data\n",
+        "# Import data\r\n",
-        "df <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/4-Classification/data/cuisines.csv\")\n",
+        "df <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/4-Classification/data/cuisines.csv\")\r\n",
-        "\n",
+        "\r\n",
-        "# View the first 5 rows\n",
+        "# View the first 5 rows\r\n",
-        "df %>% \n",
+        "df %>% \r\n",
-        "  slice_head(n = 5)\n"
+        "  slice_head(n = 5)\r\n"
      ],
      "outputs": [],
      "metadata": {
@ -218,7 +218,7 @@
        "\r\n",
        "Next, let's assign each cuisine into its individual tibble and find out how much data is available (rows, columns) per cuisine.\r\n",
        "\r\n",
-        "> A tibble is a modern reimagining of the data frame, keeping what time has proven to be effective, and throwing out what is not.\r\n",
+        "> A [tibble](https://tibble.tidyverse.org/) is a modern data frame.\r\n",
        "\r\n",
        "<p >\r\n",
        "   <img src=\"../../images/dplyr_filter.jpg\"\r\n",
@ -305,24 +305,24 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Creates a functions that returns the top ingredients by class\n",
+        "# Creates a function that returns the top ingredients by class\r\n",
-        "\n",
+        "\r\n",
-        "create_ingredient <- function(df){\n",
+        "create_ingredient <- function(df){\r\n",
-        "  \n",
+        "  \r\n",
-        "  # Drop the id column which is the first colum\n",
+        "  # Drop the id column which is the first column\r\n",
-        "  ingredient_df = df %>% select(-1) %>% \n",
+        "  ingredient_df = df %>% select(-1) %>% \r\n",
-        "  # Transpose data to a long format\n",
+        "  # Transpose data to a long format\r\n",
-        "    pivot_longer(!cuisine, names_to = \"ingredients\", values_to = \"count\") %>% \n",
+        "    pivot_longer(!cuisine, names_to = \"ingredients\", values_to = \"count\") %>% \r\n",
-        "  # Find the top most ingredients for a particular cuisine\n",
+        "  # Find the top most ingredients for a particular cuisine\r\n",
-        "    group_by(ingredients) %>% \n",
+        "    group_by(ingredients) %>% \r\n",
-        "    summarise(n_instances = sum(count)) %>% \n",
+        "    summarise(n_instances = sum(count)) %>% \r\n",
-        "    filter(n_instances != 0) %>% \n",
+        "    filter(n_instances != 0) %>% \r\n",
-        "  # Arrange by descending order\n",
+        "  # Arrange by descending order\r\n",
-        "    arrange(desc(n_instances)) %>% \n",
+        "    arrange(desc(n_instances)) %>% \r\n",
-        "    mutate(ingredients = factor(ingredients) %>% fct_inorder())\n",
+        "    mutate(ingredients = factor(ingredients) %>% fct_inorder())\r\n",
-        "  \n",
+        "  \r\n",
-        "  \n",
+        "  \r\n",
-        "  return(ingredient_df)\n",
+        "  return(ingredient_df)\r\n",
        "} # End of function"
      ],
      "outputs": [],
@ -343,10 +343,10 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Call create_ingredient and display popular ingredients\n",
+        "# Call create_ingredient and display popular ingredients\r\n",
-        "thai_ingredient_df <- create_ingredient(df = thai_df)\n",
+        "thai_ingredient_df <- create_ingredient(df = thai_df)\r\n",
-        "\n",
+        "\r\n",
-        "thai_ingredient_df %>% \n",
+        "thai_ingredient_df %>% \r\n",
        "  slice_head(n = 10)"
      ],
      "outputs": [],
@ -367,11 +367,11 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Make a bar chart for popular thai cuisines\n",
+        "# Make a bar chart of popular ingredients in Thai cuisine\r\n",
-        "thai_ingredient_df %>% \n",
+        "thai_ingredient_df %>% \r\n",
-        "  slice_head(n = 10) %>% \n",
+        "  slice_head(n = 10) %>% \r\n",
-        "  ggplot(aes(x = n_instances, y = ingredients)) +\n",
+        "  ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
-        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"steelblue\") +\n",
+        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"steelblue\") +\r\n",
        "  xlab(\"\") + ylab(\"\")"
      ],
      "outputs": [],
@ -392,12 +392,12 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Get popular ingredients for Japanese cuisines and make bar chart\n",
+        "# Get popular ingredients for Japanese cuisines and make bar chart\r\n",
-        "create_ingredient(df = japanese_df) %>% \n",
+        "create_ingredient(df = japanese_df) %>% \r\n",
-        "  slice_head(n = 10) %>%\n",
+        "  slice_head(n = 10) %>%\r\n",
-        "  ggplot(aes(x = n_instances, y = ingredients)) +\n",
+        "  ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
-        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"darkorange\", alpha = 0.8) +\n",
+        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"darkorange\", alpha = 0.8) +\r\n",
-        "  xlab(\"\") + ylab(\"\")\n"
+        "  xlab(\"\") + ylab(\"\")\r\n"
      ],
      "outputs": [],
      "metadata": {
@ -417,11 +417,11 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Get popular ingredients for Chinese cuisines and make bar chart\n",
+        "# Get popular ingredients for Chinese cuisines and make bar chart\r\n",
-        "create_ingredient(df = chinese_df) %>% \n",
+        "create_ingredient(df = chinese_df) %>% \r\n",
-        "  slice_head(n = 10) %>%\n",
+        "  slice_head(n = 10) %>%\r\n",
-        "  ggplot(aes(x = n_instances, y = ingredients)) +\n",
+        "  ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
-        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"cyan4\", alpha = 0.8) +\n",
+        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"cyan4\", alpha = 0.8) +\r\n",
        "  xlab(\"\") + ylab(\"\")"
      ],
      "outputs": [],
@ -442,11 +442,11 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Get popular ingredients for Indian cuisines and make bar chart\n",
+        "# Get popular ingredients for Indian cuisines and make bar chart\r\n",
-        "create_ingredient(df = indian_df) %>% \n",
+        "create_ingredient(df = indian_df) %>% \r\n",
-        "  slice_head(n = 10) %>%\n",
+        "  slice_head(n = 10) %>%\r\n",
-        "  ggplot(aes(x = n_instances, y = ingredients)) +\n",
+        "  ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
-        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"#041E42FF\", alpha = 0.8) +\n",
+        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"#041E42FF\", alpha = 0.8) +\r\n",
        "  xlab(\"\") + ylab(\"\")"
      ],
      "outputs": [],
@ -467,11 +467,11 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Get popular ingredients for Korean cuisines and make bar chart\n",
+        "# Get popular ingredients for Korean cuisines and make bar chart\r\n",
-        "create_ingredient(df = korean_df) %>% \n",
+        "create_ingredient(df = korean_df) %>% \r\n",
-        "  slice_head(n = 10) %>%\n",
+        "  slice_head(n = 10) %>%\r\n",
-        "  ggplot(aes(x = n_instances, y = ingredients)) +\n",
+        "  ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
-        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"#852419FF\", alpha = 0.8) +\n",
+        "  geom_bar(stat = \"identity\", width = 0.5, fill = \"#852419FF\", alpha = 0.8) +\r\n",
        "  xlab(\"\") + ylab(\"\")"
      ],
      "outputs": [],
@ -494,12 +494,12 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Drop id column, rice, garlic and ginger from our original data set\n",
+        "# Drop id column, rice, garlic and ginger from our original data set\r\n",
-        "df_select <- df %>% \n",
+        "df_select <- df %>% \r\n",
-        "  select(-c(1, rice, garlic, ginger))\n",
+        "  select(-c(1, rice, garlic, ginger))\r\n",
-        "\n",
+        "\r\n",
-        "# Display new data set\n",
+        "# Display new data set\r\n",
-        "df_select %>% \n",
+        "df_select %>% \r\n",
        "  slice_head(n = 5)"
      ],
      "outputs": [],
@ -510,16 +510,16 @@
    {
      "cell_type": "markdown",
      "source": [
-        "## Preprocessing data using recipes 👩‍🍳👨‍🍳 - Dealing with imbalanced data ⚖️\n",
+        "## Preprocessing data using recipes 👩‍🍳👨‍🍳 - Dealing with imbalanced data ⚖️\r\n",
-        "\n",
+        "\r\n",
-        "<p >\n",
+        "<p >\r\n",
-        "   <img src=\"../../images/recipes.png\"\n",
+        "   <img src=\"../../images/recipes.png\"\r\n",
-        "   width=\"600\"/>\n",
+        "   width=\"600\"/>\r\n",
-        "   <figcaption>Artwork by @allison_horst</figcaption>\n",
+        "   <figcaption>Artwork by @allison_horst</figcaption>\r\n",
-        "\n",
+        "\r\n",
-        "Given that this lesson is about cuisines, we have to put `recipes` into context .\n",
+        "Given that this lesson is about cuisines, we have to put `recipes` into context.\r\n",
-        "\n",
+        "\r\n",
-        "Tidymodels provides yet another neat package: `recipes`- a package for preprocessing data.\n"
+        "Tidymodels provides yet another neat package: `recipes` - a package for preprocessing data.\r\n"
      ],
      "metadata": {
        "id": "kkFd-JxdIaL6"
@ -538,11 +538,11 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Distribution of cuisines\n",
+        "# Distribution of cuisines\r\n",
-        "old_label_count <- df_select %>% \n",
+        "old_label_count <- df_select %>% \r\n",
-        "  count(cuisine) %>% \n",
+        "  count(cuisine) %>% \r\n",
-        "  arrange(desc(n))\n",
+        "  arrange(desc(n))\r\n",
-        "\n",
+        "\r\n",
        "old_label_count"
      ],
      "outputs": [],
@ -572,13 +572,13 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Load themis package for dealing with imbalanced data\n",
+        "# Load themis package for dealing with imbalanced data\r\n",
-        "library(themis)\n",
+        "library(themis)\r\n",
-        "\n",
+        "\r\n",
-        "# Create a recipe for preprocessing data\n",
+        "# Create a recipe for preprocessing data\r\n",
-        "cuisines_recipe <- recipe(cuisine ~ ., data = df_select) %>% \n",
+        "cuisines_recipe <- recipe(cuisine ~ ., data = df_select) %>% \r\n",
-        "  step_smote(cuisine)\n",
+        "  step_smote(cuisine)\r\n",
-        "\n",
+        "\r\n",
        "cuisines_recipe"
      ],
      "outputs": [],
@ -609,18 +609,18 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Prep and bake the recipe\n",
+        "# Prep and bake the recipe\r\n",
-        "preprocessed_df <- cuisines_recipe %>% \n",
+        "preprocessed_df <- cuisines_recipe %>% \r\n",
-        "  prep() %>% \n",
+        "  prep() %>% \r\n",
-        "  bake(new_data = NULL) %>% \n",
+        "  bake(new_data = NULL) %>% \r\n",
-        "  relocate(cuisine)\n",
+        "  relocate(cuisine)\r\n",
-        "\n",
+        "\r\n",
-        "# Display data\n",
+        "# Display data\r\n",
-        "preprocessed_df %>% \n",
+        "preprocessed_df %>% \r\n",
-        "  slice_head(n = 5)\n",
+        "  slice_head(n = 5)\r\n",
-        "\n",
+        "\r\n",
-        "# Quick summary stats\n",
+        "# Quick summary stats\r\n",
-        "preprocessed_df %>% \n",
+        "preprocessed_df %>% \r\n",
        "  introduce()"
      ],
      "outputs": [],
@ -641,12 +641,12 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Distribution of cuisines\n",
+        "# Distribution of cuisines\r\n",
-        "new_label_count <- preprocessed_df %>% \n",
+        "new_label_count <- preprocessed_df %>% \r\n",
-        "  count(cuisine) %>% \n",
+        "  count(cuisine) %>% \r\n",
-        "  arrange(desc(n))\n",
+        "  arrange(desc(n))\r\n",
-        "\n",
+        "\r\n",
-        "list(new_label_count = new_label_count,\n",
+        "list(new_label_count = new_label_count,\r\n",
        "     old_label_count = old_label_count)"
      ],
      "outputs": [],
@ -675,7 +675,7 @@
      "cell_type": "code",
      "execution_count": null,
      "source": [
-        "# Save preprocessed data\n",
+        "# Save preprocessed data\r\n",
        "write_csv(preprocessed_df, \"../../../data/cleaned_cuisines_R.csv\")"
      ],
      "outputs": [],
@ -686,32 +686,32 @@
    {
      "cell_type": "markdown",
      "source": [
-        "This fresh CSV can now be found in the root data folder.\n",
+        "This fresh CSV can now be found in the root data folder.\r\n",
-        "\n",
+        "\r\n",
-        "**🚀Challenge**\n",
+        "**🚀Challenge**\r\n",
-        "\n",
+        "\r\n",
-        "This curriculum contains several interesting datasets. Dig through the `data` folders and see if any contain datasets that would be appropriate for binary or multi-class classification? What questions would you ask of this dataset?\n",
+        "This curriculum contains several interesting datasets. Dig through the `data` folders and see if any contain datasets that would be appropriate for binary or multi-class classification. What questions would you ask of this dataset?\r\n",
-        "\n",
+        "\r\n",
-        "## [**Post-lecture quiz**](https://white-water-09ec41f0f.azurestaticapps.net/quiz/20/)\n",
+        "## [**Post-lecture quiz**](https://white-water-09ec41f0f.azurestaticapps.net/quiz/20/)\r\n",
-        "\n",
+        "\r\n",
-        "## **Review & Self Study**\n",
+        "## **Review & Self Study**\r\n",
-        "\n",
+        "\r\n",
-        "-   Check out [package themis](https://github.com/tidymodels/themis). What other techniques could we use to deal with imbalanced data?\n",
+        "-   Check out [package themis](https://github.com/tidymodels/themis). What other techniques could we use to deal with imbalanced data?\r\n",
-        "\n",
+        "\r\n",
-        "-   Tidy models [reference website](https://www.tidymodels.org/start/).\n",
+        "-   Tidymodels [reference website](https://www.tidymodels.org/start/).\r\n",
-        "\n",
+        "\r\n",
-        "-   H. Wickham and G. Grolemund, [*R for Data Science: Visualize, Model, Transform, Tidy, and Import Data*](https://r4ds.had.co.nz/).\n",
+        "-   H. Wickham and G. Grolemund, [*R for Data Science: Visualize, Model, Transform, Tidy, and Import Data*](https://r4ds.had.co.nz/).\r\n",
-        "\n",
+        "\r\n",
-        "#### THANK YOU TO:\n",
+        "#### THANK YOU TO:\r\n",
-        "\n",
+        "\r\n",
-        "[`Allison Horst`](https://twitter.com/allison_horst/) for creating the amazing illustrations that make R more welcoming and engaging. Find more illustrations at her [gallery](https://www.google.com/url?q=https://github.com/allisonhorst/stats-illustrations&sa=D&source=editors&ust=1626380772530000&usg=AOvVaw3zcfyCizFQZpkSLzxiiQEM).\n",
+        "[`Allison Horst`](https://twitter.com/allison_horst/) for creating the amazing illustrations that make R more welcoming and engaging. Find more illustrations at her [gallery](https://github.com/allisonhorst/stats-illustrations).\r\n",
-        "\n",
+        "\r\n",
-        "[Cassie Breviu](https://www.twitter.com/cassieview) and [Jen Looper](https://www.twitter.com/jenlooper) for creating the original Python version of this module ♥️\n",
+        "[Cassie Breviu](https://www.twitter.com/cassieview) and [Jen Looper](https://www.twitter.com/jenlooper) for creating the original Python version of this module ♥️\r\n",
-        "\n",
+        "\r\n",
-        "<p >\n",
+        "<p >\r\n",
-        "   <img src=\"../../images/r_learners_sm.jpeg\"\n",
+        "   <img src=\"../../images/r_learners_sm.jpeg\"\r\n",
-        "   width=\"600\"/>\n",
+        "   width=\"600\"/>\r\n",
-        "   <figcaption>Artwork by @allison_horst</figcaption>\n"
+        "   <figcaption>Artwork by @allison_horst</figcaption>\r\n"
      ],
      "metadata": {
        "id": "WQs5621pMGwf"