diff --git a/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb b/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb index 87c33b5ea..4592429f9 100644 --- a/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb +++ b/4-Classification/1-Introduction/solution/R/lesson_10-R.ipynb @@ -103,8 +103,8 @@ "cell_type": "code", "execution_count": null, "source": [ - "suppressWarnings(if (!require(\"pacman\"))install.packages(\"pacman\"))\n", - "\n", + "suppressWarnings(if (!require(\"pacman\"))install.packages(\"pacman\"))\r\n", + "\r\n", "pacman::p_load(tidyverse, tidymodels, DataExplorer, themis, here)" ], "outputs": [], @@ -138,12 +138,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "# Import data\n", - "df <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/4-Classification/data/cuisines.csv\")\n", - "\n", - "# View the first 5 rows\n", - "df %>% \n", - " slice_head(n = 5)\n" + "# Import data\r\n", + "df <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/4-Classification/data/cuisines.csv\")\r\n", + "\r\n", + "# View the first 5 rows\r\n", + "df %>% \r\n", + " slice_head(n = 5)\r\n" ], "outputs": [], "metadata": { @@ -218,7 +218,7 @@ "\r\n", "Next, let's assign each cuisine into its individual tibble and find out how much data is available (rows, columns) per cuisine.\r\n", "\r\n", - "> A tibble is a modern reimagining of the data frame, keeping what time has proven to be effective, and throwing out what is not.\r\n", + "> A [tibble](https://tibble.tidyverse.org/) is a modern data frame.\r\n", "\r\n", "
\r\n",
"
% select(-1) %>% \n",
- " # Transpose data to a long format\n",
- " pivot_longer(!cuisine, names_to = \"ingredients\", values_to = \"count\") %>% \n",
- " # Find the top most ingredients for a particular cuisine\n",
- " group_by(ingredients) %>% \n",
- " summarise(n_instances = sum(count)) %>% \n",
- " filter(n_instances != 0) %>% \n",
- " # Arrange by descending order\n",
- " arrange(desc(n_instances)) %>% \n",
- " mutate(ingredients = factor(ingredients) %>% fct_inorder())\n",
- " \n",
- " \n",
- " return(ingredient_df)\n",
+ "# Creates a function that returns the top ingredients by class\r\n",
+ "\r\n",
+ "create_ingredient <- function(df){\r\n",
+ " \r\n",
+ " # Drop the id column which is the first column\r\n",
+ " ingredient_df = df %>% select(-1) %>% \r\n",
+ " # Transpose data to a long format\r\n",
+ " pivot_longer(!cuisine, names_to = \"ingredients\", values_to = \"count\") %>% \r\n",
+ " # Find the top most ingredients for a particular cuisine\r\n",
+ " group_by(ingredients) %>% \r\n",
+ " summarise(n_instances = sum(count)) %>% \r\n",
+ " filter(n_instances != 0) %>% \r\n",
+ " # Arrange by descending order\r\n",
+ " arrange(desc(n_instances)) %>% \r\n",
+ " mutate(ingredients = factor(ingredients) %>% fct_inorder())\r\n",
+ " \r\n",
+ " \r\n",
+ " return(ingredient_df)\r\n",
"} # End of function"
],
"outputs": [],
@@ -343,10 +343,10 @@
"cell_type": "code",
"execution_count": null,
"source": [
- "# Call create_ingredient and display popular ingredients\n",
- "thai_ingredient_df <- create_ingredient(df = thai_df)\n",
- "\n",
- "thai_ingredient_df %>% \n",
+ "# Call create_ingredient and display popular ingredients\r\n",
+ "thai_ingredient_df <- create_ingredient(df = thai_df)\r\n",
+ "\r\n",
+ "thai_ingredient_df %>% \r\n",
" slice_head(n = 10)"
],
"outputs": [],
@@ -367,11 +367,11 @@
"cell_type": "code",
"execution_count": null,
"source": [
- "# Make a bar chart for popular thai cuisines\n",
- "thai_ingredient_df %>% \n",
- " slice_head(n = 10) %>% \n",
- " ggplot(aes(x = n_instances, y = ingredients)) +\n",
- " geom_bar(stat = \"identity\", width = 0.5, fill = \"steelblue\") +\n",
+ "# Make a bar chart for popular Thai ingredients\r\n",
+ "thai_ingredient_df %>% \r\n",
+ " slice_head(n = 10) %>% \r\n",
+ " ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
+ " geom_bar(stat = \"identity\", width = 0.5, fill = \"steelblue\") +\r\n",
" xlab(\"\") + ylab(\"\")"
],
"outputs": [],
@@ -392,12 +392,12 @@
"cell_type": "code",
"execution_count": null,
"source": [
- "# Get popular ingredients for Japanese cuisines and make bar chart\n",
- "create_ingredient(df = japanese_df) %>% \n",
- " slice_head(n = 10) %>%\n",
- " ggplot(aes(x = n_instances, y = ingredients)) +\n",
- " geom_bar(stat = \"identity\", width = 0.5, fill = \"darkorange\", alpha = 0.8) +\n",
- " xlab(\"\") + ylab(\"\")\n"
+ "# Get popular ingredients for Japanese cuisines and make bar chart\r\n",
+ "create_ingredient(df = japanese_df) %>% \r\n",
+ " slice_head(n = 10) %>%\r\n",
+ " ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
+ " geom_bar(stat = \"identity\", width = 0.5, fill = \"darkorange\", alpha = 0.8) +\r\n",
+ " xlab(\"\") + ylab(\"\")\r\n"
],
"outputs": [],
"metadata": {
@@ -417,11 +417,11 @@
"cell_type": "code",
"execution_count": null,
"source": [
- "# Get popular ingredients for Chinese cuisines and make bar chart\n",
- "create_ingredient(df = chinese_df) %>% \n",
- " slice_head(n = 10) %>%\n",
- " ggplot(aes(x = n_instances, y = ingredients)) +\n",
- " geom_bar(stat = \"identity\", width = 0.5, fill = \"cyan4\", alpha = 0.8) +\n",
+ "# Get popular ingredients for Chinese cuisines and make bar chart\r\n",
+ "create_ingredient(df = chinese_df) %>% \r\n",
+ " slice_head(n = 10) %>%\r\n",
+ " ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
+ " geom_bar(stat = \"identity\", width = 0.5, fill = \"cyan4\", alpha = 0.8) +\r\n",
" xlab(\"\") + ylab(\"\")"
],
"outputs": [],
@@ -442,11 +442,11 @@
"cell_type": "code",
"execution_count": null,
"source": [
- "# Get popular ingredients for Indian cuisines and make bar chart\n",
- "create_ingredient(df = indian_df) %>% \n",
- " slice_head(n = 10) %>%\n",
- " ggplot(aes(x = n_instances, y = ingredients)) +\n",
- " geom_bar(stat = \"identity\", width = 0.5, fill = \"#041E42FF\", alpha = 0.8) +\n",
+ "# Get popular ingredients for Indian cuisines and make bar chart\r\n",
+ "create_ingredient(df = indian_df) %>% \r\n",
+ " slice_head(n = 10) %>%\r\n",
+ " ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
+ " geom_bar(stat = \"identity\", width = 0.5, fill = \"#041E42FF\", alpha = 0.8) +\r\n",
" xlab(\"\") + ylab(\"\")"
],
"outputs": [],
@@ -467,11 +467,11 @@
"cell_type": "code",
"execution_count": null,
"source": [
- "# Get popular ingredients for Korean cuisines and make bar chart\n",
- "create_ingredient(df = korean_df) %>% \n",
- " slice_head(n = 10) %>%\n",
- " ggplot(aes(x = n_instances, y = ingredients)) +\n",
- " geom_bar(stat = \"identity\", width = 0.5, fill = \"#852419FF\", alpha = 0.8) +\n",
+ "# Get popular ingredients for Korean cuisines and make bar chart\r\n",
+ "create_ingredient(df = korean_df) %>% \r\n",
+ " slice_head(n = 10) %>%\r\n",
+ " ggplot(aes(x = n_instances, y = ingredients)) +\r\n",
+ " geom_bar(stat = \"identity\", width = 0.5, fill = \"#852419FF\", alpha = 0.8) +\r\n",
" xlab(\"\") + ylab(\"\")"
],
"outputs": [],
@@ -494,12 +494,12 @@
"cell_type": "code",
"execution_count": null,
"source": [
- "# Drop id column, rice, garlic and ginger from our original data set\n",
- "df_select <- df %>% \n",
- " select(-c(1, rice, garlic, ginger))\n",
- "\n",
- "# Display new data set\n",
- "df_select %>% \n",
+ "# Drop id column, rice, garlic and ginger from our original data set\r\n",
+ "df_select <- df %>% \r\n",
+ " select(-c(1, rice, garlic, ginger))\r\n",
+ "\r\n",
+ "# Display new data set\r\n",
+ "df_select %>% \r\n",
" slice_head(n = 5)"
],
"outputs": [],
@@ -510,16 +510,16 @@
{
"cell_type": "markdown",
"source": [
- "## Preprocessing data using recipes 👩🍳👨🍳 - Dealing with imbalanced data ⚖️\n",
- "\n",
- "
\n",
- "
\n",
- "
\r\n",
+ "
\r\n",
+ "
\n",
- "
\n",
- "
\r\n",
+ "
\r\n",
+ "