From de35c7e78b5681be889ea66ee1a82720dfa03a92 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 3 Oct 2025 13:30:35 +0000
Subject: [PATCH 02/10] Add real-world data quality checks to data cleaning
 lesson

Co-authored-by: leestott <2511341+leestott@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb        | 502 +++++++++++++++++-
 1 file changed, 501 insertions(+), 1 deletion(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 31898613..2a3ef0fe 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3687,6 +3687,506 @@
    "source": [
     "> **Takeaway:** Removing duplicate data is an essential part of almost every data-science project. Duplicate data can change the results of your analyses and give you inaccurate results!"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Real-World Data Quality Checks\n",
+    "\n",
+    "> **Learning goal:** By the end of this section, you should be comfortable detecting and correcting common real-world data quality issues including inconsistent categorical values, abnormal numeric values (outliers), and duplicate entities with variations.\n",
+    "\n",
+    "While missing values and exact duplicates are common issues, real-world datasets often contain more subtle problems:\n",
+    "\n",
+    "1. **Inconsistent categorical values**: The same category spelled differently (e.g., \"USA\", \"U.S.A\", \"United States\")\n",
+    "2. **Abnormal numeric values**: Extreme outliers that indicate data entry errors (e.g., age = 999)\n",
+    "3. **Near-duplicate rows**: Records that represent the same entity with slight variations\n",
+    "\n",
+    "Let's explore techniques to detect and handle these issues."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Creating a Sample \"Dirty\" Dataset\n",
+    "\n",
+    "First, let's create a sample dataset that contains the types of issues we commonly encounter in real-world data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Create a sample dataset with quality issues\n",
+    "dirty_data = pd.DataFrame({\n",
+    "    'customer_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],\n",
+    "    'name': ['John Smith', 'Jane Doe', 'John Smith', 'Bob Johnson', \n",
+    "             'Alice Williams', 'Charlie Brown', 'John Smith', 'Eva Martinez',\n",
+    "             'Bob Johnson', 'Diana Prince', 'Frank Castle', 'Alice Williams'],\n",
+    "    'age': [25, 32, 25, 45, 28, 199, 25, 31, 45, 27, -5, 28],\n",
+    "    'country': ['USA', 'UK', 'U.S.A', 'Canada', 'USA', 'United Kingdom',\n",
+    "                'United States', 'Mexico', 'canada', 'USA', 'UK', 'usa'],\n",
+    "    'purchase_amount': [100.50, 250.00, 105.00, 320.00, 180.00, 90.00,\n",
+    "                        102.00, 275.00, 325.00, 195.00, 410.00, 185.00]\n",
+    "})\n",
+    "\n",
+    "print(\"Sample 'Dirty' Dataset:\")\n",
+    "print(dirty_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Detecting Inconsistent Categorical Values\n",
+    "\n",
+    "Notice the `country` column has multiple representations for the same countries. Let's identify these inconsistencies:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check unique values in the country column\n",
+    "print(\"Unique country values:\")\n",
+    "print(dirty_data['country'].unique())\n",
+    "print(f\"\\nTotal unique values: {dirty_data['country'].nunique()}\")\n",
+    "\n",
+    "# Count occurrences of each variation\n",
+    "print(\"\\nValue counts:\")\n",
+    "print(dirty_data['country'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Standardizing Categorical Values\n",
+    "\n",
+    "We can create a mapping to standardize these values. A simple approach is to convert to lowercase and create a mapping dictionary:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a standardization mapping\n",
+    "country_mapping = {\n",
+    "    'usa': 'USA',\n",
+    "    'u.s.a': 'USA',\n",
+    "    'united states': 'USA',\n",
+    "    'uk': 'UK',\n",
+    "    'united kingdom': 'UK',\n",
+    "    'canada': 'Canada',\n",
+    "    'mexico': 'Mexico'\n",
+    "}\n",
+    "\n",
+    "# Standardize the country column\n",
+    "dirty_data['country_clean'] = dirty_data['country'].str.lower().map(country_mapping)\n",
+    "\n",
+    "print(\"Before standardization:\")\n",
+    "print(dirty_data[['country']].value_counts())\n",
+    "print(\"\\nAfter standardization:\")\n",
+    "print(dirty_data[['country_clean']].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Alternative: Using Fuzzy Matching**\n",
+    "\n",
+    "For more complex cases, we can use fuzzy string matching with the `rapidfuzz` library to automatically detect similar strings:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Note: You may need to install rapidfuzz: pip install rapidfuzz\n",
+    "from rapidfuzz import process, fuzz\n",
+    "\n",
+    "# Get unique countries\n",
+    "unique_countries = dirty_data['country'].unique()\n",
+    "\n",
+    "# For each country, find similar matches\n",
+    "print(\"Finding similar country names (similarity > 70%):\")\n",
+    "for country in unique_countries:\n",
+    "    matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
+    "    # Filter matches with similarity > 70 and not identical\n",
+    "    similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
+    "    if similar:\n",
+    "        print(f\"\\n'{country}' is similar to:\")\n",
+    "        for match, score, _ in similar:\n",
+    "            print(f\"  - '{match}' (similarity: {score}%)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Detecting Abnormal Numeric Values (Outliers)\n",
+    "\n",
+    "Looking at the `age` column, we have some suspicious values like 199 and -5. Let's use statistical methods to detect these outliers.\n",
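+    "\n",
+    "Before applying any formulas, a quick visual check is often the fastest way to spot trouble. The following is a minimal sketch, assuming `matplotlib` is installed in your environment:\n",
+    "\n",
+    "```python\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# A boxplot makes extreme ages such as 199 and -5 stand out immediately\n",
+    "dirty_data['age'].plot(kind='box', title='Age distribution')\n",
+    "plt.show()\n",
+    "```"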
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display basic statistics\n",
+    "print(\"Age column statistics:\")\n",
+    "print(dirty_data['age'].describe())\n",
+    "\n",
+    "# Identify impossible values using domain knowledge\n",
+    "print(\"\\nRows with impossible age values (< 0 or > 120):\")\n",
+    "impossible_ages = dirty_data[(dirty_data['age'] < 0) | (dirty_data['age'] > 120)]\n",
+    "print(impossible_ages[['customer_id', 'name', 'age']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Using IQR (Interquartile Range) Method\n",
+    "\n",
+    "The IQR method is a robust statistical technique for outlier detection that is less sensitive to extreme values:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate IQR for age (excluding impossible values)\n",
+    "valid_ages = dirty_data[(dirty_data['age'] >= 0) & (dirty_data['age'] <= 120)]['age']\n",
+    "\n",
+    "Q1 = valid_ages.quantile(0.25)\n",
+    "Q3 = valid_ages.quantile(0.75)\n",
+    "IQR = Q3 - Q1\n",
+    "\n",
+    "# Define outlier bounds\n",
+    "lower_bound = Q1 - 1.5 * IQR\n",
+    "upper_bound = Q3 + 1.5 * IQR\n",
+    "\n",
+    "print(f\"IQR-based outlier bounds for age: [{lower_bound:.2f}, {upper_bound:.2f}]\")\n",
+    "\n",
+    "# Identify outliers\n",
+    "age_outliers = dirty_data[(dirty_data['age'] < lower_bound) | (dirty_data['age'] > upper_bound)]\n",
+    "print(f\"\\nRows with age outliers:\")\n",
+    "print(age_outliers[['customer_id', 'name', 'age']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Using Z-Score Method\n",
+    "\n",
+    "The Z-score method identifies outliers based on standard deviations from the mean:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy import stats\n",
+    "\n",
+    "# Calculate Z-scores for age\n",
+    "dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
+    "\n",
+    "# Typically, Z-score > 3 indicates an outlier\n",
+    "print(\"Rows with age Z-score > 3:\")\n",
+    "zscore_outliers = dirty_data[dirty_data['age_zscore'] > 3]\n",
+    "print(zscore_outliers[['customer_id', 'name', 'age', 'age_zscore']])\n",
+    "\n",
+    "# Clean up the temporary column\n",
+    "dirty_data = dirty_data.drop('age_zscore', axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Handling Outliers\n",
+    "\n",
+    "Once detected, outliers can be handled in several ways:\n",
+    "1. **Remove**: Drop rows with outliers (if they're errors)\n",
+    "2. **Cap**: Replace with boundary values\n",
+    "3. **Replace with NaN**: Treat as missing data and use imputation techniques\n",
+    "4. **Keep**: If they're legitimate extreme values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a cleaned version by replacing impossible ages with NaN\n",
+    "dirty_data['age_clean'] = dirty_data['age'].apply(\n",
+    "    lambda x: np.nan if (x < 0 or x > 120) else x\n",
+    ")\n",
+    "\n",
+    "print(\"Age column before and after cleaning:\")\n",
+    "print(dirty_data[['customer_id', 'name', 'age', 'age_clean']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Detecting Near-Duplicate Rows\n",
+    "\n",
+    "Notice that our dataset has multiple entries for \"John Smith\" with slightly different values. Let's identify potential duplicates based on name similarity.\n",
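+    "\n",
+    "Note that an exact full-row check such as `dirty_data.duplicated()` finds nothing here, because every row differs in at least one column (`customer_id` is unique). A quick sketch of that check, using only the `dirty_data` frame defined above:\n",
+    "\n",
+    "```python\n",
+    "# Exact duplicate detection misses near-duplicates entirely\n",
+    "print(dirty_data.duplicated().sum())  # 0 -- no row is an exact copy of another\n",
+    "```"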
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First, let's look at exact name matches (ignoring extra whitespace)\n",
+    "dirty_data['name_normalized'] = dirty_data['name'].str.strip().str.lower()\n",
+    "\n",
+    "print(\"Checking for duplicate names:\")\n",
+    "duplicate_names = dirty_data[dirty_data.duplicated(['name_normalized'], keep=False)]\n",
+    "print(duplicate_names.sort_values('name_normalized')[['customer_id', 'name', 'age', 'country']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Finding Near-Duplicates with Fuzzy Matching\n",
+    "\n",
+    "For more sophisticated duplicate detection, we can use fuzzy matching to find similar names:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from rapidfuzz import process, fuzz\n",
+    "\n",
+    "# Function to find potential duplicates\n",
+    "def find_near_duplicates(df, column, threshold=90):\n",
+    "    \"\"\"\n",
+    "    Find near-duplicate entries in a column using fuzzy matching.\n",
+    "    \n",
+    "    Parameters:\n",
+    "    - df: DataFrame\n",
+    "    - column: Column name to check for duplicates\n",
+    "    - threshold: Similarity threshold (0-100)\n",
+    "    \n",
+    "    Returns: List of potential duplicate groups\n",
+    "    \"\"\"\n",
+    "    values = df[column].unique()\n",
+    "    duplicate_groups = []\n",
+    "    checked = set()\n",
+    "    \n",
+    "    for value in values:\n",
+    "        if value in checked:\n",
+    "            continue\n",
+    "        \n",
+    "        # Find similar values\n",
+    "        matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
+    "        similar = [m[0] for m in matches if m[1] >= threshold]\n",
+    "        \n",
+    "        if len(similar) > 1:\n",
+    "            duplicate_groups.append(similar)\n",
+    "            checked.update(similar)\n",
+    "    \n",
+    "    return duplicate_groups\n",
+    "\n",
+    "# Find near-duplicate names\n",
+    "duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
+    "\n",
+    "print(\"Potential duplicate groups:\")\n",
+    "for i, group in enumerate(duplicate_groups, 1):\n",
+    "    print(f\"\\nGroup {i}:\")\n",
+    "    for name in group:\n",
+    "        matching_rows = dirty_data[dirty_data['name'] == name]\n",
+    "        print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
+    "        for _, row in matching_rows.iterrows():\n",
+    "            print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Handling Duplicates\n",
+    "\n",
+    "Once identified, you need to decide how to handle duplicates:\n",
+    "1. **Keep the first occurrence**: Use `drop_duplicates(keep='first')`\n",
+    "2. **Keep the last occurrence**: Use `drop_duplicates(keep='last')`\n",
+    "3. **Aggregate information**: Combine information from duplicate rows\n",
+    "4. **Manual review**: Flag for human review"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example: Remove duplicates based on normalized name, keeping first occurrence\n",
+    "cleaned_data = dirty_data.drop_duplicates(subset=['name_normalized'], keep='first')\n",
+    "\n",
+    "print(f\"Original dataset: {len(dirty_data)} rows\")\n",
+    "print(f\"After removing name duplicates: {len(cleaned_data)} rows\")\n",
+    "print(f\"Removed: {len(dirty_data) - len(cleaned_data)} duplicate rows\")\n",
+    "\n",
+    "print(\"\\nCleaned dataset:\")\n",
+    "print(cleaned_data[['customer_id', 'name', 'age', 'country_clean']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Summary: Complete Data Cleaning Pipeline\n",
+    "\n",
+    "Let's put it all together into a comprehensive cleaning pipeline:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_dataset(df):\n",
+    "    \"\"\"\n",
+    "    Comprehensive data cleaning function.\n",
+    "    \"\"\"\n",
+    "    # Create a copy to avoid modifying the original\n",
+    "    cleaned = df.copy()\n",
+    "    \n",
+    "    # 1. Standardize categorical values (country)\n",
+    "    country_mapping = {\n",
+    "        'usa': 'USA', 'u.s.a': 'USA', 'united states': 'USA',\n",
+    "        'uk': 'UK', 'united kingdom': 'UK',\n",
+    "        'canada': 'Canada', 'mexico': 'Mexico'\n",
+    "    }\n",
+    "    cleaned['country'] = cleaned['country'].str.lower().map(country_mapping)\n",
+    "    \n",
+    "    # 2. Clean abnormal age values\n",
+    "    cleaned['age'] = cleaned['age'].apply(\n",
+    "        lambda x: np.nan if (x < 0 or x > 120) else x\n",
+    "    )\n",
+    "    \n",
+    "    # 3. Remove near-duplicate names (normalize whitespace)\n",
+    "    cleaned['name'] = cleaned['name'].str.strip()\n",
+    "    cleaned = cleaned.drop_duplicates(subset=['name'], keep='first')\n",
+    "    \n",
+    "    return cleaned\n",
+    "\n",
+    "# Apply the cleaning pipeline\n",
+    "final_cleaned_data = clean_dataset(dirty_data)\n",
+    "\n",
+    "print(\"Before cleaning:\")\n",
+    "print(f\"  Rows: {len(dirty_data)}\")\n",
+    "print(f\"  Unique countries: {dirty_data['country'].nunique()}\")\n",
+    "print(f\"  Invalid ages: {((dirty_data['age'] < 0) | (dirty_data['age'] > 120)).sum()}\")\n",
+    "\n",
+    "print(\"\\nAfter cleaning:\")\n",
+    "print(f\"  Rows: {len(final_cleaned_data)}\")\n",
+    "print(f\"  Unique countries: {final_cleaned_data['country'].nunique()}\")\n",
+    "print(f\"  Invalid ages: {((final_cleaned_data['age'] < 0) | (final_cleaned_data['age'] > 120)).sum()}\")\n",
+    "\n",
+    "print(\"\\nCleaned dataset:\")\n",
+    "print(final_cleaned_data[['customer_id', 'name', 'age', 'country', 'purchase_amount']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 🎯 Challenge Exercise\n",
+    "\n",
+    "Now it's your turn! Below is a new row of data with multiple quality issues. Can you:\n",
+    "\n",
+    "1. Identify all the issues in this row\n",
+    "2. Write code to clean each issue\n",
+    "3. Add the cleaned row to the dataset\n",
+    "\n",
+    "Here's the problematic data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# New problematic row\n",
+    "new_row = pd.DataFrame({\n",
+    "    'customer_id': [13],\n",
+    "    'name': [' Diana Prince '],  # Extra whitespace\n",
+    "    'age': [250],  # Impossible age\n",
+    "    'country': ['U.S.A.'],  # Inconsistent format\n",
+    "    'purchase_amount': [150.00]\n",
+    "})\n",
+    "\n",
+    "print(\"New row to clean:\")\n",
+    "print(new_row)\n",
+    "\n",
+    "# TODO: Your code here to clean this row\n",
+    "# Hints:\n",
+    "# 1. Strip whitespace from the name\n",
+    "# 2. Check if the name is a duplicate (Diana Prince already exists)\n",
+    "# 3. Handle the impossible age value\n",
+    "# 4. Standardize the country name\n",
+    "\n",
+    "# Example solution (uncomment and modify as needed):\n",
+    "# new_row_cleaned = new_row.copy()\n",
+    "# new_row_cleaned['name'] = new_row_cleaned['name'].str.strip()\n",
+    "# new_row_cleaned['age'] = np.nan  # Invalid age\n",
+    "# new_row_cleaned['country'] = 'USA'  # Standardized\n",
+    "# print(\"\\nCleaned row:\")\n",
+    "# print(new_row_cleaned)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Key Takeaways\n",
+    "\n",
+    "1. **Inconsistent categories** are common in real-world data. Always check unique values and standardize them using mappings or fuzzy matching.\n",
+    "\n",
+    "2. **Outliers** can significantly affect your analysis. Use domain knowledge combined with statistical methods (IQR, Z-score) to detect them.\n",
+    "\n",
+    "3. **Near-duplicates** are harder to detect than exact duplicates. Consider using fuzzy matching and normalizing data (lowercasing, stripping whitespace) to identify them.\n",
+    "\n",
+    "4. **Data cleaning is iterative**. You may need to apply multiple techniques and review the results before finalizing your cleaned dataset.\n",
+    "\n",
+    "5. **Document your decisions**. Keep track of what cleaning steps you applied and why, as this is important for reproducibility and transparency.\n",
+    "\n",
+    "> **Best Practice:** Always keep a copy of your original \"dirty\" data. Never overwrite your source data files - create cleaned versions with clear naming conventions like `data_cleaned.csv`."
+   ]
   }
  ],
  "metadata": {
@@ -3715,4 +4215,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file

From c7982edc029e09183dfd75bc3f07bca371d82c64 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:47:48 +0100
Subject: [PATCH 03/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 2a3ef0fe..22c9a757 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3815,8 +3815,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Note: You may need to install rapidfuzz: pip install rapidfuzz\n",
-    "from rapidfuzz import process, fuzz\n",
+    "try:\n",
+    "    from rapidfuzz import process, fuzz\n",
+    "except ImportError:\n",
+    "    raise ImportError(\"The 'rapidfuzz' library is required for fuzzy matching. Please install it with 'pip install rapidfuzz' and rerun this cell.\")\n",
     "\n",
     "# Get unique countries\n",
     "unique_countries = dirty_data['country'].unique()\n",

From 95ded644faba6d4bd979b0ab8646b32cb303f423 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:48:08 +0100
Subject: [PATCH 04/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb        | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 22c9a757..bcdf44c7 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3909,18 +3909,21 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from scipy import stats\n",
-    "\n",
-    "# Calculate Z-scores for age\n",
-    "dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
+    "try:\n",
+    "    from scipy import stats\n",
+    "except ImportError:\n",
+    "    print(\"scipy is required for Z-score calculation. Please install it with 'pip install scipy' and rerun this cell.\")\n",
+    "else:\n",
+    "    # Calculate Z-scores for age\n",
+    "    dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
     "\n",
-    "# Typically, Z-score > 3 indicates an outlier\n",
-    "print(\"Rows with age Z-score > 3:\")\n",
-    "zscore_outliers = dirty_data[dirty_data['age_zscore'] > 3]\n",
-    "print(zscore_outliers[['customer_id', 'name', 'age', 'age_zscore']])\n",
+    "    # Typically, Z-score > 3 indicates an outlier\n",
+    "    print(\"Rows with age Z-score > 3:\")\n",
+    "    zscore_outliers = dirty_data[dirty_data['age_zscore'] > 3]\n",
+    "    print(zscore_outliers[['customer_id', 'name', 'age', 'age_zscore']])\n",
     "\n",
-    "# Clean up the temporary column\n",
-    "dirty_data = dirty_data.drop('age_zscore', axis=1)"
+    "    # Clean up the temporary column\n",
+    "    dirty_data = dirty_data.drop('age_zscore', axis=1)"
    ]
   },
   {

From 9f59dfaede61d03cd2047413101cf324952e3bc6 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:48:22 +0100
Subject: [PATCH 05/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index bcdf44c7..5e21785f 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3795,7 +3795,7 @@
     "dirty_data['country_clean'] = dirty_data['country'].str.lower().map(country_mapping)\n",
     "\n",
     "print(\"Before standardization:\")\n",
-    "print(dirty_data[['country']].value_counts())\n",
+    "print(dirty_data['country'].value_counts())\n",
     "print(\"\\nAfter standardization:\")\n",
     "print(dirty_data[['country_clean']].value_counts())"
    ]

From 97d1ea036a062d4c7e3da99c5894ce2838932d2a Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:52:35 +0100
Subject: [PATCH 06/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 5e21785f..de5ec916 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3815,10 +3815,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "try:\n",
-    "    from rapidfuzz import process, fuzz\n",
-    "except ImportError:\n",
-    "    raise ImportError(\"The 'rapidfuzz' library is required for fuzzy matching. Please install it with 'pip install rapidfuzz' and rerun this cell.\")\n",
+    "# rapidfuzz was already imported in an earlier cell\n",
+    "# from rapidfuzz import process, fuzz\n",
+    "\n",
+    "\n",
     "\n",
     "# Get unique countries\n",
     "unique_countries = dirty_data['country'].unique()\n",

From 37aa09a160d74d550bf5e27510e8c7f3b3ab132b Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:54:50 +0100
Subject: [PATCH 07/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index de5ec916..1023b7f3 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3816,7 +3816,7 @@
    "outputs": [],
    "source": [
     "# rapidfuzz was already imported in an earlier cell\n",
-    "# from rapidfuzz import process, fuzz\n",
+    "from rapidfuzz import process, fuzz\n",
     "\n",
     "\n",
     "\n",

From 186327158e9707c5897d0ba70aee6f45af747725 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:55:12 +0100
Subject: [PATCH 08/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb        | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 1023b7f3..fe10a4f6 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3815,24 +3815,29 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# rapidfuzz was already imported in an earlier cell\n",
-    "from rapidfuzz import process, fuzz\n",
-    "\n",
-    "\n",
+    "try:\n",
+    "    from rapidfuzz import process, fuzz\n",
+    "except ImportError:\n",
+    "    print(\"rapidfuzz is not installed. Please install it with 'pip install rapidfuzz' to use fuzzy matching.\")\n",
+    "    process = None\n",
+    "    fuzz = None\n",
     "\n",
     "# Get unique countries\n",
     "unique_countries = dirty_data['country'].unique()\n",
     "\n",
     "# For each country, find similar matches\n",
-    "print(\"Finding similar country names (similarity > 70%):\")\n",
-    "for country in unique_countries:\n",
-    "    matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
-    "    # Filter matches with similarity > 70 and not identical\n",
-    "    similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
-    "    if similar:\n",
-    "        print(f\"\\n'{country}' is similar to:\")\n",
-    "        for match, score, _ in similar:\n",
-    "            print(f\"  - '{match}' (similarity: {score}%)\")"
+    "if process is not None and fuzz is not None:\n",
+    "    print(\"Finding similar country names (similarity > 70%):\")\n",
+    "    for country in unique_countries:\n",
+    "        matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
+    "        # Filter matches with similarity > 70 and not identical\n",
+    "        similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
+    "        if similar:\n",
+    "            print(f\"\\n'{country}' is similar to:\")\n",
+    "            for match, score, _ in similar:\n",
+    "                print(f\"  - '{match}' (similarity: {score}%)\")\n",
+    "else:\n",
+    "    print(\"Skipping fuzzy matching because rapidfuzz is not available.\")"
    ]
   },
   {

From 5da6e49382bdc057a843adcca43079937ead8860 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:57:37 +0100
Subject: [PATCH 09/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb        | 79 ++++++++++---------
 1 file changed, 41 insertions(+), 38 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index fe10a4f6..294f278c 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3997,49 +3997,52 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from rapidfuzz import process, fuzz\n",
+    "try:\n",
+    "    from rapidfuzz import process, fuzz\n",
     "\n",
-    "# Function to find potential duplicates\n",
-    "def find_near_duplicates(df, column, threshold=90):\n",
-    "    \"\"\"\n",
-    "    Find near-duplicate entries in a column using fuzzy matching.\n",
-    "    \n",
-    "    Parameters:\n",
-    "    - df: DataFrame\n",
-    "    - column: Column name to check for duplicates\n",
-    "    - threshold: Similarity threshold (0-100)\n",
-    "    \n",
-    "    Returns: List of potential duplicate groups\n",
-    "    \"\"\"\n",
-    "    values = df[column].unique()\n",
-    "    duplicate_groups = []\n",
-    "    checked = set()\n",
-    "    \n",
-    "    for value in values:\n",
-    "        if value in checked:\n",
-    "            continue\n",
+    "    # Function to find potential duplicates\n",
+    "    def find_near_duplicates(df, column, threshold=90):\n",
+    "        \"\"\"\n",
+    "        Find near-duplicate entries in a column using fuzzy matching.\n",
+    "        \n",
+    "        Parameters:\n",
+    "        - df: DataFrame\n",
+    "        - column: Column name to check for duplicates\n",
+    "        - threshold: Similarity threshold (0-100)\n",
+    "        \n",
+    "        Returns: List of potential duplicate groups\n",
+    "        \"\"\"\n",
+    "        values = df[column].unique()\n",
+    "        duplicate_groups = []\n",
+    "        checked = set()\n",
+    "        \n",
+    "        for value in values:\n",
+    "            if value in checked:\n",
+    "                continue\n",
+    "            \n",
+    "            # Find similar values\n",
+    "            matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
+    "            similar = [m[0] for m in matches if m[1] >= threshold]\n",
     "        \n",
-    "        # Find similar values\n",
-    "        matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
-    "        similar = [m[0] for m in matches if m[1] >= threshold]\n",
+    "            if len(similar) > 1:\n",
+    "                duplicate_groups.append(similar)\n",
+    "                checked.update(similar)\n",
     "        \n",
-    "        if len(similar) > 1:\n",
-    "            duplicate_groups.append(similar)\n",
-    "            checked.update(similar)\n",
-    "    \n",
-    "    return duplicate_groups\n",
+    "        return duplicate_groups\n",
     "\n",
-    "# Find near-duplicate names\n",
-    "duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
+    "    # Find near-duplicate names\n",
+    "    duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
     "\n",
-    "print(\"Potential duplicate groups:\")\n",
-    "for i, group in enumerate(duplicate_groups, 1):\n",
-    "    print(f\"\\nGroup {i}:\")\n",
-    "    for name in group:\n",
-    "        matching_rows = dirty_data[dirty_data['name'] == name]\n",
-    "        print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
-    "        for _, row in matching_rows.iterrows():\n",
-    "            print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
+    "    print(\"Potential duplicate groups:\")\n",
+    "    for i, group in enumerate(duplicate_groups, 1):\n",
+    "        print(f\"\\nGroup {i}:\")\n",
+    "        for name in group:\n",
+    "            matching_rows = dirty_data[dirty_data['name'] == name]\n",
+    "            print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
+    "            for _, row in matching_rows.iterrows():\n",
+    "                print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")\n",
+    "except ImportError:\n",
+    "    print(\"rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.\")"
    ]
   },
   {

From 8e723abc2495f04c70611c8098ab56fece11bb6f Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:58:12 +0100
Subject: [PATCH 10/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 294f278c..ceff57fe 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3919,8 +3919,11 @@
     "except ImportError:\n",
     "    print(\"scipy is required for Z-score calculation. Please install it with 'pip install scipy' and rerun this cell.\")\n",
     "else:\n",
-    "    # Calculate Z-scores for age\n",
-    "    dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
+    "    # Calculate Z-scores for age, handling NaN values\n",
+    "    age_nonan = dirty_data['age'].dropna()\n",
+    "    zscores = np.abs(stats.zscore(age_nonan))\n",
+    "    dirty_data['age_zscore'] = np.nan\n",
+    "    dirty_data.loc[age_nonan.index, 'age_zscore'] = zscores\n",
     "\n",
     "    # Typically, Z-score > 3 indicates an outlier\n",