From de35c7e78b5681be889ea66ee1a82720dfa03a92 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 3 Oct 2025 13:30:35 +0000
Subject: [PATCH 02/10] Add real-world data quality checks to data cleaning
 lesson

Co-authored-by: leestott <2511341+leestott@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb        | 502 +++++++++++++++++-
 1 file changed, 501 insertions(+), 1 deletion(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 31898613..2a3ef0fe 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3687,6 +3687,506 @@
    "source": [
     "> **Takeaway:** Removing duplicate data is an essential part of almost every data-science project. Duplicate data can change the results of your analyses and give you inaccurate results!"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Real-World Data Quality Checks\n",
+    "\n",
+    "> **Learning goal:** By the end of this section, you should be comfortable detecting and correcting common real-world data quality issues including inconsistent categorical values, abnormal numeric values (outliers), and duplicate entities with variations.\n",
+    "\n",
+    "While missing values and exact duplicates are common issues, real-world datasets often contain more subtle problems:\n",
+    "\n",
+    "1. **Inconsistent categorical values**: The same category spelled differently (e.g., \"USA\", \"U.S.A\", \"United States\")\n",
+    "2. **Abnormal numeric values**: Extreme outliers that indicate data entry errors (e.g., age = 999)\n",
+    "3. **Near-duplicate rows**: Records that represent the same entity with slight variations\n",
+    "\n",
+    "Let's explore techniques to detect and handle these issues."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Creating a Sample \"Dirty\" Dataset\n",
+    "\n",
+    "First, let's create a sample dataset that contains the types of issues we commonly encounter in real-world data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Create a sample dataset with quality issues\n",
+    "dirty_data = pd.DataFrame({\n",
+    "    'customer_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],\n",
+    "    'name': ['John Smith', 'Jane Doe', 'John Smith', 'Bob Johnson', \n",
+    "             'Alice Williams', 'Charlie Brown', 'John Smith', 'Eva Martinez',\n",
+    "             'Bob Johnson', 'Diana Prince', 'Frank Castle', 'Alice Williams'],\n",
+    "    'age': [25, 32, 25, 45, 28, 199, 25, 31, 45, 27, -5, 28],\n",
+    "    'country': ['USA', 'UK', 'U.S.A', 'Canada', 'USA', 'United Kingdom',\n",
+    "                'United States', 'Mexico', 'canada', 'USA', 'UK', 'usa'],\n",
+    "    'purchase_amount': [100.50, 250.00, 105.00, 320.00, 180.00, 90.00,\n",
+    "                        102.00, 275.00, 325.00, 195.00, 410.00, 185.00]\n",
+    "})\n",
+    "\n",
+    "print(\"Sample 'Dirty' Dataset:\")\n",
+    "print(dirty_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Detecting Inconsistent Categorical Values\n",
+    "\n",
+    "Notice the `country` column has multiple representations for the same countries. Let's identify these inconsistencies:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check unique values in the country column\n",
+    "print(\"Unique country values:\")\n",
+    "print(dirty_data['country'].unique())\n",
+    "print(f\"\\nTotal unique values: {dirty_data['country'].nunique()}\")\n",
+    "\n",
+    "# Count occurrences of each variation\n",
+    "print(\"\\nValue counts:\")\n",
+    "print(dirty_data['country'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Standardizing Categorical Values\n",
+    "\n",
+    "We can create a mapping to standardize these values. A simple approach is to convert to lowercase and create a mapping dictionary:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a standardization mapping\n",
+    "country_mapping = {\n",
+    "    'usa': 'USA',\n",
+    "    'u.s.a': 'USA',\n",
+    "    'united states': 'USA',\n",
+    "    'uk': 'UK',\n",
+    "    'united kingdom': 'UK',\n",
+    "    'canada': 'Canada',\n",
+    "    'mexico': 'Mexico'\n",
+    "}\n",
+    "\n",
+    "# Standardize the country column\n",
+    "dirty_data['country_clean'] = dirty_data['country'].str.lower().map(country_mapping)\n",
+    "\n",
+    "print(\"Before standardization:\")\n",
+    "print(dirty_data[['country']].value_counts())\n",
+    "print(\"\\nAfter standardization:\")\n",
+    "print(dirty_data[['country_clean']].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Alternative: Using Fuzzy Matching**\n",
+    "\n",
+    "For more complex cases, we can use fuzzy string matching with the `rapidfuzz` library to automatically detect similar strings:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Note: You may need to install rapidfuzz: pip install rapidfuzz\n",
+    "from rapidfuzz import process, fuzz\n",
+    "\n",
+    "# Get unique countries\n",
+    "unique_countries = dirty_data['country'].unique()\n",
+    "\n",
+    "# For each country, find similar matches\n",
+    "print(\"Finding similar country names (similarity > 70%):\")\n",
+    "for country in unique_countries:\n",
+    "    matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
+    "    # Filter matches with similarity > 70 and not identical\n",
+    "    similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
+    "    if similar:\n",
+    "        print(f\"\\n'{country}' is similar to:\")\n",
+    "        for match, score, _ in similar:\n",
+    "            print(f\"  - '{match}' (similarity: {score}%)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Detecting Abnormal Numeric Values (Outliers)\n",
+    "\n",
+    "Looking at the `age` column, we have some suspicious values like 199 and -5. Let's use statistical methods to detect these outliers.\n",
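+    "\n",
+    "Before applying any formulas, a quick visual check is often the fastest way to spot trouble. The following is a minimal sketch, assuming `matplotlib` is installed in your environment:\n",
+    "\n",
+    "```python\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# A boxplot makes extreme ages such as 199 and -5 stand out immediately\n",
+    "dirty_data['age'].plot(kind='box', title='Age distribution')\n",
+    "plt.show()\n",
+    "```"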
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display basic statistics\n",
+    "print(\"Age column statistics:\")\n",
+    "print(dirty_data['age'].describe())\n",
+    "\n",
+    "# Identify impossible values using domain knowledge\n",
+    "print(\"\\nRows with impossible age values (< 0 or > 120):\")\n",
+    "impossible_ages = dirty_data[(dirty_data['age'] < 0) | (dirty_data['age'] > 120)]\n",
+    "print(impossible_ages[['customer_id', 'name', 'age']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Using IQR (Interquartile Range) Method\n",
+    "\n",
+    "The IQR method is a robust statistical technique for outlier detection that is less sensitive to extreme values:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate IQR for age (excluding impossible values)\n",
+    "valid_ages = dirty_data[(dirty_data['age'] >= 0) & (dirty_data['age'] <= 120)]['age']\n",
+    "\n",
+    "Q1 = valid_ages.quantile(0.25)\n",
+    "Q3 = valid_ages.quantile(0.75)\n",
+    "IQR = Q3 - Q1\n",
+    "\n",
+    "# Define outlier bounds\n",
+    "lower_bound = Q1 - 1.5 * IQR\n",
+    "upper_bound = Q3 + 1.5 * IQR\n",
+    "\n",
+    "print(f\"IQR-based outlier bounds for age: [{lower_bound:.2f}, {upper_bound:.2f}]\")\n",
+    "\n",
+    "# Identify outliers\n",
+    "age_outliers = dirty_data[(dirty_data['age'] < lower_bound) | (dirty_data['age'] > upper_bound)]\n",
+    "print(f\"\\nRows with age outliers:\")\n",
+    "print(age_outliers[['customer_id', 'name', 'age']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Using Z-Score Method\n",
+    "\n",
+    "The Z-score method identifies outliers based on standard deviations from the mean:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy import stats\n",
+    "\n",
+    "# Calculate Z-scores for age\n",
+    "dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
+    "\n",
+    "# Typically, Z-score > 3 indicates an outlier\n",
+    "print(\"Rows with age Z-score > 3:\")\n",
+    "zscore_outliers = dirty_data[dirty_data['age_zscore'] > 3]\n",
+    "print(zscore_outliers[['customer_id', 'name', 'age', 'age_zscore']])\n",
+    "\n",
+    "# Clean up the temporary column\n",
+    "dirty_data = dirty_data.drop('age_zscore', axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Handling Outliers\n",
+    "\n",
+    "Once detected, outliers can be handled in several ways:\n",
+    "1. **Remove**: Drop rows with outliers (if they're errors)\n",
+    "2. **Cap**: Replace with boundary values\n",
+    "3. **Replace with NaN**: Treat as missing data and use imputation techniques\n",
+    "4. **Keep**: If they're legitimate extreme values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a cleaned version by replacing impossible ages with NaN\n",
+    "dirty_data['age_clean'] = dirty_data['age'].apply(\n",
+    "    lambda x: np.nan if (x < 0 or x > 120) else x\n",
+    ")\n",
+    "\n",
+    "print(\"Age column before and after cleaning:\")\n",
+    "print(dirty_data[['customer_id', 'name', 'age', 'age_clean']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Detecting Near-Duplicate Rows\n",
+    "\n",
+    "Notice that our dataset has multiple entries for \"John Smith\" with slightly different values. Let's identify potential duplicates based on name similarity.\n",
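+    "\n",
+    "Note that an exact full-row check such as `dirty_data.duplicated()` finds nothing here, because every row differs in at least one column (`customer_id` is unique). A quick sketch of that check, using only the `dirty_data` frame defined above:\n",
+    "\n",
+    "```python\n",
+    "# Exact duplicate detection misses near-duplicates entirely\n",
+    "print(dirty_data.duplicated().sum())  # 0 -- no row is an exact copy of another\n",
+    "```"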
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First, let's look at exact name matches (ignoring extra whitespace)\n",
+    "dirty_data['name_normalized'] = dirty_data['name'].str.strip().str.lower()\n",
+    "\n",
+    "print(\"Checking for duplicate names:\")\n",
+    "duplicate_names = dirty_data[dirty_data.duplicated(['name_normalized'], keep=False)]\n",
+    "print(duplicate_names.sort_values('name_normalized')[['customer_id', 'name', 'age', 'country']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Finding Near-Duplicates with Fuzzy Matching\n",
+    "\n",
+    "For more sophisticated duplicate detection, we can use fuzzy matching to find similar names:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from rapidfuzz import process, fuzz\n",
+    "\n",
+    "# Function to find potential duplicates\n",
+    "def find_near_duplicates(df, column, threshold=90):\n",
+    "    \"\"\"\n",
+    "    Find near-duplicate entries in a column using fuzzy matching.\n",
+    "    \n",
+    "    Parameters:\n",
+    "    - df: DataFrame\n",
+    "    - column: Column name to check for duplicates\n",
+    "    - threshold: Similarity threshold (0-100)\n",
+    "    \n",
+    "    Returns: List of potential duplicate groups\n",
+    "    \"\"\"\n",
+    "    values = df[column].unique()\n",
+    "    duplicate_groups = []\n",
+    "    checked = set()\n",
+    "    \n",
+    "    for value in values:\n",
+    "        if value in checked:\n",
+    "            continue\n",
+    "        \n",
+    "        # Find similar values\n",
+    "        matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
+    "        similar = [m[0] for m in matches if m[1] >= threshold]\n",
+    "        \n",
+    "        if len(similar) > 1:\n",
+    "            duplicate_groups.append(similar)\n",
+    "            checked.update(similar)\n",
+    "    \n",
+    "    return duplicate_groups\n",
+    "\n",
+    "# Find near-duplicate names\n",
+    "duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
+    "\n",
+    "print(\"Potential duplicate groups:\")\n",
+    "for i, group in enumerate(duplicate_groups, 1):\n",
+    "    print(f\"\\nGroup {i}:\")\n",
+    "    for name in group:\n",
+    "        matching_rows = dirty_data[dirty_data['name'] == name]\n",
+    "        print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
+    "        for _, row in matching_rows.iterrows():\n",
+    "            print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Handling Duplicates\n",
+    "\n",
+    "Once identified, you need to decide how to handle duplicates:\n",
+    "1. **Keep the first occurrence**: Use `drop_duplicates(keep='first')`\n",
+    "2. **Keep the last occurrence**: Use `drop_duplicates(keep='last')`\n",
+    "3. **Aggregate information**: Combine information from duplicate rows\n",
+    "4. **Manual review**: Flag for human review"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example: Remove duplicates based on normalized name, keeping first occurrence\n",
+    "cleaned_data = dirty_data.drop_duplicates(subset=['name_normalized'], keep='first')\n",
+    "\n",
+    "print(f\"Original dataset: {len(dirty_data)} rows\")\n",
+    "print(f\"After removing name duplicates: {len(cleaned_data)} rows\")\n",
+    "print(f\"Removed: {len(dirty_data) - len(cleaned_data)} duplicate rows\")\n",
+    "\n",
+    "print(\"\\nCleaned dataset:\")\n",
+    "print(cleaned_data[['customer_id', 'name', 'age', 'country_clean']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Summary: Complete Data Cleaning Pipeline\n",
+    "\n",
+    "Let's put it all together into a comprehensive cleaning pipeline:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_dataset(df):\n",
+    "    \"\"\"\n",
+    "    Comprehensive data cleaning function.\n",
+    "    \"\"\"\n",
+    "    # Create a copy to avoid modifying the original\n",
+    "    cleaned = df.copy()\n",
+    "    \n",
+    "    # 1. Standardize categorical values (country)\n",
+    "    country_mapping = {\n",
+    "        'usa': 'USA', 'u.s.a': 'USA', 'united states': 'USA',\n",
+    "        'uk': 'UK', 'united kingdom': 'UK',\n",
+    "        'canada': 'Canada', 'mexico': 'Mexico'\n",
+    "    }\n",
+    "    cleaned['country'] = cleaned['country'].str.lower().map(country_mapping)\n",
+    "    \n",
+    "    # 2. Clean abnormal age values\n",
+    "    cleaned['age'] = cleaned['age'].apply(\n",
+    "        lambda x: np.nan if (x < 0 or x > 120) else x\n",
+    "    )\n",
+    "    \n",
+    "    # 3. Remove near-duplicate names (normalize whitespace)\n",
+    "    cleaned['name'] = cleaned['name'].str.strip()\n",
+    "    cleaned = cleaned.drop_duplicates(subset=['name'], keep='first')\n",
+    "    \n",
+    "    return cleaned\n",
+    "\n",
+    "# Apply the cleaning pipeline\n",
+    "final_cleaned_data = clean_dataset(dirty_data)\n",
+    "\n",
+    "print(\"Before cleaning:\")\n",
+    "print(f\"  Rows: {len(dirty_data)}\")\n",
+    "print(f\"  Unique countries: {dirty_data['country'].nunique()}\")\n",
+    "print(f\"  Invalid ages: {((dirty_data['age'] < 0) | (dirty_data['age'] > 120)).sum()}\")\n",
+    "\n",
+    "print(\"\\nAfter cleaning:\")\n",
+    "print(f\"  Rows: {len(final_cleaned_data)}\")\n",
+    "print(f\"  Unique countries: {final_cleaned_data['country'].nunique()}\")\n",
+    "print(f\"  Invalid ages: {((final_cleaned_data['age'] < 0) | (final_cleaned_data['age'] > 120)).sum()}\")\n",
+    "\n",
+    "print(\"\\nCleaned dataset:\")\n",
+    "print(final_cleaned_data[['customer_id', 'name', 'age', 'country', 'purchase_amount']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 🎯 Challenge Exercise\n",
+    "\n",
+    "Now it's your turn! Below is a new row of data with multiple quality issues. Can you:\n",
+    "\n",
+    "1. Identify all the issues in this row\n",
+    "2. Write code to clean each issue\n",
+    "3. Add the cleaned row to the dataset\n",
+    "\n",
+    "Here's the problematic data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# New problematic row\n",
+    "new_row = pd.DataFrame({\n",
+    "    'customer_id': [13],\n",
+    "    'name': [' Diana Prince '],  # Extra whitespace\n",
+    "    'age': [250],  # Impossible age\n",
+    "    'country': ['U.S.A.'],  # Inconsistent format\n",
+    "    'purchase_amount': [150.00]\n",
+    "})\n",
+    "\n",
+    "print(\"New row to clean:\")\n",
+    "print(new_row)\n",
+    "\n",
+    "# TODO: Your code here to clean this row\n",
+    "# Hints:\n",
+    "# 1. Strip whitespace from the name\n",
+    "# 2. Check if the name is a duplicate (Diana Prince already exists)\n",
+    "# 3. Handle the impossible age value\n",
+    "# 4. Standardize the country name\n",
+    "\n",
+    "# Example solution (uncomment and modify as needed):\n",
+    "# new_row_cleaned = new_row.copy()\n",
+    "# new_row_cleaned['name'] = new_row_cleaned['name'].str.strip()\n",
+    "# new_row_cleaned['age'] = np.nan  # Invalid age\n",
+    "# new_row_cleaned['country'] = 'USA'  # Standardized\n",
+    "# print(\"\\nCleaned row:\")\n",
+    "# print(new_row_cleaned)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Key Takeaways\n",
+    "\n",
+    "1. **Inconsistent categories** are common in real-world data. Always check unique values and standardize them using mappings or fuzzy matching.\n",
+    "\n",
+    "2. **Outliers** can significantly affect your analysis. Use domain knowledge combined with statistical methods (IQR, Z-score) to detect them.\n",
+    "\n",
+    "3. **Near-duplicates** are harder to detect than exact duplicates. Consider using fuzzy matching and normalizing data (lowercasing, stripping whitespace) to identify them.\n",
+    "\n",
+    "4. **Data cleaning is iterative**. You may need to apply multiple techniques and review the results before finalizing your cleaned dataset.\n",
+    "\n",
+    "5. **Document your decisions**. Keep track of what cleaning steps you applied and why, as this is important for reproducibility and transparency.\n",
+    "\n",
+    "> **Best Practice:** Always keep a copy of your original \"dirty\" data. Never overwrite your source data files - create cleaned versions with clear naming conventions like `data_cleaned.csv`."
+   ]
   }
  ],
  "metadata": {
@@ -3715,4 +4215,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
\ No newline at end of file

From c7982edc029e09183dfd75bc3f07bca371d82c64 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:47:48 +0100
Subject: [PATCH 03/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 2a3ef0fe..22c9a757 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3815,8 +3815,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Note: You may need to install rapidfuzz: pip install rapidfuzz\n",
-    "from rapidfuzz import process, fuzz\n",
+    "try:\n",
+    "    from rapidfuzz import process, fuzz\n",
+    "except ImportError:\n",
+    "    raise ImportError(\"The 'rapidfuzz' library is required for fuzzy matching. Please install it with 'pip install rapidfuzz' and rerun this cell.\")\n",
     "\n",
     "# Get unique countries\n",
     "unique_countries = dirty_data['country'].unique()\n",

From 95ded644faba6d4bd979b0ab8646b32cb303f423 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:48:08 +0100
Subject: [PATCH 04/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb        | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 22c9a757..bcdf44c7 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3909,18 +3909,21 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from scipy import stats\n",
-    "\n",
-    "# Calculate Z-scores for age\n",
-    "dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
+    "try:\n",
+    "    from scipy import stats\n",
+    "except ImportError:\n",
+    "    print(\"scipy is required for Z-score calculation. Please install it with 'pip install scipy' and rerun this cell.\")\n",
+    "else:\n",
+    "    # Calculate Z-scores for age\n",
+    "    dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
     "\n",
-    "# Typically, Z-score > 3 indicates an outlier\n",
-    "print(\"Rows with age Z-score > 3:\")\n",
-    "zscore_outliers = dirty_data[dirty_data['age_zscore'] > 3]\n",
-    "print(zscore_outliers[['customer_id', 'name', 'age', 'age_zscore']])\n",
+    "    # Typically, Z-score > 3 indicates an outlier\n",
+    "    print(\"Rows with age Z-score > 3:\")\n",
+    "    zscore_outliers = dirty_data[dirty_data['age_zscore'] > 3]\n",
+    "    print(zscore_outliers[['customer_id', 'name', 'age', 'age_zscore']])\n",
     "\n",
-    "# Clean up the temporary column\n",
-    "dirty_data = dirty_data.drop('age_zscore', axis=1)"
+    "    # Clean up the temporary column\n",
+    "    dirty_data = dirty_data.drop('age_zscore', axis=1)"
    ]
   },
   {

From 9f59dfaede61d03cd2047413101cf324952e3bc6 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:48:22 +0100
Subject: [PATCH 05/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index bcdf44c7..5e21785f 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3795,7 +3795,7 @@
     "dirty_data['country_clean'] = dirty_data['country'].str.lower().map(country_mapping)\n",
     "\n",
     "print(\"Before standardization:\")\n",
-    "print(dirty_data[['country']].value_counts())\n",
+    "print(dirty_data['country'].value_counts())\n",
     "print(\"\\nAfter standardization:\")\n",
     "print(dirty_data[['country_clean']].value_counts())"
    ]

From 97d1ea036a062d4c7e3da99c5894ce2838932d2a Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:52:35 +0100
Subject: [PATCH 06/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 5e21785f..de5ec916 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3815,10 +3815,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "try:\n",
-    "    from rapidfuzz import process, fuzz\n",
-    "except ImportError:\n",
-    "    raise ImportError(\"The 'rapidfuzz' library is required for fuzzy matching. Please install it with 'pip install rapidfuzz' and rerun this cell.\")\n",
+    "# rapidfuzz was already imported in an earlier cell\n",
+    "# from rapidfuzz import process, fuzz\n",
+    "\n",
+    "\n",
     "\n",
     "# Get unique countries\n",
     "unique_countries = dirty_data['country'].unique()\n",

From 37aa09a160d74d550bf5e27510e8c7f3b3ab132b Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:54:50 +0100
Subject: [PATCH 07/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index de5ec916..1023b7f3 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3816,7 +3816,7 @@
    "outputs": [],
    "source": [
     "# rapidfuzz was already imported in an earlier cell\n",
-    "# from rapidfuzz import process, fuzz\n",
+    "from rapidfuzz import process, fuzz\n",
     "\n",
     "\n",
     "\n",

From 186327158e9707c5897d0ba70aee6f45af747725 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:55:12 +0100
Subject: [PATCH 08/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb        | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 1023b7f3..fe10a4f6 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3815,24 +3815,29 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# rapidfuzz was already imported in an earlier cell\n",
-    "from rapidfuzz import process, fuzz\n",
-    "\n",
-    "\n",
+    "try:\n",
+    "    from rapidfuzz import process, fuzz\n",
+    "except ImportError:\n",
+    "    print(\"rapidfuzz is not installed. Please install it with 'pip install rapidfuzz' to use fuzzy matching.\")\n",
+    "    process = None\n",
+    "    fuzz = None\n",
     "\n",
     "# Get unique countries\n",
     "unique_countries = dirty_data['country'].unique()\n",
     "\n",
     "# For each country, find similar matches\n",
-    "print(\"Finding similar country names (similarity > 70%):\")\n",
-    "for country in unique_countries:\n",
-    "    matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
-    "    # Filter matches with similarity > 70 and not identical\n",
-    "    similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
-    "    if similar:\n",
-    "        print(f\"\\n'{country}' is similar to:\")\n",
-    "        for match, score, _ in similar:\n",
-    "            print(f\"  - '{match}' (similarity: {score}%)\")"
+    "if process is not None and fuzz is not None:\n",
+    "    print(\"Finding similar country names (similarity > 70%):\")\n",
+    "    for country in unique_countries:\n",
+    "        matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
+    "        # Filter matches with similarity > 70 and not identical\n",
+    "        similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
+    "        if similar:\n",
+    "            print(f\"\\n'{country}' is similar to:\")\n",
+    "            for match, score, _ in similar:\n",
+    "                print(f\"  - '{match}' (similarity: {score}%)\")\n",
+    "else:\n",
+    "    print(\"Skipping fuzzy matching because rapidfuzz is not available.\")"
    ]
   },
   {

From 5da6e49382bdc057a843adcca43079937ead8860 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:57:37 +0100
Subject: [PATCH 09/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb        | 79 ++++++++++---------
 1 file changed, 41 insertions(+), 38 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index fe10a4f6..294f278c 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3997,49 +3997,52 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from rapidfuzz import process, fuzz\n",
+    "try:\n",
+    "    from rapidfuzz import process, fuzz\n",
     "\n",
-    "# Function to find potential duplicates\n",
-    "def find_near_duplicates(df, column, threshold=90):\n",
-    "    \"\"\"\n",
-    "    Find near-duplicate entries in a column using fuzzy matching.\n",
-    "    \n",
-    "    Parameters:\n",
-    "    - df: DataFrame\n",
-    "    - column: Column name to check for duplicates\n",
-    "    - threshold: Similarity threshold (0-100)\n",
-    "    \n",
-    "    Returns: List of potential duplicate groups\n",
-    "    \"\"\"\n",
-    "    values = df[column].unique()\n",
-    "    duplicate_groups = []\n",
-    "    checked = set()\n",
-    "    \n",
-    "    for value in values:\n",
-    "        if value in checked:\n",
-    "            continue\n",
+    "    # Function to find potential duplicates\n",
+    "    def find_near_duplicates(df, column, threshold=90):\n",
+    "        \"\"\"\n",
+    "        Find near-duplicate entries in a column using fuzzy matching.\n",
+    "        \n",
+    "        Parameters:\n",
+    "        - df: DataFrame\n",
+    "        - column: Column name to check for duplicates\n",
+    "        - threshold: Similarity threshold (0-100)\n",
+    "        \n",
+    "        Returns: List of potential duplicate groups\n",
+    "        \"\"\"\n",
+    "        values = df[column].unique()\n",
+    "        duplicate_groups = []\n",
+    "        checked = set()\n",
+    "        \n",
+    "        for value in values:\n",
+    "            if value in checked:\n",
+    "                continue\n",
+    "            \n",
+    "            # Find similar values\n",
+    "            matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
+    "            similar = [m[0] for m in matches if m[1] >= threshold]\n",
     "        \n",
-    "        # Find similar values\n",
-    "        matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
-    "        similar = [m[0] for m in matches if m[1] >= threshold]\n",
+    "            if len(similar) > 1:\n",
+    "                duplicate_groups.append(similar)\n",
+    "                checked.update(similar)\n",
     "        \n",
-    "        if len(similar) > 1:\n",
-    "            duplicate_groups.append(similar)\n",
-    "            checked.update(similar)\n",
-    "    \n",
-    "    return duplicate_groups\n",
+    "        return duplicate_groups\n",
     "\n",
-    "# Find near-duplicate names\n",
-    "duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
+    "    # Find near-duplicate names\n",
+    "    duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
     "\n",
-    "print(\"Potential duplicate groups:\")\n",
-    "for i, group in enumerate(duplicate_groups, 1):\n",
-    "    print(f\"\\nGroup {i}:\")\n",
-    "    for name in group:\n",
-    "        matching_rows = dirty_data[dirty_data['name'] == name]\n",
-    "        print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
-    "        for _, row in matching_rows.iterrows():\n",
-    "            print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
+    "    print(\"Potential duplicate groups:\")\n",
+    "    for i, group in enumerate(duplicate_groups, 1):\n",
+    "        print(f\"\\nGroup {i}:\")\n",
+    "        for name in group:\n",
+    "            matching_rows = dirty_data[dirty_data['name'] == name]\n",
+    "            print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
+    "            for _, row in matching_rows.iterrows():\n",
+    "                print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")\n",
+    "except ImportError:\n",
+    "    print(\"rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.\")"
    ]
   },
   {

From 8e723abc2495f04c70611c8098ab56fece11bb6f Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:58:12 +0100
Subject: [PATCH 10/10] Update
 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 2-Working-With-Data/08-data-preparation/notebook.ipynb | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index 294f278c..ceff57fe 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3919,8 +3919,11 @@
     "except ImportError:\n",
     "    print(\"scipy is required for Z-score calculation. Please install it with 'pip install scipy' and rerun this cell.\")\n",
     "else:\n",
-    "    # Calculate Z-scores for age\n",
-    "    dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
+    "    # Calculate Z-scores for age, handling NaN values\n",
+    "    age_nonan = dirty_data['age'].dropna()\n",
+    "    zscores = np.abs(stats.zscore(age_nonan))\n",
+    "    dirty_data['age_zscore'] = np.nan\n",
+    "    dirty_data.loc[age_nonan.index, 'age_zscore'] = zscores\n",
     "\n",
     "    # Typically, Z-score > 3 indicates an outlier\n",