From 5da6e49382bdc057a843adcca43079937ead8860 Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Fri, 3 Oct 2025 14:57:37 +0100
Subject: [PATCH] Update 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../08-data-preparation/notebook.ipynb | 79 ++++++++++---------
 1 file changed, 41 insertions(+), 38 deletions(-)

diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
index fe10a4f6..294f278c 100644
--- a/2-Working-With-Data/08-data-preparation/notebook.ipynb
+++ b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -3997,49 +3997,52 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from rapidfuzz import process, fuzz\n",
+    "try:\n",
+    "    from rapidfuzz import process, fuzz\n",
     "\n",
-    "# Function to find potential duplicates\n",
-    "def find_near_duplicates(df, column, threshold=90):\n",
-    "    \"\"\"\n",
-    "    Find near-duplicate entries in a column using fuzzy matching.\n",
-    "    \n",
-    "    Parameters:\n",
-    "    - df: DataFrame\n",
-    "    - column: Column name to check for duplicates\n",
-    "    - threshold: Similarity threshold (0-100)\n",
-    "    \n",
-    "    Returns: List of potential duplicate groups\n",
-    "    \"\"\"\n",
-    "    values = df[column].unique()\n",
-    "    duplicate_groups = []\n",
-    "    checked = set()\n",
-    "    \n",
-    "    for value in values:\n",
-    "        if value in checked:\n",
-    "            continue\n",
+    "    # Function to find potential duplicates\n",
+    "    def find_near_duplicates(df, column, threshold=90):\n",
+    "        \"\"\"\n",
+    "        Find near-duplicate entries in a column using fuzzy matching.\n",
+    "        \n",
+    "        Parameters:\n",
+    "        - df: DataFrame\n",
+    "        - column: Column name to check for duplicates\n",
+    "        - threshold: Similarity threshold (0-100)\n",
+    "        \n",
+    "        Returns: List of potential duplicate groups\n",
+    "        \"\"\"\n",
+    "        values = df[column].unique()\n",
+    "        duplicate_groups = []\n",
+    "        checked = set()\n",
+    "        \n",
+    "        for value in values:\n",
+    "            if value in checked:\n",
+    "                continue\n",
+    "            \n",
+    "            # Find similar values\n",
+    "            matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
+    "            similar = [m[0] for m in matches if m[1] >= threshold]\n",
     "        \n",
-    "        # Find similar values\n",
-    "        matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
-    "        similar = [m[0] for m in matches if m[1] >= threshold]\n",
+    "            if len(similar) > 1:\n",
+    "                duplicate_groups.append(similar)\n",
+    "                checked.update(similar)\n",
     "        \n",
-    "        if len(similar) > 1:\n",
-    "            duplicate_groups.append(similar)\n",
-    "            checked.update(similar)\n",
-    "    \n",
-    "    return duplicate_groups\n",
+    "        return duplicate_groups\n",
     "\n",
-    "# Find near-duplicate names\n",
-    "duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
+    "    # Find near-duplicate names\n",
+    "    duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
     "\n",
-    "print(\"Potential duplicate groups:\")\n",
-    "for i, group in enumerate(duplicate_groups, 1):\n",
-    "    print(f\"\\nGroup {i}:\")\n",
-    "    for name in group:\n",
-    "        matching_rows = dirty_data[dirty_data['name'] == name]\n",
-    "        print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
-    "        for _, row in matching_rows.iterrows():\n",
-    "            print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
+    "    print(\"Potential duplicate groups:\")\n",
+    "    for i, group in enumerate(duplicate_groups, 1):\n",
+    "        print(f\"\\nGroup {i}:\")\n",
+    "        for name in group:\n",
+    "            matching_rows = dirty_data[dirty_data['name'] == name]\n",
+    "            print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
+    "            for _, row in matching_rows.iterrows():\n",
+    "                print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")\n",
+    "except ImportError:\n",
+    "    print(\"rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.\")"
    ]
  },
  {