Update 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
copilot/fix-e446e3a1-6b4c-4310-87d5-641ed6823a37
Lee Stott 2 months ago committed by GitHub
parent 186327158e
commit 5da6e49382
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -3997,49 +3997,52 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
try:
    from rapidfuzz import process, fuzz

    # Function to find potential duplicates
    def find_near_duplicates(df, column, threshold=90):
        """
        Find near-duplicate entries in a column using fuzzy matching.

        Parameters:
        - df: DataFrame
        - column: Column name to check for duplicates
        - threshold: Similarity threshold (0-100); values scoring at or
          above this against each other are grouped together

        Returns: List of potential duplicate groups (each a list of similar values)
        """
        # Drop missing values before matching: fuzz.ratio expects strings
        # and raises on float NaN entries, which a dirty column may contain.
        values = df[column].dropna().unique()
        duplicate_groups = []
        checked = set()

        for value in values:
            # Skip values already absorbed into an earlier group.
            if value in checked:
                continue

            # Compare this value against every unique value in the column.
            matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))
            similar = [m[0] for m in matches if m[1] >= threshold]

            # A value always matches itself with score 100, so a group is
            # only interesting when at least one OTHER value is similar.
            if len(similar) > 1:
                duplicate_groups.append(similar)
                checked.update(similar)

        return duplicate_groups

    # Find near-duplicate names
    duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)

    print("Potential duplicate groups:")
    for i, group in enumerate(duplicate_groups, 1):
        print(f"\nGroup {i}:")
        for name in group:
            matching_rows = dirty_data[dirty_data['name'] == name]
            print(f"  '{name}': {len(matching_rows)} occurrence(s)")
            for _, row in matching_rows.iterrows():
                print(f"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}")
except ImportError:
    print("rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.")
] ]
}, },
{ {

Loading…
Cancel
Save