Update 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
copilot/fix-e446e3a1-6b4c-4310-87d5-641ed6823a37
Lee Stott 2 months ago committed by GitHub
parent 186327158e
commit 5da6e49382
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -3997,10 +3997,11 @@
"metadata": {},
"outputs": [],
"source": [
"from rapidfuzz import process, fuzz\n",
"try:\n",
" from rapidfuzz import process, fuzz\n",
"\n",
"# Function to find potential duplicates\n",
"def find_near_duplicates(df, column, threshold=90):\n",
" # Function to find potential duplicates\n",
" def find_near_duplicates(df, column, threshold=90):\n",
" \"\"\"\n",
" Find near-duplicate entries in a column using fuzzy matching.\n",
" \n",
@@ -4029,17 +4030,19 @@
" \n",
" return duplicate_groups\n",
"\n",
"# Find near-duplicate names\n",
"duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
" # Find near-duplicate names\n",
" duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
"\n",
"print(\"Potential duplicate groups:\")\n",
"for i, group in enumerate(duplicate_groups, 1):\n",
" print(\"Potential duplicate groups:\")\n",
" for i, group in enumerate(duplicate_groups, 1):\n",
" print(f\"\\nGroup {i}:\")\n",
" for name in group:\n",
" matching_rows = dirty_data[dirty_data['name'] == name]\n",
" print(f\" '{name}': {len(matching_rows)} occurrence(s)\")\n",
" for _, row in matching_rows.iterrows():\n",
" print(f\" - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
" print(f\" - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")\n",
"except ImportError:\n",
" print(\"rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.\")"
]
},
{

Loading…
Cancel
Save