|
|
|
|
@ -3997,6 +3997,7 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"try:\n",
|
|
|
|
|
" from rapidfuzz import process, fuzz\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Function to find potential duplicates\n",
|
|
|
|
|
@ -4039,7 +4040,9 @@
|
|
|
|
|
" matching_rows = dirty_data[dirty_data['name'] == name]\n",
|
|
|
|
|
" print(f\" '{name}': {len(matching_rows)} occurrence(s)\")\n",
|
|
|
|
|
" for _, row in matching_rows.iterrows():\n",
|
|
|
|
|
" print(f\" - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
|
|
|
|
|
" print(f\" - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")\n",
|
|
|
|
|
"except ImportError:\n",
|
|
|
|
|
" print(\"rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
|