@@ -3997,49 +3997,52 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from rapidfuzz import process, fuzz\n",
+    "try:\n",
+    "    from rapidfuzz import process, fuzz\n",
     "\n",
-    "# Function to find potential duplicates\n",
-    "def find_near_duplicates(df, column, threshold=90):\n",
-    "    \"\"\"\n",
-    "    Find near-duplicate entries in a column using fuzzy matching.\n",
-    "    \n",
-    "    Parameters:\n",
-    "    - df: DataFrame\n",
-    "    - column: Column name to check for duplicates\n",
-    "    - threshold: Similarity threshold (0-100)\n",
-    "    \n",
-    "    Returns: List of potential duplicate groups\n",
-    "    \"\"\"\n",
-    "    values = df[column].unique()\n",
-    "    duplicate_groups = []\n",
-    "    checked = set()\n",
-    "    \n",
-    "    for value in values:\n",
-    "        if value in checked:\n",
-    "            continue\n",
-    "        \n",
-    "        # Find similar values\n",
-    "        matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
-    "        similar = [m[0] for m in matches if m[1] >= threshold]\n",
-    "        \n",
-    "        if len(similar) > 1:\n",
-    "            duplicate_groups.append(similar)\n",
-    "            checked.update(similar)\n",
-    "    \n",
-    "    return duplicate_groups\n",
+    "    # Function to find potential duplicates\n",
+    "    def find_near_duplicates(df, column, threshold=90):\n",
+    "        \"\"\"\n",
+    "        Find near-duplicate entries in a column using fuzzy matching.\n",
+    "        \n",
+    "        Parameters:\n",
+    "        - df: DataFrame\n",
+    "        - column: Column name to check for duplicates\n",
+    "        - threshold: Similarity threshold (0-100)\n",
+    "        \n",
+    "        Returns: List of potential duplicate groups\n",
+    "        \"\"\"\n",
+    "        values = df[column].unique()\n",
+    "        duplicate_groups = []\n",
+    "        checked = set()\n",
+    "        \n",
+    "        for value in values:\n",
+    "            if value in checked:\n",
+    "                continue\n",
+    "            \n",
+    "            # Find similar values\n",
+    "            matches = process.extract(value, values, scorer=fuzz.ratio, limit=len(values))\n",
+    "            similar = [m[0] for m in matches if m[1] >= threshold]\n",
+    "            \n",
+    "            if len(similar) > 1:\n",
+    "                duplicate_groups.append(similar)\n",
+    "                checked.update(similar)\n",
+    "        \n",
+    "        return duplicate_groups\n",
     "\n",
-    "# Find near-duplicate names\n",
-    "duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
+    "    # Find near-duplicate names\n",
+    "    duplicate_groups = find_near_duplicates(dirty_data, 'name', threshold=90)\n",
     "\n",
-    "print(\"Potential duplicate groups:\")\n",
-    "for i, group in enumerate(duplicate_groups, 1):\n",
-    "    print(f\"\\nGroup {i}:\")\n",
-    "    for name in group:\n",
-    "        matching_rows = dirty_data[dirty_data['name'] == name]\n",
-    "        print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
-    "        for _, row in matching_rows.iterrows():\n",
-    "            print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")"
+    "    print(\"Potential duplicate groups:\")\n",
+    "    for i, group in enumerate(duplicate_groups, 1):\n",
+    "        print(f\"\\nGroup {i}:\")\n",
+    "        for name in group:\n",
+    "            matching_rows = dirty_data[dirty_data['name'] == name]\n",
+    "            print(f\"  '{name}': {len(matching_rows)} occurrence(s)\")\n",
+    "            for _, row in matching_rows.iterrows():\n",
+    "                print(f\"    - Customer {row['customer_id']}: age={row['age']}, country={row['country']}\")\n",
+    "except ImportError:\n",
+    "    print(\"rapidfuzz is not installed. Skipping fuzzy matching for near-duplicates.\")"
    ]
   },
   {