|
|
|
|
@ -3815,24 +3815,29 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# rapidfuzz was already imported in an earlier cell\n",
|
|
|
|
|
"from rapidfuzz import process, fuzz\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"try:\n",
|
|
|
|
|
" from rapidfuzz import process, fuzz\n",
|
|
|
|
|
"except ImportError:\n",
|
|
|
|
|
" print(\"rapidfuzz is not installed. Please install it with 'pip install rapidfuzz' to use fuzzy matching.\")\n",
|
|
|
|
|
" process = None\n",
|
|
|
|
|
" fuzz = None\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Get unique countries\n",
|
|
|
|
|
"unique_countries = dirty_data['country'].unique()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# For each country, find similar matches\n",
|
|
|
|
|
"print(\"Finding similar country names (similarity > 70%):\")\n",
|
|
|
|
|
"for country in unique_countries:\n",
|
|
|
|
|
" matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
|
|
|
|
|
" # Filter matches with similarity > 70 and not identical\n",
|
|
|
|
|
" similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
|
|
|
|
|
" if similar:\n",
|
|
|
|
|
" print(f\"\\n'{country}' is similar to:\")\n",
|
|
|
|
|
" for match, score, _ in similar:\n",
|
|
|
|
|
" print(f\" - '{match}' (similarity: {score}%)\")"
|
|
|
|
|
"if process is not None and fuzz is not None:\n",
|
|
|
|
|
" print(\"Finding similar country names (similarity > 70%):\")\n",
|
|
|
|
|
" for country in unique_countries:\n",
|
|
|
|
|
" matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
|
|
|
|
|
" # Filter matches with similarity > 70 and not identical\n",
|
|
|
|
|
" similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
|
|
|
|
|
" if similar:\n",
|
|
|
|
|
" print(f\"\\n'{country}' is similar to:\")\n",
|
|
|
|
|
" for match, score, _ in similar:\n",
|
|
|
|
|
" print(f\" - '{match}' (similarity: {score}%)\")\n",
|
|
|
|
|
"else:\n",
|
|
|
|
|
" print(\"Skipping fuzzy matching because rapidfuzz is not available.\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
|