Update 2-Working-With-Data/08-data-preparation/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
copilot/fix-e446e3a1-6b4c-4310-87d5-641ed6823a37
Lee Stott 2 months ago committed by GitHub
parent 37aa09a160
commit 186327158e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -3815,24 +3815,29 @@
"metadata": {},
"outputs": [],
"source": [
"# rapidfuzz was already imported in an earlier cell\n",
"from rapidfuzz import process, fuzz\n",
"\n",
"\n",
"try:\n",
" from rapidfuzz import process, fuzz\n",
"except ImportError:\n",
" print(\"rapidfuzz is not installed. Please install it with 'pip install rapidfuzz' to use fuzzy matching.\")\n",
" process = None\n",
" fuzz = None\n",
"\n",
"# Get unique countries\n",
"unique_countries = dirty_data['country'].unique()\n",
"\n",
"# For each country, find similar matches\n",
"print(\"Finding similar country names (similarity > 70%):\")\n",
"for country in unique_countries:\n",
" matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
" # Filter matches with similarity > 70 and not identical\n",
" similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
" if similar:\n",
" print(f\"\\n'{country}' is similar to:\")\n",
" for match, score, _ in similar:\n",
" print(f\" - '{match}' (similarity: {score}%)\")"
"if process is not None and fuzz is not None:\n",
" print(\"Finding similar country names (similarity > 70%):\")\n",
" for country in unique_countries:\n",
" matches = process.extract(country, unique_countries, scorer=fuzz.ratio, limit=3)\n",
" # Filter matches with similarity > 70 and not identical\n",
" similar = [m for m in matches if m[1] > 70 and m[0] != country]\n",
" if similar:\n",
" print(f\"\\n'{country}' is similar to:\")\n",
" for match, score, _ in similar:\n",
" print(f\" - '{match}' (similarity: {score}%)\")\n",
"else:\n",
" print(\"Skipping fuzzy matching because rapidfuzz is not available.\")"
]
},
{

Loading…
Cancel
Save