|
|
|
|
@ -3909,18 +3909,21 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"from scipy import stats\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Calculate Z-scores for age\n",
|
|
|
|
|
"dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
|
|
|
|
|
"try:\n",
|
|
|
|
|
" from scipy import stats\n",
|
|
|
|
|
"except ImportError:\n",
|
|
|
|
|
" print(\"scipy is required for Z-score calculation. Please install it with 'pip install scipy' and rerun this cell.\")\n",
|
|
|
|
|
"else:\n",
|
|
|
|
|
" # Calculate Z-scores for age\n",
|
|
|
|
|
" dirty_data['age_zscore'] = np.abs(stats.zscore(dirty_data['age']))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Typically, Z-score > 3 indicates an outlier\n",
|
|
|
|
|
"print(\"Rows with age Z-score > 3:\")\n",
|
|
|
|
|
"zscore_outliers = dirty_data[dirty_data['age_zscore'] > 3]\n",
|
|
|
|
|
"print(zscore_outliers[['customer_id', 'name', 'age', 'age_zscore']])\n",
|
|
|
|
|
" # Typically, Z-score > 3 indicates an outlier\n",
|
|
|
|
|
" print(\"Rows with age Z-score > 3:\")\n",
|
|
|
|
|
" zscore_outliers = dirty_data[dirty_data['age_zscore'] > 3]\n",
|
|
|
|
|
" print(zscore_outliers[['customer_id', 'name', 'age', 'age_zscore']])\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Clean up the temporary column\n",
|
|
|
|
|
"dirty_data = dirty_data.drop('age_zscore', axis=1)"
|
|
|
|
|
" # Clean up the temporary column\n",
|
|
|
|
|
" dirty_data = dirty_data.drop('age_zscore', axis=1)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
|