|
|
|
@ -1630,12 +1630,12 @@
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "MY5faq4yLdpQ",
|
|
|
|
|
"outputId": "c3838b07-0d15-471e-8dad-370de91d4bdc",
|
|
|
|
|
"colab": {
|
|
|
|
|
"base_uri": "https://localhost:8080/",
|
|
|
|
|
"height": 204
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"id": "MY5faq4yLdpQ",
|
|
|
|
|
"outputId": "c3838b07-0d15-471e-8dad-370de91d4bdc"
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
"fill_with_mode = pd.DataFrame([[1,2,\"True\"],\n",
|
|
|
|
@ -1736,11 +1736,11 @@
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "WKy-9Y2tN5jv",
|
|
|
|
|
"outputId": "41f5064e-502d-4aec-dc2d-86f885068b4f",
|
|
|
|
|
"colab": {
|
|
|
|
|
"base_uri": "https://localhost:8080/"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"id": "WKy-9Y2tN5jv",
|
|
|
|
|
"outputId": "41f5064e-502d-4aec-dc2d-86f885068b4f"
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
"fill_with_mode[2].value_counts()"
|
|
|
|
@ -1784,12 +1784,12 @@
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "tvas7c9_OPWE",
|
|
|
|
|
"outputId": "7282c4f7-0e59-4398-b4f2-5919baf61164",
|
|
|
|
|
"colab": {
|
|
|
|
|
"base_uri": "https://localhost:8080/",
|
|
|
|
|
"height": 204
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"id": "tvas7c9_OPWE",
|
|
|
|
|
"outputId": "7282c4f7-0e59-4398-b4f2-5919baf61164"
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
"fill_with_mode"
|
|
|
|
@ -1894,19 +1894,252 @@
|
|
|
|
|
"\n",
|
|
|
|
|
"We replace with the median in the case of skewed data with outliers. This is because the median is robust to outliers.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"When the data is normally distributed, we can use the mean, as in that case the mean and median would be pretty close."
|
|
|
|
|
"When the data is normally distributed, we can use the mean, as in that case the mean and median would be pretty close.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"First, let us take a column which is normally distributed and let us fill the missing value with the mean of the column. "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "09HM_2feOj5Y"
|
|
|
|
|
"colab": {
|
|
|
|
|
"base_uri": "https://localhost:8080/",
|
|
|
|
|
"height": 204
|
|
|
|
|
},
|
|
|
|
|
"id": "09HM_2feOj5Y",
|
|
|
|
|
"outputId": "ade42fec-dc40-45d0-e22c-974849ea8664"
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
""
|
|
|
|
|
"fill_with_mean = pd.DataFrame([[-2,0,1],\n",
|
|
|
|
|
" [-1,2,3],\n",
|
|
|
|
|
" [np.nan,4,5],\n",
|
|
|
|
|
" [1,6,7],\n",
|
|
|
|
|
" [2,8,9]])\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"fill_with_mean"
|
|
|
|
|
],
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"outputs": []
|
|
|
|
|
"execution_count": 33,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "execute_result",
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>-2.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>6</td>\n",
|
|
|
|
|
" <td>7</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>8</td>\n",
|
|
|
|
|
" <td>9</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" 0 1 2\n",
|
|
|
|
|
"0 -2.0 0 1\n",
|
|
|
|
|
"1 -1.0 2 3\n",
|
|
|
|
|
"2 NaN 4 5\n",
|
|
|
|
|
"3 1.0 6 7\n",
|
|
|
|
|
"4 2.0 8 9"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"execution_count": 33
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "ka7-wNfzSxbx"
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
"The mean of the column is"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "XYtYEf5BSxFL",
|
|
|
|
|
"outputId": "1e79aeea-6baf-4572-dcd1-23e5ec742036",
|
|
|
|
|
"colab": {
|
|
|
|
|
"base_uri": "https://localhost:8080/"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
"np.mean(fill_with_mean[0])"
|
|
|
|
|
],
|
|
|
|
|
"execution_count": 34,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "execute_result",
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"0.0"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"execution_count": 34
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "oBSRGxKRS39K"
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
"Filling with mean"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "FzncQLmuS5jh",
|
|
|
|
|
"outputId": "75f33b25-e6b3-41bb-8049-1ed2e085efe2",
|
|
|
|
|
"colab": {
|
|
|
|
|
"base_uri": "https://localhost:8080/",
|
|
|
|
|
"height": 204
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
"fill_with_mean[0].fillna(np.mean(fill_with_mean[0]),inplace=True)\n",
|
|
|
|
|
"fill_with_mean"
|
|
|
|
|
],
|
|
|
|
|
"execution_count": 35,
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "execute_result",
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>-2.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>6</td>\n",
|
|
|
|
|
" <td>7</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>8</td>\n",
|
|
|
|
|
" <td>9</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" 0 1 2\n",
|
|
|
|
|
"0 -2.0 0 1\n",
|
|
|
|
|
"1 -1.0 2 3\n",
|
|
|
|
|
"2 0.0 4 5\n",
|
|
|
|
|
"3 1.0 6 7\n",
|
|
|
|
|
"4 2.0 8 9"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"execution_count": 35
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"id": "CwpVFCrPTC5z"
|
|
|
|
|
},
|
|
|
|
|
"source": [
|
|
|
|
|
"As we can see, the missing value has been replaced with the mean of the column."
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|