{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016863,
"end_time": "2021-03-25T14:47:56.519370",
"exception": false,
"start_time": "2021-03-25T14:47:56.502507",
"status": "completed"
},
"tags": []
},
"source": [
"## Overview\n",
"This compared to the [99 accurate model](https://www.kaggle.com/nigelhenry/simple-99-accurate-floor-model) is a more of a brute force approach,added on with a bit of error analysis & post processing\n",
"\n",
"So i studied Kouki's [LSTM](https://www.kaggle.com/kokitanisaka/lstm-by-keras-with-unified-wi-fi-feats) that utilizes [the unified Wi-Fi dataset](https://www.kaggle.com/kokitanisaka/indoorunifiedwifids).<br>\n",
"and i found it rather intersting that it could score so well on the xy, but the floor prediction was never improving as it was pretty stable after a few epochs.\n",
"How could it be soo good for the xy and not for the floor?\n",
"\n",
"Didn't seem right so i set to work on this model:\n",
"\n",
"\n",
"I know there is already a great floor predicting model out there that got already nearly 99% but seeing how competition is heating up, every decimal counts.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"_kg_hide-input": true,
"execution": {
"iopub.execute_input": "2021-03-25T14:47:56.559230Z",
"iopub.status.busy": "2021-03-25T14:47:56.558432Z",
"iopub.status.idle": "2021-03-25T14:48:03.762244Z",
"shell.execute_reply": "2021-03-25T14:48:03.761338Z"
},
"papermill": {
"duration": 7.227414,
"end_time": "2021-03-25T14:48:03.762445",
"exception": false,
"start_time": "2021-03-25T14:47:56.535031",
"status": "completed"
},
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import scipy.stats as stats\n",
"from pathlib import Path\n",
"import glob\n",
"import pickle\n",
"import random\n",
"import os\n",
"\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import train_test_split\n",
"import tensorflow as tf\n",
"import tensorflow.keras.layers as L\n",
"import tensorflow.keras.models as M\n",
"import tensorflow.keras.backend as K\n",
"# import tensorflow_addons as tfa\n",
"# from tensorflow_addons.layers import WeightNormalization\n",
"from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016009,
"end_time": "2021-03-25T14:48:03.794104",
"exception": false,
"start_time": "2021-03-25T14:48:03.778095",
"status": "completed"
},
"tags": []
},
"source": [
"Kouki's awsome code for preprocessing, is hidden below;"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"_kg_hide-input": true,
"execution": {
"iopub.execute_input": "2021-03-25T14:48:03.847184Z",
"iopub.status.busy": "2021-03-25T14:48:03.846460Z",
"iopub.status.idle": "2021-03-25T14:49:05.017501Z",
"shell.execute_reply": "2021-03-25T14:49:05.016793Z"
},
"papermill": {
"duration": 61.208277,
"end_time": "2021-03-25T14:49:05.017676",
"exception": false,
"start_time": "2021-03-25T14:48:03.809399",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BSSID TYPES: 61206\n",
"BSSID TYPES: 33042\n"
]
}
],
"source": [
"# options\n",
"\n",
"N_SPLITS = 5\n",
"\n",
"SEED = 2021\n",
"\n",
"NUM_FEATS = 20 # number of features that we use. there are 100 feats but we don't need to use all of them\n",
"\n",
"base_path = '../'\n",
"\n",
"def set_seed(seed=42):\n",
" random.seed(seed)\n",
" os.environ['PYTHONHASHSEED'] = str(seed)\n",
" np.random.seed(seed)\n",
" tf.random.set_seed(seed)\n",
" session_conf = tf.compat.v1.ConfigProto(\n",
" intra_op_parallelism_threads=1,\n",
" inter_op_parallelism_threads=1\n",
" )\n",
" sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)\n",
" tf.compat.v1.keras.backend.set_session(sess)\n",
" \n",
"def comp_metric(xhat, yhat, fhat, x, y, f):\n",
" intermediate = np.sqrt(np.power(xhat-x, 2) + np.power(yhat-y, 2)) + 15 * np.abs(fhat-f)\n",
" return intermediate.sum()/xhat.shape[0]\n",
"\n",
"feature_dir = f\"{base_path}/input/indoor-unified-wifi-ds\"\n",
"train_files = sorted(glob.glob(os.path.join(feature_dir, '*_train.csv')))\n",
"test_files = sorted(glob.glob(os.path.join(feature_dir, '*_test.csv')))\n",
"subm = pd.read_csv(f'{base_path}/input/indoor-location-navigation/sample_submission.csv', index_col=0)\n",
"\n",
"with open(f'{feature_dir}/train_all.pkl', 'rb') as f:\n",
" data = pickle.load( f)\n",
"\n",
"with open(f'{feature_dir}/test_all.pkl', 'rb') as f:\n",
" test_data = pickle.load(f)\n",
"\n",
"\n",
"# training target features\n",
"\n",
"BSSID_FEATS = [f'bssid_{i}' for i in range(NUM_FEATS)]\n",
"RSSI_FEATS = [f'rssi_{i}' for i in range(NUM_FEATS)]\n",
"\n",
"\n",
"# get numbers of bssids to embed them in a layer\n",
"\n",
"wifi_bssids = []\n",
"for i in range(100):\n",
" wifi_bssids.extend(data.iloc[:,i].values.tolist())\n",
"wifi_bssids = list(set(wifi_bssids))\n",
"\n",
"wifi_bssids_size = len(wifi_bssids)\n",
"print(f'BSSID TYPES: {wifi_bssids_size}')\n",
"\n",
"wifi_bssids_test = []\n",
"for i in range(100):\n",
" wifi_bssids_test.extend(test_data.iloc[:,i].values.tolist())\n",
"wifi_bssids_test = list(set(wifi_bssids_test))\n",
"\n",
"wifi_bssids_size = len(wifi_bssids_test)\n",
"print(f'BSSID TYPES: {wifi_bssids_size}')\n",
"\n",
"wifi_bssids.extend(wifi_bssids_test)\n",
"wifi_bssids_size = len(wifi_bssids)\n",
"\n",
"# preprocess\n",
"\n",
"le = LabelEncoder()\n",
"le.fit(wifi_bssids)\n",
"le_site = LabelEncoder()\n",
"le_site.fit(data['site_id'])\n",
"\n",
"ss = StandardScaler()\n",
"ss.fit(data.loc[:,RSSI_FEATS])\n",
"\n",
"\n",
"data.loc[:,RSSI_FEATS] = ss.transform(data.loc[:,RSSI_FEATS])\n",
"for i in BSSID_FEATS:\n",
" data.loc[:,i] = le.transform(data.loc[:,i])\n",
" data.loc[:,i] = data.loc[:,i] + 1\n",
" \n",
"data.loc[:, 'site_id'] = le_site.transform(data.loc[:, 'site_id'])\n",
"\n",
"data.loc[:,RSSI_FEATS] = ss.transform(data.loc[:,RSSI_FEATS])\n",
"\n",
"test_data.loc[:,RSSI_FEATS] = ss.transform(test_data.loc[:,RSSI_FEATS])\n",
"for i in BSSID_FEATS:\n",
" test_data.loc[:,i] = le.transform(test_data.loc[:,i])\n",
" test_data.loc[:,i] = test_data.loc[:,i] + 1\n",
" \n",
"test_data.loc[:, 'site_id'] = le_site.transform(test_data.loc[:, 'site_id'])\n",
"\n",
"test_data.loc[:,RSSI_FEATS] = ss.transform(test_data.loc[:,RSSI_FEATS])\n",
"\n",
"\n",
"site_count = len(data['site_id'].unique())\n",
"data.reset_index(drop=True, inplace=True)\n"
]
},
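{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the `comp_metric` defined above (a minimal sketch with made-up numbers, assuming the preprocessing cell has been run): a 3/4 position error costs 5, and every missed floor adds 15.\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"xhat, yhat, fhat = np.array([3.0]), np.array([4.0]), np.array([1])\n",
"x, y, f = np.array([0.0]), np.array([0.0]), np.array([0])\n",
"print(comp_metric(xhat, yhat, fhat, x, y, f))  # sqrt(9 + 16) + 15 * 1 = 20.0\n",
"```"
]
},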
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016003,
"end_time": "2021-03-25T14:49:05.050212",
"exception": false,
"start_time": "2021-03-25T14:49:05.034209",
"status": "completed"
},
"tags": []
},
"source": [
"## The Reajusted model\n",
"The floor predictions wee being made by a softmax layer with just one dense unit, a pretty easy error to make and a difficult one to spot since there are two exits, and therefor the unit was prediciting between floors 1-0 (almost always one). So i set out to, test knowledge and fix it for the floor predictions and make a modol for them. \n",
"\n",
"I used one hot encoding( there are a total of 11 cats)with categorical loss and Sigmoid activation unit for the last layer & added a bit of bilateral firing power for an over kill and came out with the accurate following result:\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:05.094400Z",
"iopub.status.busy": "2021-03-25T14:49:05.093565Z",
"iopub.status.idle": "2021-03-25T14:49:05.096346Z",
"shell.execute_reply": "2021-03-25T14:49:05.095866Z"
},
"papermill": {
"duration": 0.030636,
"end_time": "2021-03-25T14:49:05.096485",
"exception": false,
"start_time": "2021-03-25T14:49:05.065849",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"#FLOOR\n",
"def create_fmodel(input_data):\n",
"\n",
" # bssid feats\n",
" input_dim = input_data[0].shape[1]\n",
"\n",
" input_embd_layer = L.Input(shape=(input_dim,))\n",
" x1 = L.Embedding(wifi_bssids_size, 64)(input_embd_layer) # use the embedding for bssid.\n",
" x1 = L.Flatten()(x1) # Return a copy of the array collapsed into one dimension.\n",
"\n",
" # rssi feats\n",
" input_dim = input_data[1].shape[1]\n",
"\n",
" input_layer = L.Input(input_dim, )\n",
" x2 = L.BatchNormalization()(input_layer)\n",
" x2 = L.Dense(NUM_FEATS * 64, activation='relu')(x2)\n",
"\n",
" # site\n",
" input_site_layer = L.Input(shape=(1,))\n",
" x3 = L.Embedding(site_count, 2)(input_site_layer)\n",
" x3 = L.Flatten()(x3)\n",
"\n",
"\n",
" # main stream\n",
" x = L.Concatenate(axis=1)([x1, x3, x2])\n",
"\n",
"\n",
" x = L.Reshape((1, -1))(x)\n",
" x = L.BatchNormalization()(x)\n",
" mod1=L.LSTM(256, dropout=0.4, recurrent_dropout=0.3, return_sequences=True, activation='tanh')\n",
" x = L.Bidirectional(mod1)(x)\n",
" x = L.Bidirectional(L.LSTM(32, dropout=0.4, return_sequences=False, activation='relu'))(x)\n",
" x = L.BatchNormalization()(x)\n",
" x = L.Dense(16, activation='tanh')(x) \n",
" \n",
" output_layer_1 = L.Dense(11, activation='softmax', name='floor')(x) \n",
"\n",
" model = M.Model([input_embd_layer, input_layer, input_site_layer], \n",
" [output_layer_1])\n",
"\n",
" model.compile(optimizer=tf.optimizers.Adam(lr=0.001),\n",
" loss=tf.keras.losses.CategoricalCrossentropy(), metrics=['mse','accuracy'])\n",
"\n",
" return model\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:05.139349Z",
"iopub.status.busy": "2021-03-25T14:49:05.138667Z",
"iopub.status.idle": "2021-03-25T14:49:05.142395Z",
"shell.execute_reply": "2021-03-25T14:49:05.141732Z"
},
"papermill": {
"duration": 0.030252,
"end_time": "2021-03-25T14:49:05.142541",
"exception": false,
"start_time": "2021-03-25T14:49:05.112289",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"data.index=data['path']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:05.181621Z",
"iopub.status.busy": "2021-03-25T14:49:05.180900Z",
"iopub.status.idle": "2021-03-25T14:49:05.188225Z",
"shell.execute_reply": "2021-03-25T14:49:05.187710Z"
},
"papermill": {
"duration": 0.029352,
"end_time": "2021-03-25T14:49:05.188372",
"exception": false,
"start_time": "2021-03-25T14:49:05.159020",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"#OneHot The floor\n",
"one_hot=pd.get_dummies(data['floor'])"
]
},
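{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `-2` shift used later when decoding predictions relies on `pd.get_dummies` sorting the floor columns in ascending order, so column 0 corresponds to floor -2 and column 10 to floor 8 (11 classes). A minimal round-trip check (a sketch, assuming the floor labels really span -2..8):\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"print(one_hot.columns.tolist())                 # expected: [-2, -1, 0, ..., 8]\n",
"decoded = np.argmax(one_hot.values, axis=1) - 2\n",
"print((decoded == data['floor'].values).all())  # True if the floors span -2..8 with no gaps\n",
"```"
]
},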
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:05.245543Z",
"iopub.status.busy": "2021-03-25T14:49:05.244869Z",
"iopub.status.idle": "2021-03-25T14:49:06.290893Z",
"shell.execute_reply": "2021-03-25T14:49:06.290022Z"
},
"papermill": {
"duration": 1.086278,
"end_time": "2021-03-25T14:49:06.291076",
"exception": false,
"start_time": "2021-03-25T14:49:05.204798",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"(10352, 500)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#500 Random, totally unseen paths\n",
"val_p_ind=pd.DataFrame(data.path.unique()).sample(n=500,random_state=1).values.reshape((-1)) #100%/500samples so accuracy of preicision estimate should be around 0.2 % so +/- 0,1 % \n",
"t_idx = data.path.unique().tolist() \n",
"t_idx=[ a for a in t_idx if a not in val_p_ind.tolist()]\n",
"\n",
"train_data=data.loc[t_idx]\n",
"X_ass_val= data.loc[val_p_ind]\n",
"len(t_idx),len(val_p_ind)"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016725,
"end_time": "2021-03-25T14:49:06.324749",
"exception": false,
"start_time": "2021-03-25T14:49:06.308024",
"status": "completed"
},
"tags": []
},
"source": [
"It is important that the paths are unseen for reasons shown in the error analysis\n",
"otherwise in the post processing i would have data leakage."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:06.404418Z",
"iopub.status.busy": "2021-03-25T14:49:06.403735Z",
"iopub.status.idle": "2021-03-25T14:49:06.421984Z",
"shell.execute_reply": "2021-03-25T14:49:06.421429Z"
},
"papermill": {
"duration": 0.080496,
"end_time": "2021-03-25T14:49:06.422117",
"exception": false,
"start_time": "2021-03-25T14:49:06.341621",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bssid_0</th>\n",
" <th>bssid_1</th>\n",
" <th>bssid_2</th>\n",
" <th>bssid_3</th>\n",
" <th>bssid_4</th>\n",
" <th>bssid_5</th>\n",
" <th>bssid_6</th>\n",
" <th>bssid_7</th>\n",
" <th>bssid_8</th>\n",
" <th>bssid_9</th>\n",
" <th>...</th>\n",
" <th>rssi_95</th>\n",
" <th>rssi_96</th>\n",
" <th>rssi_97</th>\n",
" <th>rssi_98</th>\n",
" <th>rssi_99</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>floor</th>\n",
" <th>path</th>\n",
" <th>site_id</th>\n",
" </tr>\n",
" <tr>\n",
" <th>path</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows × 205 columns</p>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [bssid_0, bssid_1, bssid_2, bssid_3, bssid_4, bssid_5, bssid_6, bssid_7, bssid_8, bssid_9, bssid_10, bssid_11, bssid_12, bssid_13, bssid_14, bssid_15, bssid_16, bssid_17, bssid_18, bssid_19, bssid_20, bssid_21, bssid_22, bssid_23, bssid_24, bssid_25, bssid_26, bssid_27, bssid_28, bssid_29, bssid_30, bssid_31, bssid_32, bssid_33, bssid_34, bssid_35, bssid_36, bssid_37, bssid_38, bssid_39, bssid_40, bssid_41, bssid_42, bssid_43, bssid_44, bssid_45, bssid_46, bssid_47, bssid_48, bssid_49, bssid_50, bssid_51, bssid_52, bssid_53, bssid_54, bssid_55, bssid_56, bssid_57, bssid_58, bssid_59, bssid_60, bssid_61, bssid_62, bssid_63, bssid_64, bssid_65, bssid_66, bssid_67, bssid_68, bssid_69, bssid_70, bssid_71, bssid_72, bssid_73, bssid_74, bssid_75, bssid_76, bssid_77, bssid_78, bssid_79, bssid_80, bssid_81, bssid_82, bssid_83, bssid_84, bssid_85, bssid_86, bssid_87, bssid_88, bssid_89, bssid_90, bssid_91, bssid_92, bssid_93, bssid_94, bssid_95, bssid_96, bssid_97, bssid_98, bssid_99, ...]\n",
"Index: []\n",
"\n",
"[0 rows x 205 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#check there is no cross contamination of the validation data\n",
"train_data[train_data['path']==val_p_ind[5]]"
]
},
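{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above spot-checks a single validation path; here is a sketch (not part of the original workflow) that checks the whole split at once with a set intersection:\n",
"\n",
"```python\n",
"train_paths = set(train_data['path'].unique())\n",
"val_paths = set(X_ass_val['path'].unique())\n",
"print(len(train_paths & val_paths))  # 0 means no validation path leaks into training\n",
"```"
]
},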
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:06.474967Z",
"iopub.status.busy": "2021-03-25T14:49:06.472331Z",
"iopub.status.idle": "2021-03-25T15:20:04.415833Z",
"shell.execute_reply": "2021-03-25T15:20:04.416354Z"
},
"papermill": {
"duration": 1857.975369,
"end_time": "2021-03-25T15:20:04.416548",
"exception": false,
"start_time": "2021-03-25T14:49:06.441179",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/100\n",
"1923/1923 [==============================] - 267s 134ms/step - loss: 0.6500 - mse: 0.0233 - accuracy: 0.7893 - val_loss: 0.0709 - val_mse: 0.0027 - val_accuracy: 0.9812\n",
"Epoch 2/100\n",
"1923/1923 [==============================] - 268s 140ms/step - loss: 0.0238 - mse: 8.7623e-04 - accuracy: 0.9929 - val_loss: 0.0539 - val_mse: 0.0020 - val_accuracy: 0.9856\n",
"Epoch 3/100\n",
"1923/1923 [==============================] - 263s 137ms/step - loss: 0.0116 - mse: 4.6434e-04 - accuracy: 0.9967 - val_loss: 0.0656 - val_mse: 0.0023 - val_accuracy: 0.9851\n",
"Epoch 4/100\n",
"1923/1923 [==============================] - 266s 138ms/step - loss: 0.0052 - mse: 2.1056e-04 - accuracy: 0.9985 - val_loss: 0.0648 - val_mse: 0.0021 - val_accuracy: 0.9856\n",
"Epoch 5/100\n",
"1923/1923 [==============================] - 259s 135ms/step - loss: 0.0054 - mse: 2.1736e-04 - accuracy: 0.9985 - val_loss: 0.0629 - val_mse: 0.0018 - val_accuracy: 0.9891\n",
"\n",
"Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.\n",
"Epoch 6/100\n",
"1923/1923 [==============================] - 260s 135ms/step - loss: 0.0023 - mse: 8.6135e-05 - accuracy: 0.9994 - val_loss: 0.0611 - val_mse: 0.0017 - val_accuracy: 0.9898\n",
"Epoch 7/100\n",
"1923/1923 [==============================] - 262s 136ms/step - loss: 8.9690e-04 - mse: 2.9825e-05 - accuracy: 0.9998 - val_loss: 0.0674 - val_mse: 0.0018 - val_accuracy: 0.9894\n"
]
},
{
"data": {
"text/plain": [
"0.9855541718555417"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_trainf = one_hot.loc[t_idx, :]\n",
"y_validf = one_hot.loc[val_p_ind, :]\n",
"X_train = train_data.loc[:, BSSID_FEATS + RSSI_FEATS + ['site_id']]\n",
"X_valid = X_ass_val.loc[:, BSSID_FEATS + RSSI_FEATS + ['site_id']]\n",
"fmodel = create_fmodel([X_train.loc[:,BSSID_FEATS], X_train.loc[:,RSSI_FEATS], X_train.loc[:,'site_id']])\n",
"# model = multi_gpu_model(model, 1)\n",
"fmodel.fit([X_train.loc[:,BSSID_FEATS], X_train.loc[:,RSSI_FEATS], X_train.loc[:,'site_id']], y_trainf, \n",
" validation_data=([X_valid.loc[:,BSSID_FEATS], X_valid.loc[:,RSSI_FEATS], X_valid.loc[:,'site_id']], y_validf), \n",
" batch_size=128, epochs=100\n",
" ,shuffle=True\n",
" ,callbacks=[\n",
" ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')\n",
" , ModelCheckpoint(f'{base_path}/RNN_{SEED}_.hdf5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')\n",
" , EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, mode='min', baseline=None, restore_best_weights=True)\n",
" ]\n",
" )\n",
"\n",
"fmodel.load_weights(f'{base_path}/RNN_{SEED}_.hdf5')\n",
"fvalid = fmodel.predict([X_ass_val.loc[:,BSSID_FEATS], X_ass_val.loc[:,RSSI_FEATS], X_ass_val.loc[:,'site_id']])#minus two is make the interval [-2:8] again\n",
"fvalid = np.argmax(fvalid, axis=1)-2\n",
"# ass_val_arr[:, fold] = fvalid\n",
"\n",
"pred = fmodel.predict([test_data.loc[:,BSSID_FEATS], test_data.loc[:,RSSI_FEATS], test_data.loc[:,'site_id']]) # test_data.iloc[:, :-1])\n",
"pred =np.argmax(pred, axis=1)-2#minus two is make the interval [-2:8] again\n",
"# preds_f_arr[:, fold] = pred\n",
"\n",
"ass_val_floors=fvalid\n",
"floors=pred\n",
" \n",
"accuracy_score(X_ass_val['floor'], ass_val_floors)#second validation, checks the argmax and shifting"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 4.042726,
"end_time": "2021-03-25T15:20:12.587771",
"exception": false,
"start_time": "2021-03-25T15:20:08.545045",
"status": "completed"
},
"tags": []
},
"source": [
"## Error Analysis "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T15:20:20.857222Z",
"iopub.status.busy": "2021-03-25T15:20:20.856625Z",
"iopub.status.idle": "2021-03-25T15:20:21.171213Z",
"shell.execute_reply": "2021-03-25T15:20:21.170489Z"
},
"papermill": {
"duration": 4.441317,
"end_time": "2021-03-25T15:20:21.171371",
"exception": false,
"start_time": "2021-03-25T15:20:16.730054",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"((174, 206), (38,))"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Error Analysis - how many paths i got wrong and how many times \n",
"X_ass_val['wrong']=(X_ass_val['floor']- ass_val_floors)!=0\n",
"wrongs= X_ass_val[X_ass_val['wrong']==True]\n",
"rights= X_ass_val[X_ass_val['wrong']==False]\n",
"wrongs.shape, wrongs['path'].unique().shape"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 4.138347,
"end_time": "2021-03-25T15:20:29.362908",
"exception": false,
"start_time": "2021-03-25T15:20:25.224561",
"status": "completed"
},
"tags": []
},
"source": [
"**Only one floor per path right** ? \n",
"well i think this is a given as the original data is presented as in paths within the floors file;\n",
"I double checked this assumptionto be true."
]
},
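{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to double-check that assumption programmatically (a minimal sketch using the `data` frame loaded above; `reset_index` avoids the ambiguity of `path` being both the index and a column):\n",
"\n",
"```python\n",
"floors_per_path = data.reset_index(drop=True).groupby('path')['floor'].nunique()\n",
"print(floors_per_path.max())  # 1 means every path stays on a single floor\n",
"```"
]
},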
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 4.110906,
"end_time": "2021-03-25T15:20:37.498830",
"exception": false,
"start_time": "2021-03-25T15:20:33.387924",
"status": "completed"
},
"tags": []
},
"source": [
"So i check if the ones i got wrong i ever got right..\n",
"I check out the number of times i got that path right"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"_kg_hide-input": false,
"_kg_hide-output": true,
"execution": {
"iopub.execute_input": "2021-03-25T15:20:45.666098Z",
"iopub.status.busy": "2021-03-25T15:20:45.665388Z",
"iopub.status.idle": "2021-03-25T15:20:45.843468Z",
"shell.execute_reply": "2021-03-25T15:20:45.842933Z"
},
"papermill": {
"duration": 4.224142,
"end_time": "2021-03-25T15:20:45.843625",
"exception": false,
"start_time": "2021-03-25T15:20:41.619483",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[(23, 1),\n",
" (62, 2),\n",
" (20, 1),\n",
" (58, 1),\n",
" (80, 40),\n",
" (49, 2),\n",
" (16, 1),\n",
" (5, 1),\n",
" (19, 1),\n",
" (84, 14),\n",
" (21, 3),\n",
" (38, 3),\n",
" (132, 5),\n",
" (57, 3),\n",
" (15, 6),\n",
" (3, 1),\n",
" (39, 4),\n",
" (13, 1),\n",
" (31, 2),\n",
" (331, 1),\n",
" (74, 4),\n",
" (15, 1),\n",
" (9, 9),\n",
" (18, 1),\n",
" (33, 9),\n",
" (10, 1),\n",
" (17, 3),\n",
" (23, 1),\n",
" (22, 1),\n",
" (82, 1),\n",
" (94, 1),\n",
" (9, 3),\n",
" (50, 1),\n",
" (16, 3),\n",
" (4, 10),\n",
" (4, 1)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#create tuple\n",
"#(Number of times predicted correctly left vs numebr of times corrected incorecctly right)\n",
"[(rights[rights['path']==p].shape[0],wrongs[wrongs['path']==p].shape[0]) for p in wrongs['path'].unique() if p in rights['path'].unique()]"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 4.057255,
"end_time": "2021-03-25T15:20:54.118709",
"exception": false,
"start_time": "2021-03-25T15:20:50.061454",
"status": "completed"
},
"tags": []
},
"source": [
"As you can see if you unhide the above result, in taking the most frequent column i would avoid many erorrs"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T15:21:02.300357Z",
"iopub.status.busy": "2021-03-25T15:21:02.299383Z",
"iopub.status.idle": "2021-03-25T15:21:03.116053Z",
"shell.execute_reply": "2021-03-25T15:21:03.115378Z"
},
"papermill": {
"duration": 4.893704,
"end_time": "2021-03-25T15:21:03.116196",
"exception": false,
"start_time": "2021-03-25T15:20:58.222492",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"#re-elaboration taking the most frequent\n",
"X_ass_val['p_floor']=ass_val_floors\n",
"X_ass_val=X_ass_val.reset_index(drop=True)\n",
"X_ass_val\n",
"\n",
"def mode(a):\n",
" '''returns the mode of the group'''\n",
" return( a['p_floor'].value_counts().head(1).reset_index()['index'].values[0])\n",
"\n",
"df = pd.DataFrame() \n",
"# df['path']=X_ass_val.groupby('path').apply(modee1)\n",
"df['blended_floor_pred']=X_ass_val.groupby('path').apply(mode)"
]
},
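{
"cell_type": "markdown",
"metadata": {},
"source": [
"An equivalent, more compact form of the same per-path mode aggregation (a sketch; `Series.mode()` breaks ties by the smallest value, whereas `value_counts()` keeps its own ordering, so results can differ only on exact ties):\n",
"\n",
"```python\n",
"blended = X_ass_val.groupby('path')['p_floor'].agg(lambda s: s.mode().iloc[0])\n",
"print(blended.head())\n",
"```"
]
},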
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 4.105118,
"end_time": "2021-03-25T15:21:11.322575",
"exception": false,
"start_time": "2021-03-25T15:21:07.217457",
"status": "completed"
},
"tags": []
},
"source": [
"Checking the post processing Bump"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T15:21:19.718064Z",
"iopub.status.busy": "2021-03-25T15:21:19.713033Z",
"iopub.status.idle": "2021-03-25T15:21:19.996038Z",
"shell.execute_reply": "2021-03-25T15:21:19.996548Z"
},
"papermill": {
"duration": 4.521753,
"end_time": "2021-03-25T15:21:19.996732",
"exception": false,
"start_time": "2021-03-25T15:21:15.474979",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"0.9962640099626401"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_ass_val=X_ass_val.merge(df, how='left', on='path')\n",
"accuracy_score(X_ass_val['floor'], X_ass_val['blended_floor_pred'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 4.087199,
"end_time": "2021-03-25T15:21:28.153491",
"exception": false,
"start_time": "2021-03-25T15:21:24.066292",
"status": "completed"
},
"tags": []
},
"source": [
"I can round it up (defectivly) to 99,80%. This Rounding is because of the test size i have a scale unit of 0,2%. "
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 4.093125,
"end_time": "2021-03-25T15:21:36.328867",
"exception": false,
"start_time": "2021-03-25T15:21:32.235742",
"status": "completed"
},
"tags": []
},
"source": [
"Satisfactory, so do it on the test data too and submit."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"_kg_hide-input": true,
"execution": {
"iopub.execute_input": "2021-03-25T15:21:44.498849Z",
"iopub.status.busy": "2021-03-25T15:21:44.496862Z",
"iopub.status.idle": "2021-03-25T15:21:45.790694Z",
"shell.execute_reply": "2021-03-25T15:21:45.791181Z"
},
"papermill": {
"duration": 5.371631,
"end_time": "2021-03-25T15:21:45.791378",
"exception": false,
"start_time": "2021-03-25T15:21:40.419747",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"test_data['path']=test_data['site_path_timestamp'].str.split(pat='_', n=- 1, expand=True)[1]\n",
"(test_data['site_path_timestamp'].str.split(pat='_', n=- 1, expand=True)[0]+test_data['site_path_timestamp'].str.split(pat='_', n=- 1, expand=True)[1]).unique().shape\n",
"\n",
"test_data['p_floor']=pred\n",
"test_data\n",
"#re-elaboration taking the median\n",
"def modee1(a):\n",
" return (a['path'].unique())\n",
"def modee2(a):\n",
" return( a['p_floor'].value_counts().head(1).reset_index()['index'].values[0])\n",
"\n",
"dft = pd.DataFrame() \n",
"# df['path']=X_ass_val.groupby('path').apply(modee1)\n",
"dft['my_b_floor_pred']=test_data.groupby('path').apply(modee2)\n",
"test_data=test_data.merge(dft, how='left', on='path')\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T15:21:54.053294Z",
"iopub.status.busy": "2021-03-25T15:21:54.052342Z",
"iopub.status.idle": "2021-03-25T15:21:54.169648Z",
"shell.execute_reply": "2021-03-25T15:21:54.170143Z"
},
"papermill": {
"duration": 4.292946,
"end_time": "2021-03-25T15:21:54.170321",
"exception": false,
"start_time": "2021-03-25T15:21:49.877375",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"#fetching K' submissions to see if there is an improvement on the lb\n",
"sub= pd.read_csv('../input/indoor-location-navigation/submission.csv')\n",
"sub['floor']=test_data['my_b_floor_pred']\n",
"sub.index=sub['site_path_timestamp']\n",
"sub.drop(columns=['site_path_timestamp'],inplace=True)\n",
"sub.to_csv('submission_floor.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 4.07245,
"end_time": "2021-03-25T15:22:02.455467",
"exception": false,
"start_time": "2021-03-25T15:21:58.383017",
"status": "completed"
},
"tags": []
},
"source": [
"See if it gets and up grade on the score by substituting this on the floor prediction\n",
" unfortunately on the pubblic leaderboard score it doesn't improve on the visible decimal values, but on the private ? \n",
" \n",
"check if there are differences that got noticed :"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"papermill": {
"default_parameters": {},
"duration": 2077.205078,
"end_time": "2021-03-25T15:22:28.131889",
"environment_variables": {},
"exception": null,
"input_path": "__notebook__.ipynb",
"output_path": "__notebook__.ipynb",
"parameters": {},
"start_time": "2021-03-25T14:47:50.926811",
"version": "2.2.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}