{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016863,
"end_time": "2021-03-25T14:47:56.519370",
"exception": false,
"start_time": "2021-03-25T14:47:56.502507",
"status": "completed"
},
"tags": []
},
"source": [
"## Overview\n",
"This compared to the [99 accurate model](https://www.kaggle.com/nigelhenry/simple-99-accurate-floor-model) is a more of a brute force approach,added on with a bit of error analysis & post processing\n",
"\n",
"So i studied Kouki's [LSTM](https://www.kaggle.com/kokitanisaka/lstm-by-keras-with-unified-wi-fi-feats) that utilizes [the unified Wi-Fi dataset](https://www.kaggle.com/kokitanisaka/indoorunifiedwifids).
\n",
"and i found it rather intersting that it could score so well on the xy, but the floor prediction was never improving as it was pretty stable after a few epochs.\n",
"How could it be soo good for the xy and not for the floor?\n",
"\n",
"Didn't seem right so i set to work on this model:\n",
"\n",
"\n",
"I know there is already a great floor predicting model out there that got already nearly 99% but seeing how competition is heating up, every decimal counts.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"_kg_hide-input": true,
"execution": {
"iopub.execute_input": "2021-03-25T14:47:56.559230Z",
"iopub.status.busy": "2021-03-25T14:47:56.558432Z",
"iopub.status.idle": "2021-03-25T14:48:03.762244Z",
"shell.execute_reply": "2021-03-25T14:48:03.761338Z"
},
"papermill": {
"duration": 7.227414,
"end_time": "2021-03-25T14:48:03.762445",
"exception": false,
"start_time": "2021-03-25T14:47:56.535031",
"status": "completed"
},
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import scipy.stats as stats\n",
"from pathlib import Path\n",
"import glob\n",
"import pickle\n",
"import random\n",
"import os\n",
"\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import train_test_split\n",
"import tensorflow as tf\n",
"import tensorflow.keras.layers as L\n",
"import tensorflow.keras.models as M\n",
"import tensorflow.keras.backend as K\n",
"# import tensorflow_addons as tfa\n",
"# from tensorflow_addons.layers import WeightNormalization\n",
"from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016009,
"end_time": "2021-03-25T14:48:03.794104",
"exception": false,
"start_time": "2021-03-25T14:48:03.778095",
"status": "completed"
},
"tags": []
},
"source": [
"Kouki's awsome code for preprocessing, is hidden below;"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"_kg_hide-input": true,
"execution": {
"iopub.execute_input": "2021-03-25T14:48:03.847184Z",
"iopub.status.busy": "2021-03-25T14:48:03.846460Z",
"iopub.status.idle": "2021-03-25T14:49:05.017501Z",
"shell.execute_reply": "2021-03-25T14:49:05.016793Z"
},
"papermill": {
"duration": 61.208277,
"end_time": "2021-03-25T14:49:05.017676",
"exception": false,
"start_time": "2021-03-25T14:48:03.809399",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BSSID TYPES: 61206\n",
"BSSID TYPES: 33042\n"
]
}
],
"source": [
"# options\n",
"\n",
"N_SPLITS = 5\n",
"\n",
"SEED = 2021\n",
"\n",
"NUM_FEATS = 20 # number of features that we use. there are 100 feats but we don't need to use all of them\n",
"\n",
"base_path = '../'\n",
"\n",
"def set_seed(seed=42):\n",
" random.seed(seed)\n",
" os.environ['PYTHONHASHSEED'] = str(seed)\n",
" np.random.seed(seed)\n",
" tf.random.set_seed(seed)\n",
" session_conf = tf.compat.v1.ConfigProto(\n",
" intra_op_parallelism_threads=1,\n",
" inter_op_parallelism_threads=1\n",
" )\n",
" sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)\n",
" tf.compat.v1.keras.backend.set_session(sess)\n",
" \n",
"def comp_metric(xhat, yhat, fhat, x, y, f):\n",
" intermediate = np.sqrt(np.power(xhat-x, 2) + np.power(yhat-y, 2)) + 15 * np.abs(fhat-f)\n",
" return intermediate.sum()/xhat.shape[0]\n",
"\n",
"feature_dir = f\"{base_path}/input/indoor-unified-wifi-ds\"\n",
"train_files = sorted(glob.glob(os.path.join(feature_dir, '*_train.csv')))\n",
"test_files = sorted(glob.glob(os.path.join(feature_dir, '*_test.csv')))\n",
"subm = pd.read_csv(f'{base_path}/input/indoor-location-navigation/sample_submission.csv', index_col=0)\n",
"\n",
"with open(f'{feature_dir}/train_all.pkl', 'rb') as f:\n",
" data = pickle.load( f)\n",
"\n",
"with open(f'{feature_dir}/test_all.pkl', 'rb') as f:\n",
" test_data = pickle.load(f)\n",
"\n",
"\n",
"# training target features\n",
"\n",
"BSSID_FEATS = [f'bssid_{i}' for i in range(NUM_FEATS)]\n",
"RSSI_FEATS = [f'rssi_{i}' for i in range(NUM_FEATS)]\n",
"\n",
"\n",
"# get numbers of bssids to embed them in a layer\n",
"\n",
"wifi_bssids = []\n",
"for i in range(100):\n",
" wifi_bssids.extend(data.iloc[:,i].values.tolist())\n",
"wifi_bssids = list(set(wifi_bssids))\n",
"\n",
"wifi_bssids_size = len(wifi_bssids)\n",
"print(f'BSSID TYPES: {wifi_bssids_size}')\n",
"\n",
"wifi_bssids_test = []\n",
"for i in range(100):\n",
" wifi_bssids_test.extend(test_data.iloc[:,i].values.tolist())\n",
"wifi_bssids_test = list(set(wifi_bssids_test))\n",
"\n",
"wifi_bssids_size = len(wifi_bssids_test)\n",
"print(f'BSSID TYPES: {wifi_bssids_size}')\n",
"\n",
"wifi_bssids.extend(wifi_bssids_test)\n",
"wifi_bssids_size = len(wifi_bssids)\n",
"\n",
"# preprocess\n",
"\n",
"le = LabelEncoder()\n",
"le.fit(wifi_bssids)\n",
"le_site = LabelEncoder()\n",
"le_site.fit(data['site_id'])\n",
"\n",
"ss = StandardScaler()\n",
"ss.fit(data.loc[:,RSSI_FEATS])\n",
"\n",
"\n",
"data.loc[:,RSSI_FEATS] = ss.transform(data.loc[:,RSSI_FEATS])\n",
"for i in BSSID_FEATS:\n",
" data.loc[:,i] = le.transform(data.loc[:,i])\n",
" data.loc[:,i] = data.loc[:,i] + 1\n",
" \n",
"data.loc[:, 'site_id'] = le_site.transform(data.loc[:, 'site_id'])\n",
"\n",
"data.loc[:,RSSI_FEATS] = ss.transform(data.loc[:,RSSI_FEATS])\n",
"\n",
"test_data.loc[:,RSSI_FEATS] = ss.transform(test_data.loc[:,RSSI_FEATS])\n",
"for i in BSSID_FEATS:\n",
" test_data.loc[:,i] = le.transform(test_data.loc[:,i])\n",
" test_data.loc[:,i] = test_data.loc[:,i] + 1\n",
" \n",
"test_data.loc[:, 'site_id'] = le_site.transform(test_data.loc[:, 'site_id'])\n",
"\n",
"test_data.loc[:,RSSI_FEATS] = ss.transform(test_data.loc[:,RSSI_FEATS])\n",
"\n",
"\n",
"site_count = len(data['site_id'].unique())\n",
"data.reset_index(drop=True, inplace=True)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016003,
"end_time": "2021-03-25T14:49:05.050212",
"exception": false,
"start_time": "2021-03-25T14:49:05.034209",
"status": "completed"
},
"tags": []
},
"source": [
"## The Reajusted model\n",
"The floor predictions wee being made by a softmax layer with just one dense unit, a pretty easy error to make and a difficult one to spot since there are two exits, and therefor the unit was prediciting between floors 1-0 (almost always one). So i set out to, test knowledge and fix it for the floor predictions and make a modol for them. \n",
"\n",
"I used one hot encoding( there are a total of 11 cats)with categorical loss and Sigmoid activation unit for the last layer & added a bit of bilateral firing power for an over kill and came out with the accurate following result:\n"
]
},
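{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of why a single-unit softmax head cannot learn anything: softmax normalizes across the units, and with only one unit there is nothing to normalize against, so the output is pinned to 1.0 whatever the input (the logits below are illustrative):\n",
"\n",
"```python\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"\n",
"logits = np.array([[-3.2], [0.0], [7.5]])  # any single-unit logits\n",
"print(tf.nn.softmax(logits, axis=-1).numpy())  # [[1.], [1.], [1.]] -- always 1\n",
"```"
]
},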
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:05.094400Z",
"iopub.status.busy": "2021-03-25T14:49:05.093565Z",
"iopub.status.idle": "2021-03-25T14:49:05.096346Z",
"shell.execute_reply": "2021-03-25T14:49:05.095866Z"
},
"papermill": {
"duration": 0.030636,
"end_time": "2021-03-25T14:49:05.096485",
"exception": false,
"start_time": "2021-03-25T14:49:05.065849",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"#FLOOR\n",
"def create_fmodel(input_data):\n",
"\n",
" # bssid feats\n",
" input_dim = input_data[0].shape[1]\n",
"\n",
" input_embd_layer = L.Input(shape=(input_dim,))\n",
" x1 = L.Embedding(wifi_bssids_size, 64)(input_embd_layer) # use the embedding for bssid.\n",
" x1 = L.Flatten()(x1) # Return a copy of the array collapsed into one dimension.\n",
"\n",
" # rssi feats\n",
" input_dim = input_data[1].shape[1]\n",
"\n",
" input_layer = L.Input(input_dim, )\n",
" x2 = L.BatchNormalization()(input_layer)\n",
" x2 = L.Dense(NUM_FEATS * 64, activation='relu')(x2)\n",
"\n",
" # site\n",
" input_site_layer = L.Input(shape=(1,))\n",
" x3 = L.Embedding(site_count, 2)(input_site_layer)\n",
" x3 = L.Flatten()(x3)\n",
"\n",
"\n",
" # main stream\n",
" x = L.Concatenate(axis=1)([x1, x3, x2])\n",
"\n",
"\n",
" x = L.Reshape((1, -1))(x)\n",
" x = L.BatchNormalization()(x)\n",
" mod1=L.LSTM(256, dropout=0.4, recurrent_dropout=0.3, return_sequences=True, activation='tanh')\n",
" x = L.Bidirectional(mod1)(x)\n",
" x = L.Bidirectional(L.LSTM(32, dropout=0.4, return_sequences=False, activation='relu'))(x)\n",
" x = L.BatchNormalization()(x)\n",
" x = L.Dense(16, activation='tanh')(x) \n",
" \n",
" output_layer_1 = L.Dense(11, activation='softmax', name='floor')(x) \n",
"\n",
" model = M.Model([input_embd_layer, input_layer, input_site_layer], \n",
" [output_layer_1])\n",
"\n",
" model.compile(optimizer=tf.optimizers.Adam(lr=0.001),\n",
" loss=tf.keras.losses.CategoricalCrossentropy(), metrics=['mse','accuracy'])\n",
"\n",
" return model\n",
"\n",
"\n"
]
},
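{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, a hedged sketch of how this model would be fed: the three input blocks in the order `create_fmodel` expects, plus the one-hot floor target. The batch size, epoch count, and `EarlyStopping` settings here are illustrative assumptions, not the exact training setup used below:\n",
"\n",
"```python\n",
"X = [data[BSSID_FEATS], data[RSSI_FEATS],\n",
"     data['site_id'].values.reshape(-1, 1)]\n",
"y = pd.get_dummies(data['floor']).values  # 11 one-hot floor columns\n",
"\n",
"model = create_fmodel(X)\n",
"model.fit(X, y, batch_size=128, epochs=20, validation_split=0.1,\n",
"          callbacks=[EarlyStopping(monitor='val_accuracy', patience=3,\n",
"                                   restore_best_weights=True)])\n",
"```"
]
},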
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:05.139349Z",
"iopub.status.busy": "2021-03-25T14:49:05.138667Z",
"iopub.status.idle": "2021-03-25T14:49:05.142395Z",
"shell.execute_reply": "2021-03-25T14:49:05.141732Z"
},
"papermill": {
"duration": 0.030252,
"end_time": "2021-03-25T14:49:05.142541",
"exception": false,
"start_time": "2021-03-25T14:49:05.112289",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"data.index=data['path']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:05.181621Z",
"iopub.status.busy": "2021-03-25T14:49:05.180900Z",
"iopub.status.idle": "2021-03-25T14:49:05.188225Z",
"shell.execute_reply": "2021-03-25T14:49:05.187710Z"
},
"papermill": {
"duration": 0.029352,
"end_time": "2021-03-25T14:49:05.188372",
"exception": false,
"start_time": "2021-03-25T14:49:05.159020",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"#OneHot The floor\n",
"one_hot=pd.get_dummies(data['floor'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:05.245543Z",
"iopub.status.busy": "2021-03-25T14:49:05.244869Z",
"iopub.status.idle": "2021-03-25T14:49:06.290893Z",
"shell.execute_reply": "2021-03-25T14:49:06.290022Z"
},
"papermill": {
"duration": 1.086278,
"end_time": "2021-03-25T14:49:06.291076",
"exception": false,
"start_time": "2021-03-25T14:49:05.204798",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"(10352, 500)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#500 Random, totally unseen paths\n",
"val_p_ind=pd.DataFrame(data.path.unique()).sample(n=500,random_state=1).values.reshape((-1)) #100%/500samples so accuracy of preicision estimate should be around 0.2 % so +/- 0,1 % \n",
"t_idx = data.path.unique().tolist() \n",
"t_idx=[ a for a in t_idx if a not in val_p_ind.tolist()]\n",
"\n",
"train_data=data.loc[t_idx]\n",
"X_ass_val= data.loc[val_p_ind]\n",
"len(t_idx),len(val_p_ind)"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016725,
"end_time": "2021-03-25T14:49:06.324749",
"exception": false,
"start_time": "2021-03-25T14:49:06.308024",
"status": "completed"
},
"tags": []
},
"source": [
"It is important that the paths are unseen for reasons shown in the error analysis\n",
"otherwise in the post processing i would have data leakage."
]
},
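{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a side note, scikit-learn's `GroupShuffleSplit` gives an equivalent path-disjoint split (a sketch against the same `data` frame; an integer `test_size` means that many whole groups are held out):\n",
"\n",
"```python\n",
"from sklearn.model_selection import GroupShuffleSplit\n",
"\n",
"# every 'path' lands entirely in train or entirely in validation,\n",
"# so path-level post-processing cannot leak across the split\n",
"gss = GroupShuffleSplit(n_splits=1, test_size=500, random_state=1)\n",
"train_idx, val_idx = next(gss.split(data, groups=data['path']))\n",
"```"
]
},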
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-25T14:49:06.404418Z",
"iopub.status.busy": "2021-03-25T14:49:06.403735Z",
"iopub.status.idle": "2021-03-25T14:49:06.421984Z",
"shell.execute_reply": "2021-03-25T14:49:06.421429Z"
},
"papermill": {
"duration": 0.080496,
"end_time": "2021-03-25T14:49:06.422117",
"exception": false,
"start_time": "2021-03-25T14:49:06.341621",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n", " | bssid_0 | \n", "bssid_1 | \n", "bssid_2 | \n", "bssid_3 | \n", "bssid_4 | \n", "bssid_5 | \n", "bssid_6 | \n", "bssid_7 | \n", "bssid_8 | \n", "bssid_9 | \n", "... | \n", "rssi_95 | \n", "rssi_96 | \n", "rssi_97 | \n", "rssi_98 | \n", "rssi_99 | \n", "x | \n", "y | \n", "floor | \n", "path | \n", "site_id | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
path | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
0 rows × 205 columns
\n", "