{ "cells": [ { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.016863, "end_time": "2021-03-25T14:47:56.519370", "exception": false, "start_time": "2021-03-25T14:47:56.502507", "status": "completed" }, "tags": [] }, "source": [ "## Overview\n", "This compared to the [99 accurate model](https://www.kaggle.com/nigelhenry/simple-99-accurate-floor-model) is a more of a brute force approach,added on with a bit of error analysis & post processing\n", "\n", "So i studied Kouki's [LSTM](https://www.kaggle.com/kokitanisaka/lstm-by-keras-with-unified-wi-fi-feats) that utilizes [the unified Wi-Fi dataset](https://www.kaggle.com/kokitanisaka/indoorunifiedwifids).
\n", "and i found it rather intersting that it could score so well on the xy, but the floor prediction was never improving as it was pretty stable after a few epochs.\n", "How could it be soo good for the xy and not for the floor?\n", "\n", "Didn't seem right so i set to work on this model:\n", "\n", "\n", "I know there is already a great floor predicting model out there that got already nearly 99% but seeing how competition is heating up, every decimal counts.\n", "\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "_kg_hide-input": true, "execution": { "iopub.execute_input": "2021-03-25T14:47:56.559230Z", "iopub.status.busy": "2021-03-25T14:47:56.558432Z", "iopub.status.idle": "2021-03-25T14:48:03.762244Z", "shell.execute_reply": "2021-03-25T14:48:03.761338Z" }, "papermill": { "duration": 7.227414, "end_time": "2021-03-25T14:48:03.762445", "exception": false, "start_time": "2021-03-25T14:47:56.535031", "status": "completed" }, "scrolled": true, "tags": [] }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import scipy.stats as stats\n", "from pathlib import Path\n", "import glob\n", "import pickle\n", "import random\n", "import os\n", "\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import train_test_split\n", "import tensorflow as tf\n", "import tensorflow.keras.layers as L\n", "import tensorflow.keras.models as M\n", "import tensorflow.keras.backend as K\n", "# import tensorflow_addons as tfa\n", "# from tensorflow_addons.layers import WeightNormalization\n", "from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.016009, "end_time": "2021-03-25T14:48:03.794104", "exception": false, "start_time": "2021-03-25T14:48:03.778095", "status": "completed" }, "tags": [] }, "source": [ "Kouki's awsome code for preprocessing, is hidden below;" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "_kg_hide-input": true, "execution": { "iopub.execute_input": "2021-03-25T14:48:03.847184Z", "iopub.status.busy": "2021-03-25T14:48:03.846460Z", "iopub.status.idle": "2021-03-25T14:49:05.017501Z", "shell.execute_reply": "2021-03-25T14:49:05.016793Z" }, "papermill": { "duration": 61.208277, "end_time": "2021-03-25T14:49:05.017676", "exception": false, "start_time": "2021-03-25T14:48:03.809399", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BSSID TYPES: 61206\n", "BSSID TYPES: 33042\n" ] } ], "source": [ "# options\n", "\n", "N_SPLITS = 5\n", "\n", "SEED = 2021\n", "\n", "NUM_FEATS = 20 # number of features that we use. 
"\n", "feature_dir = f\"{base_path}/input/indoor-unified-wifi-ds\"\n", "train_files = sorted(glob.glob(os.path.join(feature_dir, '*_train.csv')))\n", "test_files = sorted(glob.glob(os.path.join(feature_dir, '*_test.csv')))\n", "subm = pd.read_csv(f'{base_path}/input/indoor-location-navigation/sample_submission.csv', index_col=0)\n", "\n", "with open(f'{feature_dir}/train_all.pkl', 'rb') as f:\n", " data = pickle.load(f)\n", "\n", "with open(f'{feature_dir}/test_all.pkl', 'rb') as f:\n", " test_data = pickle.load(f)\n", "\n", "\n", "# training target features\n", "\n", "BSSID_FEATS = [f'bssid_{i}' for i in range(NUM_FEATS)]\n", "RSSI_FEATS = [f'rssi_{i}' for i in range(NUM_FEATS)]\n", "\n", "\n", "# get the number of bssids to embed them in a layer\n", "\n", "wifi_bssids = []\n", "for i in range(100):\n", " wifi_bssids.extend(data.iloc[:,i].values.tolist())\n", "wifi_bssids = list(set(wifi_bssids))\n", "\n", "wifi_bssids_size = len(wifi_bssids)\n", "print(f'BSSID TYPES: {wifi_bssids_size}')\n", "\n", "wifi_bssids_test = []\n", "for i in range(100):\n", " wifi_bssids_test.extend(test_data.iloc[:,i].values.tolist())\n", "wifi_bssids_test = list(set(wifi_bssids_test))\n", "\n", "wifi_bssids_size = len(wifi_bssids_test)\n", "print(f'BSSID TYPES: {wifi_bssids_size}')\n", "\n", "wifi_bssids.extend(wifi_bssids_test)\n", "wifi_bssids_size = len(wifi_bssids)\n", "\n", "# preprocess\n", "\n", "le = LabelEncoder()\n", "le.fit(wifi_bssids)\n", "le_site = LabelEncoder()\n", "le_site.fit(data['site_id'])\n", "\n", "ss = StandardScaler()\n", "ss.fit(data.loc[:,RSSI_FEATS])\n", "\n", "\n", "# scale the RSSI features once, then encode the BSSIDs and the site id\n", "data.loc[:,RSSI_FEATS] = ss.transform(data.loc[:,RSSI_FEATS])\n", "for i in BSSID_FEATS:\n", " data.loc[:,i] = le.transform(data.loc[:,i])\n", " data.loc[:,i] = data.loc[:,i] + 1\n", " \n", "data.loc[:, 'site_id'] = le_site.transform(data.loc[:, 'site_id'])\n", "\n", "test_data.loc[:,RSSI_FEATS] = ss.transform(test_data.loc[:,RSSI_FEATS])\n", "for i in BSSID_FEATS:\n", " test_data.loc[:,i] = le.transform(test_data.loc[:,i])\n", " test_data.loc[:,i] = test_data.loc[:,i] + 1\n", " \n", "test_data.loc[:, 'site_id'] = le_site.transform(test_data.loc[:, 'site_id'])\n", "\n", "\n", "site_count = len(data['site_id'].unique())\n", "data.reset_index(drop=True, inplace=True)\n" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.016003, "end_time": "2021-03-25T14:49:05.050212", "exception": false, "start_time": "2021-03-25T14:49:05.034209", "status": "completed" }, "tags": [] }, "source": [ "## The Readjusted model\n", "The floor predictions were being made by a softmax layer with just one dense unit, a pretty easy error to make and a difficult one to spot since the model has two outputs, and the single unit was predicting floors between 0 and 1 (almost always 1). So I set out to test that hypothesis, fix the floor predictions, and build a dedicated model for them.\n", "\n", "I used one-hot encoding (there are 11 floor categories in total) with categorical cross-entropy loss and a softmax activation on the last layer, added a pair of bidirectional LSTM layers for good measure (overkill, probably), and came out with the following result:\n" ] },
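{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick sanity check of that diagnosis (an illustrative sketch added here, not part of the original pipeline): a softmax over a single logit is identically 1, so a one-unit softmax head can never discriminate between floors, while an 11-unit head can:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustration only: softmax over one logit vs. eleven logits.\n", "import numpy as np\n", "\n", "def softmax(z):\n", "    e = np.exp(z - z.max(axis=-1, keepdims=True))\n", "    return e / e.sum(axis=-1, keepdims=True)\n", "\n", "print(softmax(np.array([[3.2], [-1.7]])))   # one unit -> always [[1.], [1.]]\n", "print(softmax(np.zeros((1, 11))).round(3))  # eleven units -> uniform 1/11\n" ] },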
{ "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T14:49:05.094400Z", "iopub.status.busy": "2021-03-25T14:49:05.093565Z", "iopub.status.idle": "2021-03-25T14:49:05.096346Z", "shell.execute_reply": "2021-03-25T14:49:05.095866Z" }, "papermill": { "duration": 0.030636, "end_time": "2021-03-25T14:49:05.096485", "exception": false, "start_time": "2021-03-25T14:49:05.065849", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "#FLOOR\n", "def create_fmodel(input_data):\n", "\n", " # bssid feats\n", " input_dim = input_data[0].shape[1]\n", "\n", " input_embd_layer = L.Input(shape=(input_dim,))\n", " x1 = L.Embedding(wifi_bssids_size, 64)(input_embd_layer) # use the embedding for bssid\n", " x1 = L.Flatten()(x1) # flatten the per-bssid embeddings into one vector\n", "\n", " # rssi feats\n", " input_dim = input_data[1].shape[1]\n", "\n", " input_layer = L.Input(input_dim, )\n", " x2 = L.BatchNormalization()(input_layer)\n", " x2 = L.Dense(NUM_FEATS * 64, activation='relu')(x2)\n", "\n", " # site\n", " input_site_layer = L.Input(shape=(1,))\n", " x3 = L.Embedding(site_count, 2)(input_site_layer)\n", " x3 = L.Flatten()(x3)\n", "\n", "\n", " # main stream\n", " x = L.Concatenate(axis=1)([x1, x3, x2])\n", "\n", "\n", " x = L.Reshape((1, -1))(x)\n", " x = L.BatchNormalization()(x)\n", " mod1 = L.LSTM(256, dropout=0.4, recurrent_dropout=0.3, return_sequences=True, activation='tanh')\n", " x = L.Bidirectional(mod1)(x)\n", " x = L.Bidirectional(L.LSTM(32, dropout=0.4, return_sequences=False, activation='relu'))(x)\n", " x = L.BatchNormalization()(x)\n", " x = L.Dense(16, activation='tanh')(x) \n", " \n", " # 11 units, one per floor (-2..8), with a proper softmax over the classes\n", " output_layer_1 = L.Dense(11, activation='softmax', name='floor')(x) \n", "\n", " model = M.Model([input_embd_layer, input_layer, input_site_layer], \n", " [output_layer_1])\n", "\n", " model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),\n", " loss=tf.keras.losses.CategoricalCrossentropy(), metrics=['mse','accuracy'])\n", "\n", " return model\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T14:49:05.139349Z", "iopub.status.busy": "2021-03-25T14:49:05.138667Z", "iopub.status.idle": "2021-03-25T14:49:05.142395Z", "shell.execute_reply": "2021-03-25T14:49:05.141732Z" }, "papermill": { "duration": 0.030252, "end_time": "2021-03-25T14:49:05.142541", "exception": false, "start_time": "2021-03-25T14:49:05.112289", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "data.index=data['path']" ] },
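{ "cell_type": "markdown", "metadata": {}, "source": [ "Before one-hot encoding the floors below, here is a toy round trip (an added illustration; it assumes all 11 floors -2..8 occur in the training data, which is the case here) showing why `np.argmax(...) - 2` later maps class indices back to floor labels:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustration only: one-hot encoding floors -2..8 yields 11 columns in\n", "# ascending order, so argmax over the columns minus 2 recovers the floor.\n", "toy = pd.Series(range(-2, 9))\n", "oh = pd.get_dummies(toy)                      # 11 columns, ordered -2..8\n", "recovered = np.argmax(oh.values, axis=1) - 2  # column position back to floor\n", "assert (recovered == toy.values).all()\n", "oh.shape\n" ] },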
"#OneHot The floor\n", "one_hot=pd.get_dummies(data['floor'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T14:49:05.245543Z", "iopub.status.busy": "2021-03-25T14:49:05.244869Z", "iopub.status.idle": "2021-03-25T14:49:06.290893Z", "shell.execute_reply": "2021-03-25T14:49:06.290022Z" }, "papermill": { "duration": 1.086278, "end_time": "2021-03-25T14:49:06.291076", "exception": false, "start_time": "2021-03-25T14:49:05.204798", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "(10352, 500)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#500 Random, totally unseen paths\n", "val_p_ind=pd.DataFrame(data.path.unique()).sample(n=500,random_state=1).values.reshape((-1)) #100%/500samples so accuracy of preicision estimate should be around 0.2 % so +/- 0,1 % \n", "t_idx = data.path.unique().tolist() \n", "t_idx=[ a for a in t_idx if a not in val_p_ind.tolist()]\n", "\n", "train_data=data.loc[t_idx]\n", "X_ass_val= data.loc[val_p_ind]\n", "len(t_idx),len(val_p_ind)" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.016725, "end_time": "2021-03-25T14:49:06.324749", "exception": false, "start_time": "2021-03-25T14:49:06.308024", "status": "completed" }, "tags": [] }, "source": [ "It is important that the paths are unseen for reasons shown in the error analysis\n", "otherwise in the post processing i would have data leakage." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T14:49:06.404418Z", "iopub.status.busy": "2021-03-25T14:49:06.403735Z", "iopub.status.idle": "2021-03-25T14:49:06.421984Z", "shell.execute_reply": "2021-03-25T14:49:06.421429Z" }, "papermill": { "duration": 0.080496, "end_time": "2021-03-25T14:49:06.422117", "exception": false, "start_time": "2021-03-25T14:49:06.341621", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bssid_0bssid_1bssid_2bssid_3bssid_4bssid_5bssid_6bssid_7bssid_8bssid_9...rssi_95rssi_96rssi_97rssi_98rssi_99xyfloorpathsite_id
path
\n", "

0 rows × 205 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [bssid_0, bssid_1, bssid_2, bssid_3, bssid_4, bssid_5, bssid_6, bssid_7, bssid_8, bssid_9, bssid_10, bssid_11, bssid_12, bssid_13, bssid_14, bssid_15, bssid_16, bssid_17, bssid_18, bssid_19, bssid_20, bssid_21, bssid_22, bssid_23, bssid_24, bssid_25, bssid_26, bssid_27, bssid_28, bssid_29, bssid_30, bssid_31, bssid_32, bssid_33, bssid_34, bssid_35, bssid_36, bssid_37, bssid_38, bssid_39, bssid_40, bssid_41, bssid_42, bssid_43, bssid_44, bssid_45, bssid_46, bssid_47, bssid_48, bssid_49, bssid_50, bssid_51, bssid_52, bssid_53, bssid_54, bssid_55, bssid_56, bssid_57, bssid_58, bssid_59, bssid_60, bssid_61, bssid_62, bssid_63, bssid_64, bssid_65, bssid_66, bssid_67, bssid_68, bssid_69, bssid_70, bssid_71, bssid_72, bssid_73, bssid_74, bssid_75, bssid_76, bssid_77, bssid_78, bssid_79, bssid_80, bssid_81, bssid_82, bssid_83, bssid_84, bssid_85, bssid_86, bssid_87, bssid_88, bssid_89, bssid_90, bssid_91, bssid_92, bssid_93, bssid_94, bssid_95, bssid_96, bssid_97, bssid_98, bssid_99, ...]\n", "Index: []\n", "\n", "[0 rows x 205 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check there is no cross contamination of the validation data\n", "train_data[train_data['path']==val_p_ind[5]]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T14:49:06.474967Z", "iopub.status.busy": "2021-03-25T14:49:06.472331Z", "iopub.status.idle": "2021-03-25T15:20:04.415833Z", "shell.execute_reply": "2021-03-25T15:20:04.416354Z" }, "papermill": { "duration": 1857.975369, "end_time": "2021-03-25T15:20:04.416548", "exception": false, "start_time": "2021-03-25T14:49:06.441179", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/100\n", "1923/1923 [==============================] - 267s 134ms/step - loss: 0.6500 - mse: 0.0233 - accuracy: 0.7893 - val_loss: 0.0709 - val_mse: 0.0027 - val_accuracy: 0.9812\n", "Epoch 2/100\n", "1923/1923 [==============================] - 268s 140ms/step - loss: 0.0238 - mse: 8.7623e-04 - accuracy: 0.9929 - val_loss: 0.0539 - val_mse: 0.0020 - val_accuracy: 0.9856\n", "Epoch 3/100\n", "1923/1923 [==============================] - 263s 137ms/step - loss: 0.0116 - mse: 4.6434e-04 - accuracy: 0.9967 - val_loss: 0.0656 - val_mse: 0.0023 - val_accuracy: 0.9851\n", "Epoch 4/100\n", "1923/1923 [==============================] - 266s 138ms/step - loss: 0.0052 - mse: 2.1056e-04 - accuracy: 0.9985 - val_loss: 0.0648 - val_mse: 0.0021 - val_accuracy: 0.9856\n", "Epoch 5/100\n", "1923/1923 [==============================] - 259s 135ms/step - loss: 0.0054 - mse: 2.1736e-04 - accuracy: 0.9985 - val_loss: 0.0629 - val_mse: 0.0018 - val_accuracy: 0.9891\n", "\n", "Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.\n", "Epoch 6/100\n", "1923/1923 [==============================] - 260s 135ms/step - loss: 0.0023 - mse: 8.6135e-05 - accuracy: 0.9994 - val_loss: 0.0611 - val_mse: 0.0017 - val_accuracy: 0.9898\n", "Epoch 7/100\n", "1923/1923 [==============================] - 262s 136ms/step - loss: 8.9690e-04 - mse: 2.9825e-05 - accuracy: 0.9998 - val_loss: 0.0674 - val_mse: 0.0018 - val_accuracy: 0.9894\n" ] }, { "data": { "text/plain": [ "0.9855541718555417" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_trainf = one_hot.loc[t_idx, :]\n", "y_validf = one_hot.loc[val_p_ind, :]\n", "X_train = 
train_data.loc[:, BSSID_FEATS + RSSI_FEATS + ['site_id']]\n", "X_valid = X_ass_val.loc[:, BSSID_FEATS + RSSI_FEATS + ['site_id']]\n", "fmodel = create_fmodel([X_train.loc[:,BSSID_FEATS], X_train.loc[:,RSSI_FEATS], X_train.loc[:,'site_id']])\n", "# model = multi_gpu_model(model, 1)\n", "fmodel.fit([X_train.loc[:,BSSID_FEATS], X_train.loc[:,RSSI_FEATS], X_train.loc[:,'site_id']], y_trainf, \n", " validation_data=([X_valid.loc[:,BSSID_FEATS], X_valid.loc[:,RSSI_FEATS], X_valid.loc[:,'site_id']], y_validf), \n", " batch_size=128, epochs=100\n", " ,shuffle=True\n", " ,callbacks=[\n", " ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')\n", " , ModelCheckpoint(f'{base_path}/RNN_{SEED}_.hdf5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')\n", " , EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, mode='min', baseline=None, restore_best_weights=True)\n", " ]\n", " )\n", "\n", "fmodel.load_weights(f'{base_path}/RNN_{SEED}_.hdf5')\n", "fvalid = fmodel.predict([X_ass_val.loc[:,BSSID_FEATS], X_ass_val.loc[:,RSSI_FEATS], X_ass_val.loc[:,'site_id']])\n", "fvalid = np.argmax(fvalid, axis=1)-2 # minus two maps class indices back to the floor interval [-2, 8]\n", "# ass_val_arr[:, fold] = fvalid\n", "\n", "pred = fmodel.predict([test_data.loc[:,BSSID_FEATS], test_data.loc[:,RSSI_FEATS], test_data.loc[:,'site_id']])\n", "pred = np.argmax(pred, axis=1)-2 # minus two maps class indices back to the floor interval [-2, 8]\n", "# preds_f_arr[:, fold] = pred\n", "\n", "ass_val_floors=fvalid\n", "floors=pred\n", " \n", "accuracy_score(X_ass_val['floor'], ass_val_floors) # second validation; checks the argmax and the shifting" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 4.042726, "end_time": "2021-03-25T15:20:12.587771", "exception": false, "start_time": "2021-03-25T15:20:08.545045", "status": "completed" }, "tags": [] }, "source": [ "## Error Analysis " ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T15:20:20.857222Z", "iopub.status.busy": "2021-03-25T15:20:20.856625Z", "iopub.status.idle": "2021-03-25T15:20:21.171213Z", "shell.execute_reply": "2021-03-25T15:20:21.170489Z" }, "papermill": { "duration": 4.441317, "end_time": "2021-03-25T15:20:21.171371", "exception": false, "start_time": "2021-03-25T15:20:16.730054", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "((174, 206), (38,))" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# error analysis - how many rows I got wrong, and on how many distinct paths\n", "X_ass_val['wrong']=(X_ass_val['floor']- ass_val_floors)!=0\n", "wrongs= X_ass_val[X_ass_val['wrong']==True]\n", "rights= X_ass_val[X_ass_val['wrong']==False]\n", "wrongs.shape, wrongs['path'].unique().shape" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 4.138347, "end_time": "2021-03-25T15:20:29.362908", "exception": false, "start_time": "2021-03-25T15:20:25.224561", "status": "completed" }, "tags": [] }, "source": [ "**Only one floor per path, right?**\n", "I think this is a given, since the original data is organized as paths within per-floor files;\n", "I double-checked this assumption to be true." ] },
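{ "cell_type": "markdown", "metadata": {}, "source": [ "One way to verify that assumption in code (an added sketch, not from the original run):\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Added check: every recorded path should map to exactly one ground-truth floor.\n", "assert data.groupby('path')['floor'].nunique().max() == 1\n" ] },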
] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 4.110906, "end_time": "2021-03-25T15:20:37.498830", "exception": false, "start_time": "2021-03-25T15:20:33.387924", "status": "completed" }, "tags": [] }, "source": [ "So i check if the ones i got wrong i ever got right..\n", "I check out the number of times i got that path right" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "_kg_hide-input": false, "_kg_hide-output": true, "execution": { "iopub.execute_input": "2021-03-25T15:20:45.666098Z", "iopub.status.busy": "2021-03-25T15:20:45.665388Z", "iopub.status.idle": "2021-03-25T15:20:45.843468Z", "shell.execute_reply": "2021-03-25T15:20:45.842933Z" }, "papermill": { "duration": 4.224142, "end_time": "2021-03-25T15:20:45.843625", "exception": false, "start_time": "2021-03-25T15:20:41.619483", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[(23, 1),\n", " (62, 2),\n", " (20, 1),\n", " (58, 1),\n", " (80, 40),\n", " (49, 2),\n", " (16, 1),\n", " (5, 1),\n", " (19, 1),\n", " (84, 14),\n", " (21, 3),\n", " (38, 3),\n", " (132, 5),\n", " (57, 3),\n", " (15, 6),\n", " (3, 1),\n", " (39, 4),\n", " (13, 1),\n", " (31, 2),\n", " (331, 1),\n", " (74, 4),\n", " (15, 1),\n", " (9, 9),\n", " (18, 1),\n", " (33, 9),\n", " (10, 1),\n", " (17, 3),\n", " (23, 1),\n", " (22, 1),\n", " (82, 1),\n", " (94, 1),\n", " (9, 3),\n", " (50, 1),\n", " (16, 3),\n", " (4, 10),\n", " (4, 1)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create tuple\n", "#(Number of times predicted correctly left vs numebr of times corrected incorecctly right)\n", "[(rights[rights['path']==p].shape[0],wrongs[wrongs['path']==p].shape[0]) for p in wrongs['path'].unique() if p in rights['path'].unique()]" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 4.057255, "end_time": "2021-03-25T15:20:54.118709", "exception": false, "start_time": "2021-03-25T15:20:50.061454", "status": "completed" }, "tags": [] }, "source": [ "As you can see if you unhide the above result, in taking the most frequent column i would avoid many erorrs" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T15:21:02.300357Z", "iopub.status.busy": "2021-03-25T15:21:02.299383Z", "iopub.status.idle": "2021-03-25T15:21:03.116053Z", "shell.execute_reply": "2021-03-25T15:21:03.115378Z" }, "papermill": { "duration": 4.893704, "end_time": "2021-03-25T15:21:03.116196", "exception": false, "start_time": "2021-03-25T15:20:58.222492", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "#re-elaboration taking the most frequent\n", "X_ass_val['p_floor']=ass_val_floors\n", "X_ass_val=X_ass_val.reset_index(drop=True)\n", "X_ass_val\n", "\n", "def mode(a):\n", " '''returns the mode of the group'''\n", " return( a['p_floor'].value_counts().head(1).reset_index()['index'].values[0])\n", "\n", "df = pd.DataFrame() \n", "# df['path']=X_ass_val.groupby('path').apply(modee1)\n", "df['blended_floor_pred']=X_ass_val.groupby('path').apply(mode)" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 4.105118, "end_time": "2021-03-25T15:21:11.322575", "exception": false, "start_time": "2021-03-25T15:21:07.217457", "status": "completed" }, "tags": [] }, "source": [ "Checking the post processing Bump" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T15:21:19.718064Z", "iopub.status.busy": 
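{ "cell_type": "markdown", "metadata": {}, "source": [ "A toy illustration of that majority vote (added sketch, not from the original run): a path observed four times with one misprediction still resolves to the right floor.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Toy example of per-path majority voting, mirroring mode() above.\n", "toy = pd.DataFrame({'path': ['a']*4 + ['b']*2, 'p_floor': [1, 1, 1, 2, 0, 0]})\n", "toy.groupby('path')['p_floor'].agg(lambda s: s.value_counts().index[0])\n" ] },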
"2021-03-25T15:21:19.713033Z", "iopub.status.idle": "2021-03-25T15:21:19.996038Z", "shell.execute_reply": "2021-03-25T15:21:19.996548Z" }, "papermill": { "duration": 4.521753, "end_time": "2021-03-25T15:21:19.996732", "exception": false, "start_time": "2021-03-25T15:21:15.474979", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0.9962640099626401" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_ass_val=X_ass_val.merge(df, how='left', on='path')\n", "accuracy_score(X_ass_val['floor'], X_ass_val['blended_floor_pred'])" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 4.087199, "end_time": "2021-03-25T15:21:28.153491", "exception": false, "start_time": "2021-03-25T15:21:24.066292", "status": "completed" }, "tags": [] }, "source": [ "I can round it up (defectivly) to 99,80%. This Rounding is because of the test size i have a scale unit of 0,2%. " ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 4.093125, "end_time": "2021-03-25T15:21:36.328867", "exception": false, "start_time": "2021-03-25T15:21:32.235742", "status": "completed" }, "tags": [] }, "source": [ "Satisfactory, so do it on the test data too and submit." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "_kg_hide-input": true, "execution": { "iopub.execute_input": "2021-03-25T15:21:44.498849Z", "iopub.status.busy": "2021-03-25T15:21:44.496862Z", "iopub.status.idle": "2021-03-25T15:21:45.790694Z", "shell.execute_reply": "2021-03-25T15:21:45.791181Z" }, "papermill": { "duration": 5.371631, "end_time": "2021-03-25T15:21:45.791378", "exception": false, "start_time": "2021-03-25T15:21:40.419747", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "test_data['path']=test_data['site_path_timestamp'].str.split(pat='_', n=- 1, expand=True)[1]\n", "(test_data['site_path_timestamp'].str.split(pat='_', n=- 1, expand=True)[0]+test_data['site_path_timestamp'].str.split(pat='_', n=- 1, expand=True)[1]).unique().shape\n", "\n", "test_data['p_floor']=pred\n", "test_data\n", "#re-elaboration taking the median\n", "def modee1(a):\n", " return (a['path'].unique())\n", "def modee2(a):\n", " return( a['p_floor'].value_counts().head(1).reset_index()['index'].values[0])\n", "\n", "dft = pd.DataFrame() \n", "# df['path']=X_ass_val.groupby('path').apply(modee1)\n", "dft['my_b_floor_pred']=test_data.groupby('path').apply(modee2)\n", "test_data=test_data.merge(dft, how='left', on='path')\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2021-03-25T15:21:54.053294Z", "iopub.status.busy": "2021-03-25T15:21:54.052342Z", "iopub.status.idle": "2021-03-25T15:21:54.169648Z", "shell.execute_reply": "2021-03-25T15:21:54.170143Z" }, "papermill": { "duration": 4.292946, "end_time": "2021-03-25T15:21:54.170321", "exception": false, "start_time": "2021-03-25T15:21:49.877375", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "#fetching K' submissions to see if there is an improvement on the lb\n", "sub= pd.read_csv('../input/indoor-location-navigation/submission.csv')\n", "sub['floor']=test_data['my_b_floor_pred']\n", "sub.index=sub['site_path_timestamp']\n", "sub.drop(columns=['site_path_timestamp'],inplace=True)\n", "sub.to_csv('submission_floor.csv')" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 4.07245, "end_time": "2021-03-25T15:22:02.455467", "exception": false, "start_time": "2021-03-25T15:21:58.383017", 
"status": "completed" }, "tags": [] }, "source": [ "See if it gets and up grade on the score by substituting this on the floor prediction\n", " unfortunately on the pubblic leaderboard score it doesn't improve on the visible decimal values, but on the private ? \n", " \n", "check if there are differences that got noticed :" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "papermill": { "default_parameters": {}, "duration": 2077.205078, "end_time": "2021-03-25T15:22:28.131889", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2021-03-25T14:47:50.926811", "version": "2.2.2" } }, "nbformat": 4, "nbformat_minor": 4 }