## Overview
Compared to the [99 accurate model](https://www.kaggle.com/nigelhenry/simple-99-accurate-floor-model), this is more of a brute-force approach, with a bit of error analysis and post-processing added on.

So I studied Kouki's [LSTM](https://www.kaggle.com/kokitanisaka/lstm-by-keras-with-unified-wi-fi-feats) that utilizes [the unified Wi-Fi dataset](https://www.kaggle.com/kokitanisaka/indoorunifiedwifids),<br>
and I found it rather interesting that it could score so well on the xy position while the floor prediction never improved: it stayed flat after a few epochs.
How could it be so good for the xy and not for the floor?

Didn't seem right so i set to work on this model:


I know there is already a great floor-predicting model out there that reaches nearly 99%, but seeing how the competition is heating up, every decimal counts.



In [3]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path
import glob
import pickle
import random
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
# import tensorflow_addons as tfa
# from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

Kouki's awesome preprocessing code is hidden below:

In [4]:
# options
N_SPLITS = 5       # presumably the number of CV folds; not used in the visible cells
SEED = 2021        # seed value; also embedded in the checkpoint file name
NUM_FEATS = 20     # number of features that we use. there are 100 feats but we don't need to use all of them
base_path = '../'  # root folder under which the input datasets and checkpoints live

def set_seed(seed=42):
    """Seed the Python, NumPy and TensorFlow RNGs and install a single-threaded session.

    Args:
        seed: Integer used for ``PYTHONHASHSEED``, ``random``, ``numpy.random``
            and ``tf.random``.

    Besides seeding, a TF1-compat session limited to one intra-op and one
    inter-op thread is created on the default graph and registered as the
    Keras backend session.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    single_thread_config = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    default_graph = tf.compat.v1.get_default_graph()
    session = tf.compat.v1.Session(graph=default_graph, config=single_thread_config)
    tf.compat.v1.keras.backend.set_session(session)
    
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Return the mean position error plus a floor penalty.

    Each sample contributes the Euclidean distance between the predicted
    ``(xhat, yhat)`` and the true ``(x, y)``, plus 15 times the absolute
    difference between predicted floor ``fhat`` and true floor ``f``.
    The sum is divided by ``xhat.shape[0]``.
    """
    planar_error = np.sqrt(np.power(xhat - x, 2) + np.power(yhat - y, 2))
    floor_penalty = 15 * np.abs(fhat - f)
    n_samples = xhat.shape[0]
    return (planar_error + floor_penalty).sum() / n_samples

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE(review): pickle executes arbitrary code on load; only use trusted dataset files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# locations of the unified Wi-Fi feature files and of the sample submission
feature_dir = f"{base_path}/input/indoor-unified-wifi-ds"
train_files = sorted(glob.glob(os.path.join(feature_dir, '*_train.csv')))
test_files = sorted(glob.glob(os.path.join(feature_dir, '*_test.csv')))
subm = pd.read_csv(
    f'{base_path}/input/indoor-location-navigation/sample_submission.csv',
    index_col=0,
)

# pre-built frames covering all sites at once
data = _load_pickle(f'{feature_dir}/train_all.pkl')
test_data = _load_pickle(f'{feature_dir}/test_all.pkl')


# training target features: the first NUM_FEATS bssid_* / rssi_* column names
_feat_ids = range(NUM_FEATS)
BSSID_FEATS = [f'bssid_{i}' for i in _feat_ids]
RSSI_FEATS = [f'rssi_{i}' for i in _feat_ids]


# get numbers of bssids to embed them in a layer

def _unique_bssids(frame, n_cols=100):
    """Return the set of distinct values found in the first ``n_cols`` columns of ``frame``.

    NOTE(review): assumes the first 100 columns are the bssid_* columns — confirm
    against the dataset layout.
    """
    found = set()
    for i in range(n_cols):
        found.update(frame.iloc[:, i].values.tolist())
    return found


wifi_bssids_train = _unique_bssids(data)
print(f'BSSID TYPES: {len(wifi_bssids_train)}')

wifi_bssids_test = list(_unique_bssids(test_data))
print(f'BSSID TYPES: {len(wifi_bssids_test)}')

# Union of the train and test vocabularies, each BSSID listed once.
# (The previous list concatenation kept every BSSID seen in both sets twice,
# which inflated the size below.)
wifi_bssids = list(wifi_bssids_train.union(wifi_bssids_test))

# The label-encoded ids are shifted by +1 in the preprocessing step, so they
# run from 1 to len(wifi_bssids); the embedding therefore needs one extra row.
wifi_bssids_size = len(wifi_bssids) + 1

# preprocess

le = LabelEncoder()
le.fit(wifi_bssids)
le_site = LabelEncoder()
le_site.fit(data['site_id'])

ss = StandardScaler()
ss.fit(data.loc[:, RSSI_FEATS])


def _preprocess_inplace(frame):
    """Scale RSSI columns and label-encode BSSID / site columns of ``frame`` in place.

    The same fitted ``ss``, ``le`` and ``le_site`` are applied to whichever
    frame is passed, so train and test go through identical steps.
    """
    # Standardise exactly once: the earlier code applied ss.transform twice,
    # which re-scaled already standardised values.
    # Whole-column assignment (frame[cols] = ...) lets the columns take the
    # dtype of the new values instead of keeping the previous one.
    frame[RSSI_FEATS] = ss.transform(frame.loc[:, RSSI_FEATS])
    for col in BSSID_FEATS:
        # +1 shifts the encoded ids so that 0 is never produced by the encoder
        frame[col] = le.transform(frame[col]) + 1
    frame['site_id'] = le_site.transform(frame['site_id'])


_preprocess_inplace(data)
_preprocess_inplace(test_data)

site_count = len(data['site_id'].unique())
data.reset_index(drop=True, inplace=True)


BSSID TYPES: 61206
BSSID TYPES: 33042


## The Readjusted model
The floor predictions were being made by a softmax layer with just one dense unit. This is a pretty easy error to make and a difficult one to spot, since the model has two outputs; as a result the unit was only predicting between floors 1 and 0 (almost always 1). So I set out to test my knowledge, fix it for the floor predictions, and build a model dedicated to them.

I used one-hot encoding (there are 11 categories in total) with a categorical cross-entropy loss and a softmax activation on the last layer, added a bit of bidirectional (LSTM) firing power as overkill, and came out with the following accurate result:


In [5]:
#FLOOR
def create_fmodel(input_data, n_floors=11):
    """Build and compile the floor-classification network.

    Args:
        input_data: Sequence whose first two items are the BSSID and RSSI
            feature tables; only their ``shape[1]`` (number of columns) is read.
        n_floors: Number of floor classes, i.e. units of the softmax output.
            Defaults to 11, the value that was previously hard-coded.

    Returns:
        A compiled Keras model taking ``[bssid_ids, rssi_values, site_id]`` and
        producing a softmax distribution over ``n_floors`` classes, trained
        with categorical cross-entropy.
    """
    # bssid feats: one 64-d embedding per BSSID slot, flattened
    bssid_dim = input_data[0].shape[1]
    input_embd_layer = L.Input(shape=(bssid_dim,))
    x1 = L.Embedding(wifi_bssids_size, 64)(input_embd_layer)
    x1 = L.Flatten()(x1)

    # rssi feats
    rssi_dim = input_data[1].shape[1]
    # shape is passed explicitly as a tuple rather than as a bare positional int
    input_layer = L.Input(shape=(rssi_dim,))
    x2 = L.BatchNormalization()(input_layer)
    x2 = L.Dense(NUM_FEATS * 64, activation='relu')(x2)

    # site
    input_site_layer = L.Input(shape=(1,))
    x3 = L.Embedding(site_count, 2)(input_site_layer)
    x3 = L.Flatten()(x3)

    # main stream
    x = L.Concatenate(axis=1)([x1, x3, x2])

    # the concatenated vector is fed to the LSTMs as a sequence of length 1
    x = L.Reshape((1, -1))(x)
    x = L.BatchNormalization()(x)
    mod1 = L.LSTM(256, dropout=0.4, recurrent_dropout=0.3, return_sequences=True, activation='tanh')
    x = L.Bidirectional(mod1)(x)
    x = L.Bidirectional(L.LSTM(32, dropout=0.4, return_sequences=False, activation='relu'))(x)
    x = L.BatchNormalization()(x)
    x = L.Dense(16, activation='tanh')(x)

    output_layer_1 = L.Dense(n_floors, activation='softmax', name='floor')(x)

    model = M.Model([input_embd_layer, input_layer, input_site_layer],
                    [output_layer_1])

    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.CategoricalCrossentropy(), metrics=['mse', 'accuracy'])

    return model




In [6]:
# index the rows by path while keeping 'path' available as a column
data.set_index('path', drop=False, inplace=True)

In [7]:
#OneHot The floor
# One column per distinct floor value, ordered by floor. The dtype is pinned
# because the get_dummies default differs between pandas versions (uint8 vs
# bool) and these columns are used directly as training targets.
one_hot = pd.get_dummies(data['floor'], dtype=np.float32)

In [8]:
#500 Random, totally unseen paths
all_paths = data['path'].unique()
# 100% / 500 samples, so the resolution of the accuracy estimate is around 0.2% (+/- 0.1%)
val_p_ind = pd.DataFrame(all_paths).sample(n=500, random_state=1).values.reshape((-1))

# Build the hold-out set once; the previous version rebuilt and linearly
# scanned val_p_ind.tolist() for every single path.
held_out_paths = set(val_p_ind.tolist())
t_idx = [p for p in all_paths.tolist() if p not in held_out_paths]

train_data = data.loc[t_idx]
# .copy(): columns are added to this frame later on, so it must not be a view of `data`
X_ass_val = data.loc[val_p_ind].copy()
len(t_idx), len(val_p_ind)

(10352, 500)

It is important that the validation paths are unseen, for reasons shown in the error analysis;
otherwise the post-processing would suffer from data leakage.

In [9]:
#check there is no cross contamination of the validation data
# (an empty result means the sampled validation path has no rows in the training frame)
leaked_rows = train_data['path'] == val_p_ind[5]
train_data.loc[leaked_rows]

Unnamed: 0_level_0,bssid_0,bssid_1,bssid_2,bssid_3,bssid_4,bssid_5,bssid_6,bssid_7,bssid_8,bssid_9,...,rssi_95,rssi_96,rssi_97,rssi_98,rssi_99,x,y,floor,path,site_id
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [8]:
def _as_model_inputs(frame):
    """Split ``frame`` into the [bssid, rssi, site] inputs in the order the model takes them."""
    return [frame.loc[:, BSSID_FEATS], frame.loc[:, RSSI_FEATS], frame.loc[:, 'site_id']]


feature_cols = BSSID_FEATS + RSSI_FEATS + ['site_id']
y_trainf = one_hot.loc[t_idx, :]
y_validf = one_hot.loc[val_p_ind, :]
X_train = train_data.loc[:, feature_cols]
X_valid = X_ass_val.loc[:, feature_cols]

checkpoint_path = f'{base_path}/RNN_{SEED}_.hdf5'
fmodel = create_fmodel(_as_model_inputs(X_train))
fmodel.fit(
    _as_model_inputs(X_train), y_trainf,
    validation_data=(_as_model_inputs(X_valid), y_validf),
    batch_size=128, epochs=100,
    shuffle=True,
    callbacks=[
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min'),
        ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min'),
        EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, mode='min', baseline=None, restore_best_weights=True),
    ],
)

# reload the best checkpoint (lowest val_loss) before predicting
fmodel.load_weights(checkpoint_path)

# Class index -> floor label. Decoding through the one-hot column labels
# replaces the former hard-coded "argmax - 2", which silently assumed that the
# lowest floor is -2 and that the floor values are contiguous.
floor_labels = one_hot.columns.to_numpy()

fvalid = fmodel.predict(_as_model_inputs(X_ass_val))
fvalid = floor_labels[np.argmax(fvalid, axis=1)]

pred = fmodel.predict(_as_model_inputs(test_data))
pred = floor_labels[np.argmax(pred, axis=1)]

ass_val_floors = fvalid
floors = pred

accuracy_score(X_ass_val['floor'], ass_val_floors)#second validation, checks the argmax and the label decoding

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 6/100
Epoch 7/100


0.9855541718555417

## Error Analysis 

In [9]:
#Error Analysis - how many paths i got wrong and how many times 
# flag every validation row whose predicted floor differs from the true one
is_wrong = (X_ass_val['floor'] - ass_val_floors) != 0
X_ass_val['wrong'] = is_wrong
wrongs = X_ass_val[X_ass_val['wrong']]
rights = X_ass_val[~X_ass_val['wrong']]
# (rows, columns) of the wrong rows, and the number of distinct paths they belong to
wrongs.shape, wrongs['path'].unique().shape

((174, 206), (38,))

**Only one floor per path, right?**
Well, I think this is a given, as the original data is presented as paths grouped by floor;
I double-checked this assumption and it holds.

So I check whether the paths I got wrong were ever predicted right:
I count the number of times each of those paths was predicted correctly.

In [10]:
#create tuples of
#(number of rows predicted correctly, number of rows predicted incorrectly)
#for every path that has at least one wrong and at least one right prediction
# Count rows per path once, instead of re-filtering both frames (and
# recomputing rights['path'].unique()) for every path.
right_counts = rights['path'].value_counts()
wrong_counts = wrongs['path'].value_counts()
[(int(right_counts[p]), int(wrong_counts[p])) for p in wrongs['path'].unique() if p in right_counts.index]

[(23, 1),
 (62, 2),
 (20, 1),
 (58, 1),
 (80, 40),
 (49, 2),
 (16, 1),
 (5, 1),
 (19, 1),
 (84, 14),
 (21, 3),
 (38, 3),
 (132, 5),
 (57, 3),
 (15, 6),
 (3, 1),
 (39, 4),
 (13, 1),
 (31, 2),
 (331, 1),
 (74, 4),
 (15, 1),
 (9, 9),
 (18, 1),
 (33, 9),
 (10, 1),
 (17, 3),
 (23, 1),
 (22, 1),
 (82, 1),
 (94, 1),
 (9, 3),
 (50, 1),
 (16, 3),
 (4, 10),
 (4, 1)]

As you can see if you unhide the result above, taking the most frequently predicted floor within each path would avoid many errors.

In [11]:
#re-elaboration taking the most frequent
# attach the per-row predictions and drop the current index, leaving a fresh
# 0..n-1 index with 'path' available only as a column
X_ass_val = X_ass_val.assign(p_floor=ass_val_floors).reset_index(drop=True)

def mode(a):
    """Return the most frequent ``p_floor`` value of the group ``a``.

    Args:
        a: DataFrame (e.g. one group of a groupby) with a ``p_floor`` column.

    Returns:
        The value with the highest count, i.e. the first entry of
        ``value_counts()``.
    """
    # Read the label straight from the value_counts index. The former
    # reset_index()['index'] lookup breaks on pandas >= 2, where that column
    # is named after the series instead of 'index'.
    return a['p_floor'].value_counts().index[0]

# one row per path: the majority-vote floor of that path's predictions
df = X_ass_val.groupby('path').apply(mode).to_frame('blended_floor_pred')

Checking the post processing Bump

In [12]:
# copy each path's majority floor back onto its rows, then score it against the truth
X_ass_val = pd.merge(X_ass_val, df, how='left', on='path')
accuracy_score(y_true=X_ass_val['floor'], y_pred=X_ass_val['blended_floor_pred'])

0.9962640099626401

I can round it up (defectivly) to 99,80%. This Rounding is because of the test size i have a scale unit of 0,2%. 

Satisfactory, so do it on the test data too and submit.

In [13]:
# split 'site_path_timestamp' on '_' once; piece 1 is used as the path id
# (presumably the id is '<site>_<path>_<timestamp>' — confirm)
id_parts = test_data['site_path_timestamp'].str.split(pat='_', n=-1, expand=True)
test_data['path'] = id_parts[1]
# number of distinct site+path combinations (sanity figure, not used afterwards)
n_site_paths = (id_parts[0] + id_parts[1]).unique().shape

test_data['p_floor'] = pred
#re-elaboration taking the most frequent predicted floor (the mode) per path
def modee1(a):
    """Return the distinct values of the ``path`` column of the group ``a``."""
    paths = a['path']
    return paths.unique()
def modee2(a):
    """Return the most frequent ``p_floor`` value of the group ``a``.

    Args:
        a: DataFrame (e.g. one group of a groupby) with a ``p_floor`` column.

    Returns:
        The first entry of ``value_counts()``, i.e. the value with the
        highest count.
    """
    # value_counts() is sorted by count, so its first index label is the mode.
    # The former reset_index()['index'] lookup raises KeyError on pandas >= 2.
    return a['p_floor'].value_counts().index[0]

# majority-vote floor for every test path, then copied back onto each test row
dft = test_data.groupby('path').apply(modee2).to_frame('my_b_floor_pred')
test_data = pd.merge(test_data, dft, how='left', on='path')


In [14]:
#fetching K' submissions to see if there is an improvement on the lb
sub = pd.read_csv('../input/indoor-location-navigation/submission.csv')
# NOTE(review): the floors are assigned by row label, so this relies on `sub`
# and `test_data` listing the same rows in the same order — confirm.
sub['floor'] = test_data['my_b_floor_pred']
# move the id column into the index so it is written as the first CSV column
sub = sub.set_index('site_path_timestamp')
sub.to_csv('submission_floor.csv')

See whether the score improves when these floor predictions are substituted in.
      Unfortunately the public leaderboard score does not improve in the visible decimals, but what about the private one?
      
Check whether there are any differences between the two sets of floor predictions: