{ "cells": [ { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.007463, "end_time": "2021-02-03T20:30:06.571139", "exception": false, "start_time": "2021-02-03T20:30:06.563676", "status": "completed" }, "tags": [] }, "source": [ "### Wifi features\n", "\n", "This is the code to generate the wifi features available in [this dataset](https://www.kaggle.com/devinanzelmo/indoor-navigation-and-location-wifi-features). Using these features can get a score below 14. For an example notebook using them, see [this notebook](https://www.kaggle.com/devinanzelmo/wifi-features-lightgbm-starter). They only use waypoint, wifi and timestamp data to generate a solution. See this [forum post](https://www.kaggle.com/c/indoor-location-navigation/discussion/215445) for an outline of this solution method and of ways to improve it.\n", "\n", "There are `break` statements inserted into the loops, which need to be removed to get this to run. Right now data is written to the current working directory. This takes 2-4 hours to run, depending on the hard drive etc. There is a lot of room for speeding up feature generation. \n", "\n", "**Update:** I added one line that creates a column for the path filename; this allows for a GroupKFold cross-validation. 
\n" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2021-02-03T20:30:06.590945Z", "iopub.status.busy": "2021-02-03T20:30:06.589984Z", "iopub.status.idle": "2021-02-03T20:30:06.593594Z", "shell.execute_reply": "2021-02-03T20:30:06.592887Z" }, "papermill": { "duration": 0.01623, "end_time": "2021-02-03T20:30:06.593847", "exception": false, "start_time": "2021-02-03T20:30:06.577617", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "import pandas as pd\n",
  "import numpy as np\n",
  "import glob\n",
  "import os\n",
  "import gc\n",
  "import json "
] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2021-02-03T20:30:06.614521Z", "iopub.status.busy": "2021-02-03T20:30:06.613572Z", "iopub.status.idle": "2021-02-03T20:30:06.616669Z", "shell.execute_reply": "2021-02-03T20:30:06.616121Z" }, "papermill": { "duration": 0.015585, "end_time": "2021-02-03T20:30:06.616837", "exception": false, "start_time": "2021-02-03T20:30:06.601252", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# root of the competition data\n",
  "base_path = '../input/indoor-location-navigation/'\n",
  "\n",
  "# generated files go to the current working directory, as stated in the notebook description\n",
  "output_dir = '.'\n",
  "\n",
  "# a bssid must occur more than this many times in a building to become a feature column\n",
  "# (this number can be experimented with)\n",
  "min_bssid_count = 1000"
] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2021-02-03T20:30:06.639011Z", "iopub.status.busy": "2021-02-03T20:30:06.638118Z", "iopub.status.idle": "2021-02-03T20:30:09.333807Z", "shell.execute_reply": "2021-02-03T20:30:09.334360Z" }, "papermill": { "duration": 2.711076, "end_time": "2021-02-03T20:30:09.334617", "exception": false, "start_time": "2021-02-03T20:30:06.623541", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# pull out all the buildings actually used in the test set, given current method we don't need the other ones\n",
  "ssubm = pd.read_csv(os.path.join(base_path, 'sample_submission.csv'))\n",
  "\n",
  "# only 24 of the total buildings are used in the test set, \n",
  "# this allows us to greatly reduce the initial size of the dataset\n",
  "\n",
  "# split the key into columns 0 (building), 1 (path file name) and 2 (timestamp)\n",
  "ssubm_df = ssubm[\"site_path_timestamp\"].str.split(\"_\", expand=True)\n",
  "used_buildings = sorted(ssubm_df[0].unique().tolist())\n",
  "\n",
  "# dictionary used to map the floor codes to the values used in the submission file. \n",
  "floor_map = {\"B2\":-2, \"B1\":-1, \"F1\":0, \"F2\": 1, \"F3\":2, \"F4\":3, \"F5\":4, \"F6\":5, \"F7\":6,\"F8\":7, \"F9\":8,\n",
  "             \"1F\":0, \"2F\":1, \"3F\":2, \"4F\":3, \"5F\":4, \"6F\":5, \"7F\":6, \"8F\": 7, \"9F\":8}"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def read_sensor_rows(path, row_types):\n",
  "    \"\"\"Read one path file and return its rows grouped by row type.\n",
  "\n",
  "    Every line is split on whitespace and its second token is taken as the\n",
  "    row type (e.g. 'TYPE_WIFI'). Lines with fewer than two tokens, and rows\n",
  "    whose type is not requested, are skipped.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    path : str\n",
  "        Path of the text file to read.\n",
  "    row_types : iterable of str\n",
  "        Row types to collect.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    dict\n",
  "        Maps each requested row type to a list of token lists, in file order.\n",
  "    \"\"\"\n",
  "    rows = {row_type: list() for row_type in row_types}\n",
  "    with open(path, encoding='utf-8') as f:\n",
  "        for line in f:\n",
  "            tokens = line.strip().split()\n",
  "            if len(tokens) > 1 and tokens[1] in rows:\n",
  "                rows[tokens[1]].append(tokens)\n",
  "    return rows"
] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2021-02-03T20:30:09.359905Z", "iopub.status.busy": "2021-02-03T20:30:09.359123Z", "iopub.status.idle": "2021-02-03T20:30:09.362909Z", "shell.execute_reply": "2021-02-03T20:30:09.362224Z" }, "papermill": { "duration": 0.021272, "end_time": "2021-02-03T20:30:09.363069", "exception": false, "start_time": "2021-02-03T20:30:09.341797", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# get only the wifi bssid that occur over min_bssid_count times in a building\n",
  "# these will be the only ones used when constructing features\n",
  "bssid = dict()\n",
  "\n",
  "for building in used_buildings:\n",
  "    break\n",
  "    print(building)\n",
  "    # only the bssid token (column 3 of a wifi row) is needed for counting\n",
  "    seen_bssid = list()\n",
  "    for file in glob.glob(os.path.join(base_path, 'train', building, '*', '*.txt')):\n",
  "        wifi_rows = read_sensor_rows(file, ['TYPE_WIFI'])['TYPE_WIFI']\n",
  "        seen_bssid.extend(row[3] for row in wifi_rows if len(row) > 3)\n",
  "    value_counts = pd.Series(seen_bssid, dtype=object).value_counts()\n",
  "    # alternative selection: keep a fixed number of the most frequent bssid, e.g. value_counts.iloc[:500]\n",
  "    top_bssid = value_counts[value_counts > min_bssid_count].index.tolist()\n",
  "    print(len(top_bssid))\n",
  "    bssid[building] = top_bssid\n",
  "    del seen_bssid\n",
  "    gc.collect()"
] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2021-02-03T20:30:09.383252Z", "iopub.status.busy": "2021-02-03T20:30:09.382581Z", "iopub.status.idle": "2021-02-03T20:30:09.386704Z", "shell.execute_reply": "2021-02-03T20:30:09.385809Z" }, "papermill": { "duration": 0.016635, "end_time": "2021-02-03T20:30:09.386885", "exception": false, "start_time": "2021-02-03T20:30:09.370250", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# save the selected bssid per building, then read the file back\n",
  "with open(\"bssid_1000.json\", \"w\") as f:\n",
  "    json.dump(bssid, f)\n",
  "\n",
  "with open(\"bssid_1000.json\") as f:\n",
  "    bssid = json.load(f)"
] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2021-02-03T20:30:09.418284Z", "iopub.status.busy": "2021-02-03T20:30:09.417119Z", "iopub.status.idle": "2021-02-03T20:30:09.420513Z", "shell.execute_reply": "2021-02-03T20:30:09.419767Z" }, "papermill": { "duration": 0.026514, "end_time": "2021-02-03T20:30:09.420694", "exception": false, "start_time": "2021-02-03T20:30:09.394180", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# generate all the training data \n",
  "building_dfs = dict()\n",
  "\n",
  "for building in used_buildings:\n",
  "    break\n",
  "    folders = sorted(glob.glob(os.path.join(base_path, 'train', building, '*')))\n",
  "    dfs = list()\n",
  "    index = sorted(bssid[building])\n",
  "    print(building)\n",
  "    for folder in folders:\n",
  "        floor = floor_map[os.path.basename(folder)]\n",
  "        files = glob.glob(os.path.join(folder, '*.txt'))\n",
  "        print(floor)\n",
  "        for file in files:\n",
  "            rows = read_sensor_rows(file, ['TYPE_WAYPOINT', 'TYPE_WIFI'])\n",
  "            waypoint = rows['TYPE_WAYPOINT']\n",
  "            wifi = rows['TYPE_WIFI']\n",
  "            if not waypoint or not wifi:\n",
  "                # no label to attach, or no wifi block to build a feature row from\n",
  "                continue\n",
  "\n",
  "            df = pd.DataFrame(wifi)\n",
  "            # waypoint timestamps are converted once per file instead of once per wifi block\n",
  "            waypoint_times = np.array([int(k[0]) for k in waypoint], dtype=np.int64)\n",
  "            path_name = os.path.basename(file).split('.')[0]\n",
  "\n",
  "            # generate a feature, and label for each wifi block (rows sharing one timestamp)\n",
  "            for gid, g in df.groupby(0):\n",
  "                # the label is the waypoint closest in time to the wifi block\n",
  "                nearest_wp = waypoint[np.abs(waypoint_times - int(gid)).argmin()]\n",
  "\n",
  "                # reindex needs unique labels, so keep the first row of each bssid\n",
  "                g = g.drop_duplicates(subset=3)\n",
  "                # columns 3 and 4 are the bssid and the value stored for it; unseen bssid get -999\n",
  "                feat = g.iloc[:, 3:5].set_index(3).reindex(index).fillna(-999).T\n",
  "                feat['x'] = float(nearest_wp[2])\n",
  "                feat['y'] = float(nearest_wp[3])\n",
  "                feat['f'] = floor\n",
  "                feat['path'] = path_name  # useful for crossvalidation\n",
  "                dfs.append(feat)\n",
  "\n",
  "    building_df = pd.concat(dfs)\n",
  "    # keep the assembled feature table (not the last file's raw wifi rows)\n",
  "    building_dfs[building] = building_df\n",
  "    building_df.to_csv(os.path.join(output_dir, building + '_train.csv'))"
] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2021-02-03T20:30:09.454304Z", "iopub.status.busy": "2021-02-03T20:30:09.451093Z", "iopub.status.idle": "2021-02-03T20:30:09.464308Z", "shell.execute_reply": "2021-02-03T20:30:09.464854Z" }, "papermill": { "duration": 0.036471, "end_time": "2021-02-03T20:30:09.465079", "exception": false, "start_time": "2021-02-03T20:30:09.428608", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# Generate the features for the test set\n",
  "\n",
  "ssubm_building_g = ssubm_df.groupby(0)\n",
  "feature_dict = dict()\n",
  "\n",
  "for gid0, g0 in ssubm_building_g:\n",
  "    break\n",
  "    index = sorted(bssid[gid0])\n",
  "    feats = list()\n",
  "    print(gid0)\n",
  "    for gid, g in g0.groupby(1):\n",
  "\n",
  "        # get all wifi time locations of this path\n",
  "        wifi = read_sensor_rows(os.path.join(base_path, 'test', gid + '.txt'), ['TYPE_WIFI'])['TYPE_WIFI']\n",
  "        wifi_df = pd.DataFrame(wifi)\n",
  "        if wifi:\n",
  "            # distinct block timestamps, converted to integers once per path\n",
  "            wifi_times = sorted(wifi_df[0].unique())\n",
  "            wifi_times_int = np.array([int(t) for t in wifi_times], dtype=np.int64)\n",
  "\n",
  "        for timepoint in g.iloc[:, 2].tolist():\n",
  "            if wifi:\n",
  "                # use the wifi block closest in time to the requested timestamp\n",
  "                nearest = np.abs(wifi_times_int - int(timepoint)).argmin()\n",
  "                wifi_block = wifi_df[wifi_df[0] == wifi_times[nearest]].drop_duplicates(subset=3)\n",
  "                feat = wifi_block.set_index(3)[4].reindex(index).fillna(-999)\n",
  "            else:\n",
  "                # no wifi rows in this path: still emit a row, with every bssid marked missing\n",
  "                feat = pd.Series(-999, index=index, name=4, dtype=object)\n",
  "\n",
  "            feat['site_path_timestamp'] = gid0 + '_' + gid + '_' + timepoint\n",
  "            feats.append(feat)\n",
  "    feature_df = pd.concat(feats, axis=1).T\n",
  "    feature_df.to_csv(os.path.join(output_dir, gid0 + '_test.csv'))\n",
  "    feature_dict[gid0] = feature_df"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "papermill": { "default_parameters": {}, "duration": 9.894085, "end_time": "2021-02-03T20:30:10.083699", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2021-02-03T20:30:00.189614", "version": "2.2.2" } }, "nbformat": 4, "nbformat_minor": 4 }