You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
362 lines
13 KiB
362 lines
13 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"papermill": {
|
|
"duration": 0.007463,
|
|
"end_time": "2021-02-03T20:30:06.571139",
|
|
"exception": false,
|
|
"start_time": "2021-02-03T20:30:06.563676",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"### Wifi features\n",
|
|
"\n",
|
|
"This is the code to generate the wifi features available in [this dataset](https://www.kaggle.com/devinanzelmo/indoor-navigation-and-location-wifi-features). Using these features you can get a score below 14. For an example notebook using them see [this notebook](https://www.kaggle.com/devinanzelmo/wifi-features-lightgbm-starter). They only use waypoint, wifi and timestamp data to generate the solution. See this [forum post](https://www.kaggle.com/c/indoor-location-navigation/discussion/215445) for an outline of this solution method and ways to improve it.\n",
|
|
"\n",
|
|
"There are `break` statements inserted into the loops; these need to be removed to get this to run. Right now the BSSID list is written to the current working directory, and the feature CSVs are written to `../input/indoor-navigation-and-location-wifi-features/`. This takes 2-4 hours to run depending on hard drive etc. There is a lot of room for improvement in speeding up feature generation. \n",
|
|
"\n",
|
|
"**Update:** I added one line that creates a column for the path filename; this allows for GroupKFold cross-validation. \n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
|
|
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-03T20:30:06.590945Z",
|
|
"iopub.status.busy": "2021-02-03T20:30:06.589984Z",
|
|
"iopub.status.idle": "2021-02-03T20:30:06.593594Z",
|
|
"shell.execute_reply": "2021-02-03T20:30:06.592887Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.01623,
|
|
"end_time": "2021-02-03T20:30:06.593847",
|
|
"exception": false,
|
|
"start_time": "2021-02-03T20:30:06.577617",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import glob\n",
|
|
"import os\n",
|
|
"import gc\n",
|
|
"import json "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-03T20:30:06.614521Z",
|
|
"iopub.status.busy": "2021-02-03T20:30:06.613572Z",
|
|
"iopub.status.idle": "2021-02-03T20:30:06.616669Z",
|
|
"shell.execute_reply": "2021-02-03T20:30:06.616121Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.015585,
|
|
"end_time": "2021-02-03T20:30:06.616837",
|
|
"exception": false,
|
|
"start_time": "2021-02-03T20:30:06.601252",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"base_path = '../input/indoor-location-navigation/'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-03T20:30:06.639011Z",
|
|
"iopub.status.busy": "2021-02-03T20:30:06.638118Z",
|
|
"iopub.status.idle": "2021-02-03T20:30:09.333807Z",
|
|
"shell.execute_reply": "2021-02-03T20:30:09.334360Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 2.711076,
|
|
"end_time": "2021-02-03T20:30:09.334617",
|
|
"exception": false,
|
|
"start_time": "2021-02-03T20:30:06.623541",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# pull out all the buildings actually used in the test set; given the current method we don't need the other ones\n",
|
|
"ssubm = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv')\n",
|
|
"\n",
|
|
"# only 24 of the total buildings are used in the test set, \n",
|
|
"# this allows us to greatly reduce the initial size of the dataset\n",
|
|
"\n",
|
|
"ssubm_df = ssubm[\"site_path_timestamp\"].apply(lambda x: pd.Series(x.split(\"_\")))\n",
|
|
"used_buildings = sorted(ssubm_df[0].value_counts().index.tolist())\n",
|
|
"\n",
|
|
"# dictionary used to map the floor codes to the values used in the submission file. \n",
|
|
"floor_map = {\"B2\":-2, \"B1\":-1, \"F1\":0, \"F2\": 1, \"F3\":2, \"F4\":3, \"F5\":4, \"F6\":5, \"F7\":6,\"F8\":7, \"F9\":8,\n",
|
|
" \"1F\":0, \"2F\":1, \"3F\":2, \"4F\":3, \"5F\":4, \"6F\":5, \"7F\":6, \"8F\": 7, \"9F\":8}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-03T20:30:09.359905Z",
|
|
"iopub.status.busy": "2021-02-03T20:30:09.359123Z",
|
|
"iopub.status.idle": "2021-02-03T20:30:09.362909Z",
|
|
"shell.execute_reply": "2021-02-03T20:30:09.362224Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.021272,
|
|
"end_time": "2021-02-03T20:30:09.363069",
|
|
"exception": false,
|
|
"start_time": "2021-02-03T20:30:09.341797",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
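"# get only the wifi bssids that occur often enough; the threshold can be experimented with (this comment and the output filename say 1000, but the filter below currently uses > 0, which keeps every bssid)\n",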
|
|
"# these will be the only ones used when constructing features\n",
|
|
"bssid = dict()\n",
|
|
"\n",
|
|
"for building in used_buildings:\n",
|
|
" break\n",
|
|
" folders = sorted(glob.glob(os.path.join(base_path,'train/'+building+'/*')))\n",
|
|
" print(building)\n",
|
|
" wifi = list()\n",
|
|
" for folder in folders:\n",
|
|
" floor = floor_map[folder.split('/')[-1]]\n",
|
|
" files = glob.glob(os.path.join(folder, \"*.txt\"))\n",
|
|
" for file in files:\n",
|
|
" with open(file) as f:\n",
|
|
" txt = f.readlines()\n",
|
|
" for e, line in enumerate(txt):\n",
|
|
" tmp = line.strip().split()\n",
|
|
" if tmp[1] == \"TYPE_WIFI\":\n",
|
|
" wifi.append(tmp)\n",
|
|
" df = pd.DataFrame(wifi)\n",
|
|
" #top_bssid = df[3].value_counts().iloc[:500].index.tolist()\n",
|
|
" value_counts = df[3].value_counts()\n",
|
|
" top_bssid = value_counts[value_counts > 0].index.tolist()\n",
|
|
" print(len(top_bssid))\n",
|
|
" bssid[building] = top_bssid\n",
|
|
" del df\n",
|
|
" del wifi\n",
|
|
" gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-03T20:30:09.383252Z",
|
|
"iopub.status.busy": "2021-02-03T20:30:09.382581Z",
|
|
"iopub.status.idle": "2021-02-03T20:30:09.386704Z",
|
|
"shell.execute_reply": "2021-02-03T20:30:09.385809Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.016635,
|
|
"end_time": "2021-02-03T20:30:09.386885",
|
|
"exception": false,
|
|
"start_time": "2021-02-03T20:30:09.370250",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open(\"bssid_1000.json\", \"w\") as f:\n",
|
|
" json.dump(bssid, f)\n",
|
|
"\n",
|
|
"with open(\"bssid_1000.json\") as f:\n",
|
|
" bssid = json.load(f)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-03T20:30:09.418284Z",
|
|
"iopub.status.busy": "2021-02-03T20:30:09.417119Z",
|
|
"iopub.status.idle": "2021-02-03T20:30:09.420513Z",
|
|
"shell.execute_reply": "2021-02-03T20:30:09.419767Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.026514,
|
|
"end_time": "2021-02-03T20:30:09.420694",
|
|
"exception": false,
|
|
"start_time": "2021-02-03T20:30:09.394180",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# generate all the training data \n",
|
|
"building_dfs = dict()\n",
|
|
"\n",
|
|
"for building in used_buildings:\n",
|
|
" break\n",
|
|
" folders = sorted(glob.glob(os.path.join(base_path,'train', building +'/*')))\n",
|
|
" dfs = list()\n",
|
|
" index = sorted(bssid[building])\n",
|
|
" print(building)\n",
|
|
" for folder in folders:\n",
|
|
" floor = floor_map[folder.split('/')[-1]]\n",
|
|
" files = glob.glob(os.path.join(folder, \"*.txt\"))\n",
|
|
" print(floor)\n",
|
|
" for file in files:\n",
|
|
" wifi = list()\n",
|
|
" waypoint = list()\n",
|
|
" with open(file) as f:\n",
|
|
" txt = f.readlines()\n",
|
|
" for line in txt:\n",
|
|
" line = line.strip().split()\n",
|
|
" if line[1] == \"TYPE_WAYPOINT\":\n",
|
|
" waypoint.append(line)\n",
|
|
" if line[1] == \"TYPE_WIFI\":\n",
|
|
" wifi.append(line)\n",
|
|
"\n",
|
|
" df = pd.DataFrame(np.array(wifi)) \n",
|
|
"\n",
|
|
" # generate a feature, and label for each wifi block\n",
|
|
" for gid, g in df.groupby(0):\n",
|
|
" dists = list()\n",
|
|
" for e, k in enumerate(waypoint):\n",
|
|
" dist = abs(int(gid) - int(k[0]))\n",
|
|
" dists.append(dist)\n",
|
|
" nearest_wp_index = np.argmin(dists)\n",
|
|
" \n",
|
|
" g = g.drop_duplicates(subset=3)\n",
|
|
" tmp = g.iloc[:,3:5]\n",
|
|
" feat = tmp.set_index(3).reindex(index).replace(np.nan, -999).T\n",
|
|
" feat[\"x\"] = float(waypoint[nearest_wp_index][2])\n",
|
|
" feat[\"y\"] = float(waypoint[nearest_wp_index][3])\n",
|
|
" feat[\"f\"] = floor\n",
|
|
" feat[\"path\"] = file.split('/')[-1].split('.')[0] # useful for crossvalidation\n",
|
|
" dfs.append(feat)\n",
|
|
" \n",
|
|
" building_df = pd.concat(dfs)\n",
|
|
" building_dfs[building] = df\n",
|
|
" building_df.to_csv('../input/indoor-navigation-and-location-wifi-features/'+building+\"_train.csv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-03T20:30:09.454304Z",
|
|
"iopub.status.busy": "2021-02-03T20:30:09.451093Z",
|
|
"iopub.status.idle": "2021-02-03T20:30:09.464308Z",
|
|
"shell.execute_reply": "2021-02-03T20:30:09.464854Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.036471,
|
|
"end_time": "2021-02-03T20:30:09.465079",
|
|
"exception": false,
|
|
"start_time": "2021-02-03T20:30:09.428608",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Generate the features for the test set\n",
|
|
"\n",
|
|
"ssubm_building_g = ssubm_df.groupby(0)\n",
|
|
"feature_dict = dict()\n",
|
|
"\n",
|
|
"for gid0, g0 in ssubm_building_g:\n",
|
|
" break\n",
|
|
" index = sorted(bssid[g0.iloc[0,0]])\n",
|
|
" feats = list()\n",
|
|
" print(gid0)\n",
|
|
" for gid,g in g0.groupby(1):\n",
|
|
"\n",
|
|
" # get all wifi time locations, \n",
|
|
" with open(os.path.join(base_path, 'test/' + g.iloc[0,1] + '.txt')) as f:\n",
|
|
" txt = f.readlines()\n",
|
|
"\n",
|
|
" wifi = list()\n",
|
|
"\n",
|
|
" for line in txt:\n",
|
|
" line = line.strip().split()\n",
|
|
" if line[1] == \"TYPE_WIFI\":\n",
|
|
" wifi.append(line)\n",
|
|
"\n",
|
|
" wifi_df = pd.DataFrame(wifi)\n",
|
|
" wifi_points = pd.DataFrame(wifi_df.groupby(0).count().index.tolist())\n",
|
|
" \n",
|
|
" for timepoint in g.iloc[:,2].tolist():\n",
|
|
"\n",
|
|
" deltas = (wifi_points.astype(int) - int(timepoint)).abs()\n",
|
|
" min_delta_idx = deltas.values.argmin()\n",
|
|
" wifi_block_timestamp = wifi_points.iloc[min_delta_idx].values[0]\n",
|
|
" \n",
|
|
" wifi_block = wifi_df[wifi_df[0] == wifi_block_timestamp].drop_duplicates(subset=3)\n",
|
|
" feat = wifi_block.set_index(3)[4].reindex(index).fillna(-999)\n",
|
|
"\n",
|
|
" feat['site_path_timestamp'] = g.iloc[0,0] + \"_\" + g.iloc[0,1] + \"_\" + timepoint\n",
|
|
" feats.append(feat)\n",
|
|
" feature_df = pd.concat(feats, axis=1).T\n",
|
|
" feature_df.to_csv('../input/indoor-navigation-and-location-wifi-features/'+gid0+\"_test.csv\")\n",
|
|
" feature_dict[gid0] = feature_df"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.3"
|
|
},
|
|
"papermill": {
|
|
"default_parameters": {},
|
|
"duration": 9.894085,
|
|
"end_time": "2021-02-03T20:30:10.083699",
|
|
"environment_variables": {},
|
|
"exception": null,
|
|
"input_path": "__notebook__.ipynb",
|
|
"output_path": "__notebook__.ipynb",
|
|
"parameters": {},
|
|
"start_time": "2021-02-03T20:30:00.189614",
|
|
"version": "2.2.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|