You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1120 lines
31 KiB
1120 lines
31 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"papermill": {
|
|
"duration": 0.017006,
|
|
"end_time": "2021-02-19T07:32:16.999516",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:16.982510",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"In this notebook, I'd like to share how to label encode all wifi files in parallel using dask. On my workstation with 20 cpu cores, it took only 10 mins to encode all the data. Due to the limitation of kaggle kernel, I have to set number of workers to 2. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:17.032789Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:17.031894Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:17.783035Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:17.783614Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.769447,
|
|
"end_time": "2021-02-19T07:32:17.783971",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:17.014524",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"metadata sample_submission.csv test train\r\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"! ls ../input/indoor-location-navigation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:17.817174Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:17.816429Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:18.815016Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:18.813025Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 1.016149,
|
|
"end_time": "2021-02-19T07:32:18.815223",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:17.799074",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from glob import glob\n",
|
|
"import os\n",
|
|
"import sys\n",
|
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"from tqdm import tqdm"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:18.859430Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:18.854138Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:19.455581Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:19.455022Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.62535,
|
|
"end_time": "2021-02-19T07:32:19.455739",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:18.830389",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from tqdm import tqdm\n",
|
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|
"from glob import glob\n",
|
|
"from dask.distributed import wait\n",
|
|
"\n",
|
|
"SENSORS = ['acce','acce_uncali','gyro',\n",
|
|
" 'gyro_uncali','magn','magn_uncali','ahrs']\n",
|
|
"\n",
|
|
"NFEAS = {\n",
|
|
" 'acce': 3,\n",
|
|
" 'acce_uncali': 3,\n",
|
|
" 'gyro': 3,\n",
|
|
" 'gyro_uncali': 3,\n",
|
|
" 'magn': 3,\n",
|
|
" 'magn_uncali': 3,\n",
|
|
" 'ahrs': 3,\n",
|
|
" 'wifi': 1,\n",
|
|
" 'ibeacon': 1,\n",
|
|
" 'waypoint': 3\n",
|
|
"}\n",
|
|
"\n",
|
|
"ACOLS = ['timestamp','x','y','z']\n",
|
|
" \n",
|
|
"FIELDS = {\n",
|
|
" 'acce': ACOLS,\n",
|
|
" 'acce_uncali': ACOLS,\n",
|
|
" 'gyro': ACOLS,\n",
|
|
" 'gyro_uncali': ACOLS,\n",
|
|
" 'magn': ACOLS,\n",
|
|
" 'magn_uncali': ACOLS,\n",
|
|
" 'ahrs': ACOLS,\n",
|
|
" 'wifi': ['timestamp','ssid','bssid','rssi','last_timestamp'],\n",
|
|
" 'ibeacon': ['timestamp','code','rssi'],\n",
|
|
" 'waypoint': ['timestamp','x','y']\n",
|
|
"}\n",
|
|
"\n",
|
|
"def to_frame(data, col):\n",
|
|
" cols = FIELDS[col]\n",
|
|
" if data.shape[0]>0:\n",
|
|
" df = pd.DataFrame(data, columns=cols)\n",
|
|
" else:\n",
|
|
" df = create_dummy_df(cols)\n",
|
|
" for col in df.columns:\n",
|
|
" if 'timestamp' in col:\n",
|
|
" df[col] = df[col].astype('int64')\n",
|
|
" return df\n",
|
|
"\n",
|
|
"def create_dummy_df(cols):\n",
|
|
" df = pd.DataFrame()\n",
|
|
" for col in cols:\n",
|
|
" df[col] = [0]\n",
|
|
" if col in ['ssid','bssid']:\n",
|
|
" df[col] = df[col].map(str)\n",
|
|
" return df\n",
|
|
"\n",
|
|
"def get_building_floor(fname):\n",
|
|
" xx = fname.split('/')\n",
|
|
" return xx[-3],xx[-2]\n",
|
|
"\n",
|
|
"def get_test_building(name):\n",
|
|
" with open(name) as f:\n",
|
|
" for c,line in enumerate(f):\n",
|
|
" if c==1:\n",
|
|
" x = line.split()[1].split(':')[1]\n",
|
|
" return x \n",
|
|
"\n",
|
|
"def get_floor_target(floor):\n",
|
|
" floor = floor.lower()\n",
|
|
" if floor in ['bf','bm']:\n",
|
|
" return None\n",
|
|
" elif floor == 'b':\n",
|
|
" return -1\n",
|
|
" if floor.startswith('f'):\n",
|
|
" return int(floor[1])-1\n",
|
|
" elif floor.endswith('f'):\n",
|
|
" return int(floor[0])-1\n",
|
|
" elif floor.startswith('b'):\n",
|
|
" return -int(floor[1])\n",
|
|
" elif floor.endswith('b'):\n",
|
|
" return -int(floor[0])\n",
|
|
" else:\n",
|
|
" return None\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:19.505222Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:19.494804Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:19.517069Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:19.516496Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.04658,
|
|
"end_time": "2021-02-19T07:32:19.517227",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:19.470647",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from dataclasses import dataclass\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"\n",
|
|
"@dataclass\n",
|
|
"class ReadData:\n",
|
|
" acce: np.ndarray\n",
|
|
" acce_uncali: np.ndarray\n",
|
|
" gyro: np.ndarray\n",
|
|
" gyro_uncali: np.ndarray\n",
|
|
" magn: np.ndarray\n",
|
|
" magn_uncali: np.ndarray\n",
|
|
" ahrs: np.ndarray\n",
|
|
" wifi: np.ndarray\n",
|
|
" ibeacon: np.ndarray\n",
|
|
" waypoint: np.ndarray\n",
|
|
"\n",
|
|
"\n",
|
|
"def read_data_file(data_filename):\n",
|
|
" acce = []\n",
|
|
" acce_uncali = []\n",
|
|
" gyro = []\n",
|
|
" gyro_uncali = []\n",
|
|
" magn = []\n",
|
|
" magn_uncali = []\n",
|
|
" ahrs = []\n",
|
|
" wifi = []\n",
|
|
" ibeacon = []\n",
|
|
" waypoint = []\n",
|
|
"\n",
|
|
" with open(data_filename, 'r', encoding='utf-8') as file:\n",
|
|
" lines = file.readlines()\n",
|
|
"\n",
|
|
" for line_data in lines:\n",
|
|
" line_data = line_data.strip()\n",
|
|
" if not line_data or line_data[0] == '#':\n",
|
|
" continue\n",
|
|
"\n",
|
|
" line_data = line_data.split('\\t')\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_ACCELEROMETER':\n",
|
|
" acce.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_ACCELEROMETER_UNCALIBRATED':\n",
|
|
" acce_uncali.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_GYROSCOPE':\n",
|
|
" gyro.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_GYROSCOPE_UNCALIBRATED':\n",
|
|
" gyro_uncali.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_MAGNETIC_FIELD':\n",
|
|
" magn.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_MAGNETIC_FIELD_UNCALIBRATED':\n",
|
|
" magn_uncali.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_ROTATION_VECTOR':\n",
|
|
" if len(line_data)>=5:\n",
|
|
" ahrs.append([int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_WIFI':\n",
|
|
" sys_ts = line_data[0]\n",
|
|
" ssid = line_data[2]\n",
|
|
" bssid = line_data[3]\n",
|
|
" rssi = line_data[4]\n",
|
|
" lastseen_ts = line_data[6]\n",
|
|
" wifi_data = [sys_ts, ssid, bssid, rssi, lastseen_ts]\n",
|
|
" wifi.append(wifi_data)\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_BEACON':\n",
|
|
" ts = line_data[0]\n",
|
|
" uuid = line_data[2]\n",
|
|
" major = line_data[3]\n",
|
|
" minor = line_data[4]\n",
|
|
" rssi = line_data[6]\n",
|
|
" ibeacon_data = [ts, '_'.join([uuid, major, minor]), rssi]\n",
|
|
" ibeacon.append(ibeacon_data)\n",
|
|
" continue\n",
|
|
"\n",
|
|
" if line_data[1] == 'TYPE_WAYPOINT':\n",
|
|
" waypoint.append([int(line_data[0]), float(line_data[2]), float(line_data[3])])\n",
|
|
"\n",
|
|
" acce = np.array(acce)\n",
|
|
" acce_uncali = np.array(acce_uncali)\n",
|
|
" gyro = np.array(gyro)\n",
|
|
" gyro_uncali = np.array(gyro_uncali)\n",
|
|
" magn = np.array(magn)\n",
|
|
" magn_uncali = np.array(magn_uncali)\n",
|
|
" ahrs = np.array(ahrs)\n",
|
|
" wifi = np.array(wifi)\n",
|
|
" ibeacon = np.array(ibeacon)\n",
|
|
" waypoint = np.array(waypoint)\n",
|
|
"\n",
|
|
" return ReadData(acce, acce_uncali, gyro, gyro_uncali, magn, magn_uncali, ahrs, wifi, ibeacon, waypoint)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:19.552943Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:19.551880Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:19.555636Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:19.555024Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.023563,
|
|
"end_time": "2021-02-19T07:32:19.555795",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:19.532232",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import dask\n",
|
|
"from dask.distributed import Client, wait, LocalCluster"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:19.591248Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:19.590578Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:21.064629Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:21.065319Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 1.494609,
|
|
"end_time": "2021-02-19T07:32:21.065605",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:19.570996",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table style=\"border: 2px solid white;\">\n",
|
|
"<tr>\n",
|
|
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
|
|
"<h3 style=\"text-align: left;\">Client</h3>\n",
|
|
"<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
|
|
" <li><b>Scheduler: </b>tcp://127.0.0.1:40123</li>\n",
|
|
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a></li>\n",
|
|
"</ul>\n",
|
|
"</td>\n",
|
|
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
|
|
"<h3 style=\"text-align: left;\">Cluster</h3>\n",
|
|
"<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
|
|
" <li><b>Workers: </b>2</li>\n",
|
|
" <li><b>Cores: </b>2</li>\n",
|
|
" <li><b>Memory: </b>17.18 GB</li>\n",
|
|
"</ul>\n",
|
|
"</td>\n",
|
|
"</tr>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<Client: 'tcp://127.0.0.1:40123' processes=2 threads=2, memory=17.18 GB>"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# set n_workers to number of cores\n",
|
|
"client = Client(n_workers=2, \n",
|
|
" threads_per_worker=1)\n",
|
|
"client"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"papermill": {
|
|
"duration": 0.015792,
|
|
"end_time": "2021-02-19T07:32:21.097941",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:21.082149",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"### Label encode wifi ID features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:21.137129Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:21.136113Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:21.139703Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:21.139149Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.025247,
|
|
"end_time": "2021-02-19T07:32:21.139872",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:21.114625",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"PATH = '../input/indoor-location-navigation'\n",
|
|
"OUT = '../input/wifi_lbl_encode'\n",
|
|
"os.mkdir(OUT)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:21.178354Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:21.177687Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:21.188733Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:21.187818Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.031758,
|
|
"end_time": "2021-02-19T07:32:21.188900",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:21.157142",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def read_wifi_ssid_bssid(name):\n",
|
|
" data = read_data_file(name)\n",
|
|
" dw = to_frame(data.wifi,'wifi')\n",
|
|
" ss = dw['ssid'].unique()\n",
|
|
" bs = dw['bssid'].unique()\n",
|
|
" return ss,bs\n",
|
|
"\n",
|
|
"def encode_wifi_ssid_bssid(name, lbl_ssid, lbl_bssid):\n",
|
|
" data = read_data_file(name)\n",
|
|
" dw = to_frame(data.wifi,'wifi')\n",
|
|
" dw['ssid'] = lbl_ssid.transform(dw['ssid'])\n",
|
|
" dw['bssid'] = lbl_bssid.transform(dw['bssid'])\n",
|
|
" name = name.replace(PATH,OUT)\n",
|
|
" dw.to_csv(name,index=False)\n",
|
|
" return dw"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"papermill": {
|
|
"duration": 0.016428,
|
|
"end_time": "2021-02-19T07:32:21.222099",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:21.205671",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"### Run"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:21.258939Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:21.258260Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:25.615593Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:25.615024Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 4.377188,
|
|
"end_time": "2021-02-19T07:32:25.615758",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:21.238570",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(626, 26925)"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"train_files = glob(f'{PATH}/train/*/*/*.txt')\n",
|
|
"test_files = glob(f'{PATH}/test/*.txt')\n",
|
|
"len(test_files),len(train_files)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:25.663419Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:25.662683Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:25.734282Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:25.733346Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.100222,
|
|
"end_time": "2021-02-19T07:32:25.734466",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:25.634244",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(numpy.ndarray,\n",
|
|
" (19,),\n",
|
|
" (19,),\n",
|
|
" '../input/indoor-location-navigation/train/5cd56c0ce2acfd2d33b6ab27/B1/5d09a625bd54340008acddb9.txt')"
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"name = train_files[0]\n",
|
|
"ss,bs = read_wifi_ssid_bssid(name)\n",
|
|
"type(ss),ss.shape,bs.shape,name"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:25.783302Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:25.782524Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:25.944817Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:25.945372Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.193007,
|
|
"end_time": "2021-02-19T07:32:25.945570",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:25.752563",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"os.mkdir(f'{OUT}/train')\n",
|
|
"os.mkdir(f'{OUT}/test')\n",
|
|
"for site in os.listdir(f'{PATH}/train'):\n",
|
|
" os.mkdir(f'{OUT}/train/{site}')\n",
|
|
" for floor in os.listdir(f'{PATH}/train/{site}'):\n",
|
|
" os.mkdir(f'{OUT}/train/{site}/{floor}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:25.987521Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:25.986482Z",
|
|
"iopub.status.idle": "2021-02-19T07:32:26.098839Z",
|
|
"shell.execute_reply": "2021-02-19T07:32:26.099376Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.135143,
|
|
"end_time": "2021-02-19T07:32:26.099568",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:25.964425",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"100%|██████████| 26925/26925 [00:00<00:00, 288117.37it/s]"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"CPU times: user 97.7 ms, sys: 2.84 ms, total: 101 ms\n",
|
|
"Wall time: 101 ms\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%%time\n",
|
|
"buildings = []\n",
|
|
"floors = []\n",
|
|
"used = []\n",
|
|
"for fname in tqdm(train_files):\n",
|
|
" b,f = get_building_floor(fname)\n",
|
|
" f = get_floor_target(f)\n",
|
|
" if f is None:\n",
|
|
" continue\n",
|
|
" used.append(fname)\n",
|
|
" buildings.append(b)\n",
|
|
" floors.append(f)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T07:32:26.143381Z",
|
|
"iopub.status.busy": "2021-02-19T07:32:26.142328Z",
|
|
"iopub.status.idle": "2021-02-19T08:43:54.571192Z",
|
|
"shell.execute_reply": "2021-02-19T08:43:54.571750Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 4288.452672,
|
|
"end_time": "2021-02-19T08:43:54.572122",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T07:32:26.119450",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"100%|██████████| 24732/24732 [00:11<00:00, 2222.39it/s]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"CPU times: user 39min 16s, sys: 1min 40s, total: 40min 57s\n",
|
|
"Wall time: 1h 11min 28s\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%%time\n",
|
|
"futures = [] # save the future since dask is lazy, otherwise nothing is executed.\n",
|
|
"for fname in tqdm(test_files+used):\n",
|
|
" f = client.submit(read_wifi_ssid_bssid,fname) \n",
|
|
" futures.append(f) \n",
|
|
"_ = wait(futures)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T08:43:54.670582Z",
|
|
"iopub.status.busy": "2021-02-19T08:43:54.669650Z",
|
|
"iopub.status.idle": "2021-02-19T08:45:35.428441Z",
|
|
"shell.execute_reply": "2021-02-19T08:45:35.427413Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 100.812271,
|
|
"end_time": "2021-02-19T08:45:35.428636",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T08:43:54.616365",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"100%|██████████| 24732/24732 [01:40<00:00, 246.25it/s]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"CPU times: user 1min 9s, sys: 9.36 s, total: 1min 18s\n",
|
|
"Wall time: 1min 40s\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"((1536700,), (4858800,))"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"%%time\n",
|
|
"ss = []\n",
|
|
"bs = []\n",
|
|
"for f in tqdm(futures):\n",
|
|
" s,b = f.result()\n",
|
|
" ss.append(s)\n",
|
|
" bs.append(b)\n",
|
|
"ss = np.concatenate(ss)\n",
|
|
"bs = np.concatenate(bs)\n",
|
|
"ss.shape, bs.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T08:45:36.434012Z",
|
|
"iopub.status.busy": "2021-02-19T08:45:36.423373Z",
|
|
"iopub.status.idle": "2021-02-19T08:45:36.455609Z",
|
|
"shell.execute_reply": "2021-02-19T08:45:36.454894Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.675897,
|
|
"end_time": "2021-02-19T08:45:36.455784",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T08:45:35.779887",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(74246,)\n",
|
|
"CPU times: user 283 ms, sys: 6.93 ms, total: 290 ms\n",
|
|
"Wall time: 289 ms\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"array(['0', '000015ed38fdbc763149432d7ba3b7bd208461d3',\n",
|
|
" '000073084baa0ea60776e25c07c6ee6b988aa072', ...,\n",
|
|
" 'fffe51167f1ad1bf26dda45ccfc40b5d7fab8384',\n",
|
|
" 'fffe9c57a9b25623ac219260c1b5155087a788e9',\n",
|
|
" 'ffff286949cbddd576ed8b2d5e16ce30eab87af6'], dtype=object)"
|
|
]
|
|
},
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"%%time\n",
|
|
"\n",
|
|
"lbl_ssid = LabelEncoder()\n",
|
|
"lbl_ssid.fit(ss)\n",
|
|
"print(lbl_ssid.classes_.shape)\n",
|
|
"lbl_ssid.classes_"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T08:45:38.090288Z",
|
|
"iopub.status.busy": "2021-02-19T08:45:38.089396Z",
|
|
"iopub.status.idle": "2021-02-19T08:45:38.127469Z",
|
|
"shell.execute_reply": "2021-02-19T08:45:38.126844Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 1.317812,
|
|
"end_time": "2021-02-19T08:45:38.127643",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T08:45:36.809831",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(216210,)\n",
|
|
"CPU times: user 948 ms, sys: 9.79 ms, total: 958 ms\n",
|
|
"Wall time: 942 ms\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"array(['0', '00001d7b6fbf0a24da65285b686b03c6e796962a',\n",
|
|
" '0000fe40d201cfc6cada502b07f29883cd17fe4a', ...,\n",
|
|
" 'ffff295a9e6ae90d9de67a1e1939ba969b765259',\n",
|
|
" 'ffff2c098362b016764229a1bb5e06d10cf0d895',\n",
|
|
" 'ffffb8116ceb5c0326ec2eb039028ec71ffdfbab'], dtype=object)"
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"%%time\n",
|
|
"\n",
|
|
"lbl_bssid = LabelEncoder()\n",
|
|
"lbl_bssid.fit(bs)\n",
|
|
"print(lbl_bssid.classes_.shape)\n",
|
|
"lbl_bssid.classes_"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"papermill": {
|
|
"duration": 0.349307,
|
|
"end_time": "2021-02-19T08:45:38.841314",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T08:45:38.492007",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"Let's look at one encode result."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T08:45:39.579857Z",
|
|
"iopub.status.busy": "2021-02-19T08:45:39.579079Z",
|
|
"iopub.status.idle": "2021-02-19T08:45:39.910385Z",
|
|
"shell.execute_reply": "2021-02-19T08:45:39.909623Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 0.720966,
|
|
"end_time": "2021-02-19T08:45:39.910549",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T08:45:39.189583",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>timestamp</th>\n",
|
|
" <th>ssid</th>\n",
|
|
" <th>bssid</th>\n",
|
|
" <th>rssi</th>\n",
|
|
" <th>last_timestamp</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1560913370116</td>\n",
|
|
" <td>54774</td>\n",
|
|
" <td>69149</td>\n",
|
|
" <td>-90</td>\n",
|
|
" <td>1560913363914</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>1560913370116</td>\n",
|
|
" <td>18591</td>\n",
|
|
" <td>182228</td>\n",
|
|
" <td>-80</td>\n",
|
|
" <td>1560913363973</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>1560913370116</td>\n",
|
|
" <td>54874</td>\n",
|
|
" <td>109727</td>\n",
|
|
" <td>-87</td>\n",
|
|
" <td>1560913354584</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>1560913370116</td>\n",
|
|
" <td>36901</td>\n",
|
|
" <td>124351</td>\n",
|
|
" <td>-85</td>\n",
|
|
" <td>1560913351034</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>1560913370116</td>\n",
|
|
" <td>51444</td>\n",
|
|
" <td>114314</td>\n",
|
|
" <td>-84</td>\n",
|
|
" <td>1560913363543</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" timestamp ssid bssid rssi last_timestamp\n",
|
|
"0 1560913370116 54774 69149 -90 1560913363914\n",
|
|
"1 1560913370116 18591 182228 -80 1560913363973\n",
|
|
"2 1560913370116 54874 109727 -87 1560913354584\n",
|
|
"3 1560913370116 36901 124351 -85 1560913351034\n",
|
|
"4 1560913370116 51444 114314 -84 1560913363543"
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"encode_wifi_ssid_bssid(name, lbl_ssid, lbl_bssid)\n",
|
|
"name = name.replace(PATH,OUT)\n",
|
|
"df = pd.read_csv(name)\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T08:45:40.737958Z",
|
|
"iopub.status.busy": "2021-02-19T08:45:40.736888Z",
|
|
"iopub.status.idle": "2021-02-19T08:45:43.340615Z",
|
|
"shell.execute_reply": "2021-02-19T08:45:43.339718Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 3.060116,
|
|
"end_time": "2021-02-19T08:45:43.340782",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T08:45:40.280666",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"CPU times: user 284 ms, sys: 34 ms, total: 317 ms\n",
|
|
"Wall time: 2.6 s\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%%time\n",
|
|
"\n",
|
|
"lbl_ssid_f = client.scatter(lbl_ssid)\n",
|
|
"lbl_bssid_f = client.scatter(lbl_bssid)\n",
|
|
"_ = wait(lbl_ssid_f)\n",
|
|
"_ = wait(lbl_bssid_f)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2021-02-19T08:45:44.071507Z",
|
|
"iopub.status.busy": "2021-02-19T08:45:44.070692Z",
|
|
"iopub.status.idle": "2021-02-19T10:47:34.115015Z",
|
|
"shell.execute_reply": "2021-02-19T10:47:34.115693Z"
|
|
},
|
|
"papermill": {
|
|
"duration": 7310.41872,
|
|
"end_time": "2021-02-19T10:47:34.116072",
|
|
"exception": false,
|
|
"start_time": "2021-02-19T08:45:43.697352",
|
|
"status": "completed"
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"100%|██████████| 24732/24732 [00:17<00:00, 1415.46it/s]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"CPU times: user 54min 44s, sys: 2min 6s, total: 56min 50s\n",
|
|
"Wall time: 2h 1min 49s\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%%time\n",
|
|
"futures = [] # save the future since dask is lazy, otherwise nothing is executed.\n",
|
|
"\n",
|
|
"for fname in tqdm(test_files+used):\n",
|
|
" f = client.submit(encode_wifi_ssid_bssid, fname, lbl_ssid_f, lbl_bssid_f)\n",
|
|
" futures.append(f) \n",
|
|
"_ = wait(futures)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.7"
|
|
},
|
|
"papermill": {
|
|
"default_parameters": {},
|
|
"duration": 11729.018326,
|
|
"end_time": "2021-02-19T10:47:39.772795",
|
|
"environment_variables": {},
|
|
"exception": null,
|
|
"input_path": "__notebook__.ipynb",
|
|
"output_path": "__notebook__.ipynb",
|
|
"parameters": {},
|
|
"start_time": "2021-02-19T07:32:10.754469",
|
|
"version": "2.2.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|