From efa0d102e51d92f46bf02f77c1a264110ab72758 Mon Sep 17 00:00:00 2001 From: likileads <70283754+likileads@users.noreply.github.com> Date: Sat, 28 Aug 2021 22:02:05 +0530 Subject: [PATCH] added data preprocessing tool --- 1-Introduction/5-data-preprocessing-tool/README.md | 8 ++++++++ .../5-data-preprocessing-tool/data/retail-company.csv | 11 +++++++++++ .../data_preprocessing_tools.ipynb | 1 + 3 files changed, 20 insertions(+) create mode 100644 1-Introduction/5-data-preprocessing-tool/README.md create mode 100644 1-Introduction/5-data-preprocessing-tool/data/retail-company.csv create mode 100644 1-Introduction/5-data-preprocessing-tool/data_preprocessing_tools.ipynb diff --git a/1-Introduction/5-data-preprocessing-tool/README.md b/1-Introduction/5-data-preprocessing-tool/README.md new file mode 100644 index 00000000..5592bc00 --- /dev/null +++ b/1-Introduction/5-data-preprocessing-tool/README.md @@ -0,0 +1,8 @@ +## Dataset +The dataset comes from a retail company that recorded whether or not each of its customers purchased a product. +Each row belongs to a different customer and includes the customer's country, age, salary, and whether or not they purchased the product (Yes or No). + +x --> features (first three columns)
+y --> dependent variable vector (last column) + + diff --git a/1-Introduction/5-data-preprocessing-tool/data/retail-company.csv b/1-Introduction/5-data-preprocessing-tool/data/retail-company.csv new file mode 100644 index 00000000..564b65b2 --- /dev/null +++ b/1-Introduction/5-data-preprocessing-tool/data/retail-company.csv @@ -0,0 +1,11 @@ +Country,Age,Salary,Purchased +France,44,72000,No +Spain,27,48000,Yes +Germany,30,54000,No +Spain,38,61000,No +Germany,40,,Yes +France,35,58000,Yes +Spain,,52000,No +France,48,79000,Yes +Germany,50,83000,No +France,37,67000,Yes \ No newline at end of file diff --git a/1-Introduction/5-data-preprocessing-tool/data_preprocessing_tools.ipynb b/1-Introduction/5-data-preprocessing-tool/data_preprocessing_tools.ipynb new file mode 100644 index 00000000..b2c38b56 --- /dev/null +++ b/1-Introduction/5-data-preprocessing-tool/data_preprocessing_tools.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"data_preprocessing_tools.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true,"authorship_tag":"ABX9TyNxDRfLvKVBN9HjXcmlURF3"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"37puETfgRzzg","colab_type":"text"},"source":["# Data Preprocessing Tools"]},{"cell_type":"markdown","metadata":{"id":"EoRP98MpR-qj","colab_type":"text"},"source":["## Importing the libraries"]},{"cell_type":"code","metadata":{"id":"N-qiINBQSK2g","colab_type":"code","colab":{}},"source":["import numpy as np\n","import matplotlib.pyplot as plt\n","import pandas as pd"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"RopL7tUZSQkT","colab_type":"text"},"source":["## Importing the dataset"]},{"cell_type":"code","metadata":{"id":"WwEPNDWySTKm","colab_type":"code","colab":{}},"source":["# Load the CSV shipped with this notebook (path is relative to the notebook directory)\n","dataset = pd.read_csv('data/retail-company.csv')\n","X = dataset.iloc[:, :-1].values\n","y = dataset.iloc[:, 
-1].values"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"hCsz2yCebe1R","colab_type":"code","outputId":"1e4cc568-4e51-4b38-9d46-4aa3f15204be","executionInfo":{"status":"ok","timestamp":1587622253093,"user_tz":-240,"elapsed":895,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":188}},"source":["print(X)"],"execution_count":3,"outputs":[{"output_type":"stream","text":["[['France' 44.0 72000.0]\n"," ['Spain' 27.0 48000.0]\n"," ['Germany' 30.0 54000.0]\n"," ['Spain' 38.0 61000.0]\n"," ['Germany' 40.0 nan]\n"," ['France' 35.0 58000.0]\n"," ['Spain' nan 52000.0]\n"," ['France' 48.0 79000.0]\n"," ['Germany' 50.0 83000.0]\n"," ['France' 37.0 67000.0]]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"eYrOQ43XcJR3","colab_type":"code","outputId":"e0873b2a-3b08-4bab-ef0d-15b88858ca44","executionInfo":{"status":"ok","timestamp":1587622256072,"user_tz":-240,"elapsed":656,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["print(y)"],"execution_count":4,"outputs":[{"output_type":"stream","text":["['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"nhfKXNxlSabC","colab_type":"text"},"source":["## Taking care of missing data"]},{"cell_type":"code","metadata":{"id":"c93k7ipkSexq","colab_type":"code","colab":{}},"source":["from sklearn.impute import SimpleImputer\n","imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n","imputer.fit(X[:, 1:3])\n","X[:, 1:3] = imputer.transform(X[:, 
1:3])"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"3UgLdMS_bjq_","colab_type":"code","outputId":"254af4e0-681e-47f5-aaa7-b9c6f43258e9","executionInfo":{"status":"ok","timestamp":1587622284427,"user_tz":-240,"elapsed":919,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":188}},"source":["print(X)"],"execution_count":6,"outputs":[{"output_type":"stream","text":["[['France' 44.0 72000.0]\n"," ['Spain' 27.0 48000.0]\n"," ['Germany' 30.0 54000.0]\n"," ['Spain' 38.0 61000.0]\n"," ['Germany' 40.0 63777.77777777778]\n"," ['France' 35.0 58000.0]\n"," ['Spain' 38.77777777777778 52000.0]\n"," ['France' 48.0 79000.0]\n"," ['Germany' 50.0 83000.0]\n"," ['France' 37.0 67000.0]]\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"CriG6VzVSjcK","colab_type":"text"},"source":["## Encoding categorical data"]},{"cell_type":"markdown","metadata":{"id":"AhSpdQWeSsFh","colab_type":"text"},"source":["### Encoding the Independent Variable"]},{"cell_type":"code","metadata":{"id":"5hwuVddlSwVi","colab_type":"code","colab":{}},"source":["from sklearn.compose import ColumnTransformer\n","from sklearn.preprocessing import OneHotEncoder\n","ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')\n","X = np.array(ct.fit_transform(X))"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"f7QspewyeBfx","colab_type":"code","outputId":"5b35feef-7fe2-46ef-ce70-80495f94f4ed","executionInfo":{"status":"ok","timestamp":1587622291650,"user_tz":-240,"elapsed":570,"user":{"displayName":"Hadelin de 
Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":188}},"source":["print(X)"],"execution_count":8,"outputs":[{"output_type":"stream","text":["[[1.0 0.0 0.0 44.0 72000.0]\n"," [0.0 0.0 1.0 27.0 48000.0]\n"," [0.0 1.0 0.0 30.0 54000.0]\n"," [0.0 0.0 1.0 38.0 61000.0]\n"," [0.0 1.0 0.0 40.0 63777.77777777778]\n"," [1.0 0.0 0.0 35.0 58000.0]\n"," [0.0 0.0 1.0 38.77777777777778 52000.0]\n"," [1.0 0.0 0.0 48.0 79000.0]\n"," [0.0 1.0 0.0 50.0 83000.0]\n"," [1.0 0.0 0.0 37.0 67000.0]]\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"DXh8oVSITIc6","colab_type":"text"},"source":["### Encoding the Dependent Variable"]},{"cell_type":"code","metadata":{"id":"XgHCShVyTOYY","colab_type":"code","colab":{}},"source":["from sklearn.preprocessing import LabelEncoder\n","le = LabelEncoder()\n","y = le.fit_transform(y)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"FyhY8-gPpFCa","colab_type":"code","outputId":"7f76ef29-5423-4c3e-cf69-45fbc366a997","executionInfo":{"status":"ok","timestamp":1587622297024,"user_tz":-240,"elapsed":657,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["print(y)"],"execution_count":10,"outputs":[{"output_type":"stream","text":["[0 1 0 0 1 1 0 1 0 1]\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"qb_vcgm3qZKW","colab_type":"text"},"source":["## Splitting the dataset into the Training set and Test set"]},{"cell_type":"code","metadata":{"id":"pXgA6CzlqbCl","colab_type":"code","colab":{}},"source":["from sklearn.model_selection import train_test_split\n","X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 
1)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"GuwQhFdKrYTM","colab_type":"code","outputId":"de1e527f-c229-4daf-e7c5-ea9d2485148d","executionInfo":{"status":"ok","timestamp":1587622301522,"user_tz":-240,"elapsed":597,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":154}},"source":["print(X_train)"],"execution_count":12,"outputs":[{"output_type":"stream","text":["[[0.0 0.0 1.0 38.77777777777778 52000.0]\n"," [0.0 1.0 0.0 40.0 63777.77777777778]\n"," [1.0 0.0 0.0 44.0 72000.0]\n"," [0.0 0.0 1.0 38.0 61000.0]\n"," [0.0 0.0 1.0 27.0 48000.0]\n"," [1.0 0.0 0.0 48.0 79000.0]\n"," [0.0 1.0 0.0 50.0 83000.0]\n"," [1.0 0.0 0.0 35.0 58000.0]]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"TUrX_Tvcrbi4","colab_type":"code","outputId":"9a041a9b-2642-4828-fa2f-a431d7d77631","executionInfo":{"status":"ok","timestamp":1587622305066,"user_tz":-240,"elapsed":835,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":51}},"source":["print(X_test)"],"execution_count":13,"outputs":[{"output_type":"stream","text":["[[0.0 1.0 0.0 30.0 54000.0]\n"," [1.0 0.0 0.0 37.0 67000.0]]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"pSMHiIsWreQY","colab_type":"code","outputId":"5afe91e0-9244-4bf5-ec1b-e3e092b85c08","executionInfo":{"status":"ok","timestamp":1587622306938,"user_tz":-240,"elapsed":536,"user":{"displayName":"Hadelin de 
Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["print(y_train)"],"execution_count":14,"outputs":[{"output_type":"stream","text":["[0 1 0 0 1 1 0 1]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"I_tW7H56rgtW","colab_type":"code","outputId":"2a93f141-2a99-4a69-eec5-c82a3bb8d36b","executionInfo":{"status":"ok","timestamp":1587622309210,"user_tz":-240,"elapsed":828,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["print(y_test)"],"execution_count":15,"outputs":[{"output_type":"stream","text":["[0 1]\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"TpGqbS4TqkIR","colab_type":"text"},"source":["## Feature Scaling"]},{"cell_type":"code","metadata":{"id":"AxjSUXFQqo-3","colab_type":"code","colab":{}},"source":["from sklearn.preprocessing import StandardScaler\n","sc = StandardScaler()\n","X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])\n","X_test[:, 3:] = sc.transform(X_test[:, 3:])"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"DWPET8ZdlMnu","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":154},"outputId":"dea86927-5124-4e2a-e974-2804df9a913c","executionInfo":{"status":"ok","timestamp":1587622313752,"user_tz":-240,"elapsed":767,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}}},"source":["print(X_train)"],"execution_count":17,"outputs":[{"output_type":"stream","text":["[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]\n"," [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]\n"," [1.0 0.0 
0.0 0.566708506533324 0.633562432710455]\n"," [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]\n"," [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]\n"," [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]\n"," [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]\n"," [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"sTXykB_QlRjE","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"b68f0cfc-d07c-48cb-80d0-6800028c41f9","executionInfo":{"status":"ok","timestamp":1587622315942,"user_tz":-240,"elapsed":506,"user":{"displayName":"Hadelin de Ponteves","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64","userId":"15047218817161520419"}}},"source":["print(X_test)"],"execution_count":18,"outputs":[{"output_type":"stream","text":["[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]\n"," [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]\n"],"name":"stdout"}]}]} \ No newline at end of file