From 3d767613cec16a0148f9ee21be354081853a3b26 Mon Sep 17 00:00:00 2001 From: Ayush Singh <81796368+ayush714@users.noreply.github.com> Date: Sun, 22 Aug 2021 20:13:45 +0530 Subject: [PATCH] Notebook_Julia_Editing_Linear_Regression.ipynb Notebook_Julia_Editing_Linear_Regression.ipynb --- ...book_Julia_Editing_Linear_Regression.ipynb | 2500 +++++++++++++++++ 1 file changed, 2500 insertions(+) create mode 100644 2-Regression/3-Linear/solution/Notebook_Julia_Editing_Linear_Regression.ipynb diff --git a/2-Regression/3-Linear/solution/Notebook_Julia_Editing_Linear_Regression.ipynb b/2-Regression/3-Linear/solution/Notebook_Julia_Editing_Linear_Regression.ipynb new file mode 100644 index 00000000..36473d38 --- /dev/null +++ b/2-Regression/3-Linear/solution/Notebook_Julia_Editing_Linear_Regression.ipynb @@ -0,0 +1,2500 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Linear Regression using Julia \n", + "\n", + "In this notebook, I present a Julia version of the solution. I use linear regression on the Boston housing dataset. This notebook does not contain the polynomial regression part, because it is still in progress. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "# import Pkg\n", + "# Pkg.add(\"Plots\")\n", + "# Pkg.add(\"Lathe\")\n", + "# Pkg.add(\"GLM\")\n", + "# Pkg.add(\"StatsPlots\")\n", + "# Pkg.add(\"MLBase\")\n", + "# Pkg.add(\"Metrics\") \n", + "\n", + "# Uncomment this if you haven't installed the above libraries " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# packages we will use \n", + "using Plots\n", + "using Lathe\n", + "using GLM\n", + "using Statistics\n", + "using StatsPlots\n", + "using MLBase \n", + "using Metrics \n", + "using DataFrames, CSV\n", + "using Polynomials" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "

5 rows × 14 columns (omitted printing of 5 columns)

CRIMZNINDUSCHASNOXRMAGEDISRAD
Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.0063218.02.310.00.5386.57565.24.091.0
20.027310.07.070.00.4696.42178.94.96712.0
30.027290.07.070.00.4697.18561.14.96712.0
40.032370.02.180.00.4586.99845.86.06223.0
50.069050.02.180.00.4587.14754.26.06223.0
" + ], + "text/latex": [ + "\\begin{tabular}{r|cccccccccc}\n", + "\t& CRIM & ZN & INDUS & CHAS & NOX & RM & AGE & DIS & RAD & \\\\\n", + "\t\\hline\n", + "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & \\\\\n", + "\t\\hline\n", + "\t1 & 0.00632 & 18.0 & 2.31 & 0.0 & 0.538 & 6.575 & 65.2 & 4.09 & 1.0 & $\\dots$ \\\\\n", + "\t2 & 0.02731 & 0.0 & 7.07 & 0.0 & 0.469 & 6.421 & 78.9 & 4.9671 & 2.0 & $\\dots$ \\\\\n", + "\t3 & 0.02729 & 0.0 & 7.07 & 0.0 & 0.469 & 7.185 & 61.1 & 4.9671 & 2.0 & $\\dots$ \\\\\n", + "\t4 & 0.03237 & 0.0 & 2.18 & 0.0 & 0.458 & 6.998 & 45.8 & 6.0622 & 3.0 & $\\dots$ \\\\\n", + "\t5 & 0.06905 & 0.0 & 2.18 & 0.0 & 0.458 & 7.147 & 54.2 & 6.0622 & 3.0 & $\\dots$ \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "\u001b[1m5×14 DataFrame\u001b[0m\n", + "\u001b[1m Row \u001b[0m│\u001b[1m CRIM \u001b[0m\u001b[1m ZN \u001b[0m\u001b[1m INDUS \u001b[0m\u001b[1m CHAS \u001b[0m\u001b[1m NOX \u001b[0m\u001b[1m RM \u001b[0m\u001b[1m AGE \u001b[0m\u001b[1m DIS \u001b[0m\u001b[1m\u001b[0m ⋯\n", + "\u001b[1m \u001b[0m│\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m\u001b[0m ⋯\n", + "─────┼──────────────────────────────────────────────────────────────────────────\n", + " 1 │ 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.09 ⋯\n", + " 2 │ 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671\n", + " 3 │ 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671\n", + " 4 │ 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622\n", + " 5 │ 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 ⋯\n", + "\u001b[36m 6 columns omitted\u001b[0m" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# loading the data; joinpath keeps the path portable across Windows, Linux and macOS \n", + "\n", + "df = CSV.read(joinpath(\"data\", \"out.csv\"), DataFrame)\n", + "first(df,5)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(506, 14)\n" + ] + } + ], + "source": [ + "# printing the size of the data \n", + "\n", + "println(size(df))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

14 rows × 7 columns

variablemeanminmedianmaxnmissingeltype
SymbolFloat64Float64Float64Float64Int64DataType
1CRIM3.613520.006320.2565188.97620Float64
2ZN11.36360.00.0100.00Float64
3INDUS11.13680.469.6927.740Float64
4CHAS0.069170.00.01.00Float64
5NOX0.5546950.3850.5380.8710Float64
6RM6.284633.5616.20858.780Float64
7AGE68.57492.977.5100.00Float64
8DIS3.795041.12963.2074512.12650Float64
9RAD9.549411.05.024.00Float64
10TAX408.237187.0330.0711.00Float64
11PTRATIO18.455512.619.0522.00Float64
12B356.6740.32391.44396.90Float64
13LSTAT12.65311.7311.3637.970Float64
14SalePrice22.53285.021.250.00Float64
" + ], + "text/latex": [ + "\\begin{tabular}{r|ccccccc}\n", + "\t& variable & mean & min & median & max & nmissing & eltype\\\\\n", + "\t\\hline\n", + "\t& Symbol & Float64 & Float64 & Float64 & Float64 & Int64 & DataType\\\\\n", + "\t\\hline\n", + "\t1 & CRIM & 3.61352 & 0.00632 & 0.25651 & 88.9762 & 0 & Float64 \\\\\n", + "\t2 & ZN & 11.3636 & 0.0 & 0.0 & 100.0 & 0 & Float64 \\\\\n", + "\t3 & INDUS & 11.1368 & 0.46 & 9.69 & 27.74 & 0 & Float64 \\\\\n", + "\t4 & CHAS & 0.06917 & 0.0 & 0.0 & 1.0 & 0 & Float64 \\\\\n", + "\t5 & NOX & 0.554695 & 0.385 & 0.538 & 0.871 & 0 & Float64 \\\\\n", + "\t6 & RM & 6.28463 & 3.561 & 6.2085 & 8.78 & 0 & Float64 \\\\\n", + "\t7 & AGE & 68.5749 & 2.9 & 77.5 & 100.0 & 0 & Float64 \\\\\n", + "\t8 & DIS & 3.79504 & 1.1296 & 3.20745 & 12.1265 & 0 & Float64 \\\\\n", + "\t9 & RAD & 9.54941 & 1.0 & 5.0 & 24.0 & 0 & Float64 \\\\\n", + "\t10 & TAX & 408.237 & 187.0 & 330.0 & 711.0 & 0 & Float64 \\\\\n", + "\t11 & PTRATIO & 18.4555 & 12.6 & 19.05 & 22.0 & 0 & Float64 \\\\\n", + "\t12 & B & 356.674 & 0.32 & 391.44 & 396.9 & 0 & Float64 \\\\\n", + "\t13 & LSTAT & 12.6531 & 1.73 & 11.36 & 37.97 & 0 & Float64 \\\\\n", + "\t14 & SalePrice & 22.5328 & 5.0 & 21.2 & 50.0 & 0 & Float64 \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "\u001b[1m14×7 DataFrame\u001b[0m\n", + "\u001b[1m Row \u001b[0m│\u001b[1m variable \u001b[0m\u001b[1m mean \u001b[0m\u001b[1m min \u001b[0m\u001b[1m median \u001b[0m\u001b[1m max \u001b[0m\u001b[1m nmissing \u001b[0m\u001b[1m eltype\u001b[0m ⋯\n", + "\u001b[1m \u001b[0m│\u001b[90m Symbol \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m DataTy\u001b[0m ⋯\n", + "─────┼──────────────────────────────────────────────────────────────────────────\n", + " 1 │ CRIM 3.61352 0.00632 0.25651 88.9762 0 Float6 ⋯\n", + " 2 │ ZN 11.3636 0.0 0.0 100.0 0 Float6\n", + " 3 │ INDUS 11.1368 0.46 9.69 27.74 0 Float6\n", + " 4 │ 
CHAS 0.06917 0.0 0.0 1.0 0 Float6\n", + " 5 │ NOX 0.554695 0.385 0.538 0.871 0 Float6 ⋯\n", + " 6 │ RM 6.28463 3.561 6.2085 8.78 0 Float6\n", + " 7 │ AGE 68.5749 2.9 77.5 100.0 0 Float6\n", + " 8 │ DIS 3.79504 1.1296 3.20745 12.1265 0 Float6\n", + " 9 │ RAD 9.54941 1.0 5.0 24.0 0 Float6 ⋯\n", + " 10 │ TAX 408.237 187.0 330.0 711.0 0 Float6\n", + " 11 │ PTRATIO 18.4555 12.6 19.05 22.0 0 Float6\n", + " 12 │ B 356.674 0.32 391.44 396.9 0 Float6\n", + " 13 │ LSTAT 12.6531 1.73 11.36 37.97 0 Float6 ⋯\n", + " 14 │ SalePrice 22.5328 5.0 21.2 50.0 0 Float6\n", + "\u001b[36m 1 column omitted\u001b[0m" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# describing the data \n", + "\n", + "describe(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14-element Vector{String}:\n", + " \"CRIM\"\n", + " \"ZN\"\n", + " \"INDUS\"\n", + " \"CHAS\"\n", + " \"NOX\"\n", + " \"RM\"\n", + " \"AGE\"\n", + " \"DIS\"\n", + " \"RAD\"\n", + " \"TAX\"\n", + " \"PTRATIO\"\n", + " \"B\"\n", + " \"LSTAT\"\n", + " \"SalePrice\"" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# printing the names of the dataframe \n", + "\n", + "names(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + 
"\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# boxplot for out target variable \n", + "\n", + "boxplot(df.SalePrice, title = \"Box Plot - SalePrice\", ylabel = \"CRIM\", legend = false)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", 
+ "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", 
+ "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# correlation plot \n", + "\n", + "train_plot = scatter(df.CRIM,df.SalePrice, title = \"Scatter Plot CRIM vs SalePrice\", ylabel = \"CRIM\", xlabel = \"SalePrice\",legend = false)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", 
+ "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", 
+ "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_plot = scatter(df.INDUS,df.SalePrice, 
title = \"Scatter Plot INDUS vs SalePrice\", ylabel = \"INDUS\", xlabel = \"SalePrice\",legend = false)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + 
"\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + 
"\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_plot = scatter(df.TAX,df.SalePrice, title = \"Scatter Plot TAX vs SalePrice\", ylabel = \"TAX\", xlabel = \"SalePrice\",legend = false)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

5 rows × 14 columns (omitted printing of 5 columns)

CRIMZNINDUSCHASNOXRMAGEDISRAD
Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.027310.07.070.00.4696.42178.94.96712.0
20.027290.07.070.00.4697.18561.14.96712.0
30.032370.02.180.00.4586.99845.86.06223.0
40.069050.02.180.00.4587.14754.26.06223.0
50.0882912.57.870.00.5246.01266.65.56055.0
" + ], + "text/latex": [ + "\\begin{tabular}{r|cccccccccc}\n", + "\t& CRIM & ZN & INDUS & CHAS & NOX & RM & AGE & DIS & RAD & \\\\\n", + "\t\\hline\n", + "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & \\\\\n", + "\t\\hline\n", + "\t1 & 0.02731 & 0.0 & 7.07 & 0.0 & 0.469 & 6.421 & 78.9 & 4.9671 & 2.0 & $\\dots$ \\\\\n", + "\t2 & 0.02729 & 0.0 & 7.07 & 0.0 & 0.469 & 7.185 & 61.1 & 4.9671 & 2.0 & $\\dots$ \\\\\n", + "\t3 & 0.03237 & 0.0 & 2.18 & 0.0 & 0.458 & 6.998 & 45.8 & 6.0622 & 3.0 & $\\dots$ \\\\\n", + "\t4 & 0.06905 & 0.0 & 2.18 & 0.0 & 0.458 & 7.147 & 54.2 & 6.0622 & 3.0 & $\\dots$ \\\\\n", + "\t5 & 0.08829 & 12.5 & 7.87 & 0.0 & 0.524 & 6.012 & 66.6 & 5.5605 & 5.0 & $\\dots$ \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "\u001b[1m5×14 DataFrame\u001b[0m\n", + "\u001b[1m Row \u001b[0m│\u001b[1m CRIM \u001b[0m\u001b[1m ZN \u001b[0m\u001b[1m INDUS \u001b[0m\u001b[1m CHAS \u001b[0m\u001b[1m NOX \u001b[0m\u001b[1m RM \u001b[0m\u001b[1m AGE \u001b[0m\u001b[1m DIS \u001b[0m\u001b[1m\u001b[0m ⋯\n", + "\u001b[1m \u001b[0m│\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m\u001b[0m ⋯\n", + "─────┼──────────────────────────────────────────────────────────────────────────\n", + " 1 │ 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 ⋯\n", + " 2 │ 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671\n", + " 3 │ 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622\n", + " 4 │ 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622\n", + " 5 │ 0.08829 12.5 7.87 0.0 0.524 6.012 66.6 5.5605 ⋯\n", + "\u001b[36m 6 columns omitted\u001b[0m" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train/test split for dividing our data (the global RNG is seeded first so the random split is repeatable) \n", + "using Random; Random.seed!(42)\n", + "using Lathe.preprocess: TrainTestSplit\n", + "train, test = 
TrainTestSplit(df,.75) \n", + "first(train, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(388, 14)\n" + ] + } + ], + "source": [ + "# taking a look at the size of training data \n", + "\n", + "println(size(train))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(118, 14)\n" + ] + } + ], + "source": [ + "# taking a look at the size of testing data \n", + "\n", + "println(size(test))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}\n", + "\n", + "SalePrice ~ 1 + CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT\n", + "\n", + "Coefficients:\n", + "────────────────────────────────────────────────────────────────────────────────\n", + " Coef. Std. 
Error t Pr(>|t|) Lower 95% Upper 95%\n", + "────────────────────────────────────────────────────────────────────────────────\n", + "(Intercept) 32.2344 6.18734 5.21 <1e-06 20.068 44.4007\n", + "CRIM -0.096813 0.0353805 -2.74 0.0065 -0.166383 -0.0272434\n", + "ZN 0.0559225 0.0155536 3.60 0.0004 0.0253389 0.086506\n", + "INDUS 0.037159 0.0711401 0.52 0.6017 -0.102726 0.177044\n", + "CHAS 2.63139 1.02384 2.57 0.0106 0.618184 4.6446\n", + "NOX -12.7104 4.50354 -2.82 0.0050 -21.5658 -3.85495\n", + "RM 3.86041 0.495463 7.79 <1e-13 2.88616 4.83465\n", + "AGE -0.0029226 0.0157267 -0.19 0.8527 -0.0338464 0.0280012\n", + "DIS -1.41053 0.235675 -5.99 <1e-08 -1.87395 -0.947119\n", + "RAD 0.285815 0.075898 3.77 0.0002 0.136575 0.435056\n", + "TAX -0.0119566 0.00427923 -2.79 0.0055 -0.0203709 -0.00354219\n", + "PTRATIO -0.919455 0.154836 -5.94 <1e-08 -1.22391 -0.614998\n", + "B 0.0114862 0.00308641 3.72 0.0002 0.00541735 0.0175551\n", + "LSTAT -0.580496 0.0639863 -9.07 <1e-17 -0.706314 -0.454678\n", + "────────────────────────────────────────────────────────────────────────────────" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# building linear regression model with all the features and one target variable \n", + "\n", + "fm = @formula(SalePrice ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT) \n", + "linearregressor = lm(fm, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "118-element Vector{Union{Missing, Float64}}:\n", + " 30.257569399329533\n", + " 25.102897520234006\n", + " 10.213362687396213\n", + " 18.30609647011218\n", + " 15.918702530891178\n", + " 18.285305920340207\n", + " 11.723530559085212\n", + " 17.367774990939857\n", + " 15.245581037686078\n", + " 19.327415627467538\n", + " 7.4161933158744695\n", + " 23.451131696858674\n", + " 22.92595266546901\n", + " 
⋮\n", + " 18.773461129422785\n", + " 22.759388697901052\n", + " 19.0436268251718\n", + " 20.536792254149326\n", + " 19.82291469224453\n", + " 25.6867877444079\n", + " 15.067772405729784\n", + " 19.393643735031073\n", + " 22.371533915466763\n", + " 19.451521650198\n", + " 2.897763250285564\n", + " 21.227743510092843" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# making predictions \n", + "\n", + "y_predicted = predict(linearregressor, test) " + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R2 Score0.7483442644093596\n", + "MAE 3.269229661186728\n", + "RMSE:- 4.548533879113553\n" + ] + } + ], + "source": [ + "# Evaluating our model with R2, MAE and RMSE \n", + "\n", + "println(\"R2 Score (train): \", r2(linearregressor)) \n", + "println(\"MAE (test): \", Metrics.mae(y_predicted, test.SalePrice)) \n", + "MSE = Metrics.mse(y_predicted, test.SalePrice) \n", + "println(\"RMSE (test): \", sqrt(MSE)) \n", + "# r2(linearregressor) above is the R2 of the fit on the training data; the line below computes R2 on the held-out test set as 1 - SS_res / SS_tot \n", + "println(\"R2 Score (test): \", 1 - sum(abs2, test.SalePrice .- y_predicted) / sum(abs2, test.SalePrice .- Statistics.mean(test.SalePrice))) \n", + "# the fit is only moderate: linear regression does not work well on non-linear data, and it also relies on the following assumptions: \n", + "# Linear relationship.\n", + "# Multivariate normality.\n", + "# No or little multicollinearity.\n", + "# No auto-correlation.\n", + "# Homoscedasticity\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 1.6.2", + "language": "julia", + "name": "julia-1.6" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}