{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Linear Regression using Julia \n", "\n", "This notebook presents a Julia version of the solutions: a linear regression model fitted to the Boston housing dataset. It does not yet include the polynomial regression part, which is still in progress. " ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "# import Pkg\n", "# Pkg.add(\"Plots\")\n", "# Pkg.add(\"Lathe\")\n", "# Pkg.add(\"GLM\")\n", "# Pkg.add(\"StatsPlots\")\n", "# Pkg.add(\"MLBase\")\n", "# Pkg.add(\"Metrics\") \n", "# Pkg.add(\"DataFrames\")\n", "# Pkg.add(\"CSV\")\n", "# Pkg.add(\"Polynomials\")\n", "\n", "# Uncomment the lines above if you haven't installed these libraries yet " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# packages we will use \n", "using Plots\n", "using Lathe\n", "using GLM\n", "using Statistics\n", "using StatsPlots\n", "using MLBase \n", "using Metrics \n", "using DataFrames, CSV\n", "using Polynomials" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "

5 rows × 14 columns (omitted printing of 5 columns)

CRIMZNINDUSCHASNOXRMAGEDISRAD
Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.0063218.02.310.00.5386.57565.24.091.0
20.027310.07.070.00.4696.42178.94.96712.0
30.027290.07.070.00.4697.18561.14.96712.0
40.032370.02.180.00.4586.99845.86.06223.0
50.069050.02.180.00.4587.14754.26.06223.0
" ], "text/latex": [ "\\begin{tabular}{r|cccccccccc}\n", "\t& CRIM & ZN & INDUS & CHAS & NOX & RM & AGE & DIS & RAD & \\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & \\\\\n", "\t\\hline\n", "\t1 & 0.00632 & 18.0 & 2.31 & 0.0 & 0.538 & 6.575 & 65.2 & 4.09 & 1.0 & $\\dots$ \\\\\n", "\t2 & 0.02731 & 0.0 & 7.07 & 0.0 & 0.469 & 6.421 & 78.9 & 4.9671 & 2.0 & $\\dots$ \\\\\n", "\t3 & 0.02729 & 0.0 & 7.07 & 0.0 & 0.469 & 7.185 & 61.1 & 4.9671 & 2.0 & $\\dots$ \\\\\n", "\t4 & 0.03237 & 0.0 & 2.18 & 0.0 & 0.458 & 6.998 & 45.8 & 6.0622 & 3.0 & $\\dots$ \\\\\n", "\t5 & 0.06905 & 0.0 & 2.18 & 0.0 & 0.458 & 7.147 & 54.2 & 6.0622 & 3.0 & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m5×14 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m CRIM \u001b[0m\u001b[1m ZN \u001b[0m\u001b[1m INDUS \u001b[0m\u001b[1m CHAS \u001b[0m\u001b[1m NOX \u001b[0m\u001b[1m RM \u001b[0m\u001b[1m AGE \u001b[0m\u001b[1m DIS \u001b[0m\u001b[1m\u001b[0m ⋯\n", "\u001b[1m \u001b[0m│\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m\u001b[0m ⋯\n", "─────┼──────────────────────────────────────────────────────────────────────────\n", " 1 │ 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.09 ⋯\n", " 2 │ 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671\n", " 3 │ 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671\n", " 4 │ 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622\n", " 5 │ 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 ⋯\n", "\u001b[36m 6 columns omitted\u001b[0m" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the Boston housing data into a DataFrame and preview the first rows.\n", "# joinpath builds the path with the separator of the current OS (the hard-coded backslash only worked on Windows).\n", "\n", "df = CSV.read(joinpath(\"data\", \"out.csv\"), DataFrame)\n", "first(df, 5)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "(506, 14)\n" ] } ], "source": [ "# Show the dataset dimensions as a (rows, columns) tuple\n", "\n", "size(df) |> println" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

14 rows × 7 columns

variablemeanminmedianmaxnmissingeltype
SymbolFloat64Float64Float64Float64Int64DataType
1CRIM3.613520.006320.2565188.97620Float64
2ZN11.36360.00.0100.00Float64
3INDUS11.13680.469.6927.740Float64
4CHAS0.069170.00.01.00Float64
5NOX0.5546950.3850.5380.8710Float64
6RM6.284633.5616.20858.780Float64
7AGE68.57492.977.5100.00Float64
8DIS3.795041.12963.2074512.12650Float64
9RAD9.549411.05.024.00Float64
10TAX408.237187.0330.0711.00Float64
11PTRATIO18.455512.619.0522.00Float64
12B356.6740.32391.44396.90Float64
13LSTAT12.65311.7311.3637.970Float64
14SalePrice22.53285.021.250.00Float64
" ], "text/latex": [ "\\begin{tabular}{r|ccccccc}\n", "\t& variable & mean & min & median & max & nmissing & eltype\\\\\n", "\t\\hline\n", "\t& Symbol & Float64 & Float64 & Float64 & Float64 & Int64 & DataType\\\\\n", "\t\\hline\n", "\t1 & CRIM & 3.61352 & 0.00632 & 0.25651 & 88.9762 & 0 & Float64 \\\\\n", "\t2 & ZN & 11.3636 & 0.0 & 0.0 & 100.0 & 0 & Float64 \\\\\n", "\t3 & INDUS & 11.1368 & 0.46 & 9.69 & 27.74 & 0 & Float64 \\\\\n", "\t4 & CHAS & 0.06917 & 0.0 & 0.0 & 1.0 & 0 & Float64 \\\\\n", "\t5 & NOX & 0.554695 & 0.385 & 0.538 & 0.871 & 0 & Float64 \\\\\n", "\t6 & RM & 6.28463 & 3.561 & 6.2085 & 8.78 & 0 & Float64 \\\\\n", "\t7 & AGE & 68.5749 & 2.9 & 77.5 & 100.0 & 0 & Float64 \\\\\n", "\t8 & DIS & 3.79504 & 1.1296 & 3.20745 & 12.1265 & 0 & Float64 \\\\\n", "\t9 & RAD & 9.54941 & 1.0 & 5.0 & 24.0 & 0 & Float64 \\\\\n", "\t10 & TAX & 408.237 & 187.0 & 330.0 & 711.0 & 0 & Float64 \\\\\n", "\t11 & PTRATIO & 18.4555 & 12.6 & 19.05 & 22.0 & 0 & Float64 \\\\\n", "\t12 & B & 356.674 & 0.32 & 391.44 & 396.9 & 0 & Float64 \\\\\n", "\t13 & LSTAT & 12.6531 & 1.73 & 11.36 & 37.97 & 0 & Float64 \\\\\n", "\t14 & SalePrice & 22.5328 & 5.0 & 21.2 & 50.0 & 0 & Float64 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m14×7 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m variable \u001b[0m\u001b[1m mean \u001b[0m\u001b[1m min \u001b[0m\u001b[1m median \u001b[0m\u001b[1m max \u001b[0m\u001b[1m nmissing \u001b[0m\u001b[1m eltype\u001b[0m ⋯\n", "\u001b[1m \u001b[0m│\u001b[90m Symbol \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m DataTy\u001b[0m ⋯\n", "─────┼──────────────────────────────────────────────────────────────────────────\n", " 1 │ CRIM 3.61352 0.00632 0.25651 88.9762 0 Float6 ⋯\n", " 2 │ ZN 11.3636 0.0 0.0 100.0 0 Float6\n", " 3 │ INDUS 11.1368 0.46 9.69 27.74 0 Float6\n", " 4 │ CHAS 0.06917 0.0 0.0 1.0 0 Float6\n", " 5 │ NOX 0.554695 0.385 
0.538 0.871 0 Float6 ⋯\n", " 6 │ RM 6.28463 3.561 6.2085 8.78 0 Float6\n", " 7 │ AGE 68.5749 2.9 77.5 100.0 0 Float6\n", " 8 │ DIS 3.79504 1.1296 3.20745 12.1265 0 Float6\n", " 9 │ RAD 9.54941 1.0 5.0 24.0 0 Float6 ⋯\n", " 10 │ TAX 408.237 187.0 330.0 711.0 0 Float6\n", " 11 │ PTRATIO 18.4555 12.6 19.05 22.0 0 Float6\n", " 12 │ B 356.674 0.32 391.44 396.9 0 Float6\n", " 13 │ LSTAT 12.6531 1.73 11.36 37.97 0 Float6 ⋯\n", " 14 │ SalePrice 22.5328 5.0 21.2 50.0 0 Float6\n", "\u001b[36m 1 column omitted\u001b[0m" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# describing the data \n", "\n", "describe(df)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "14-element Vector{String}:\n", " \"CRIM\"\n", " \"ZN\"\n", " \"INDUS\"\n", " \"CHAS\"\n", " \"NOX\"\n", " \"RM\"\n", " \"AGE\"\n", " \"DIS\"\n", " \"RAD\"\n", " \"TAX\"\n", " \"PTRATIO\"\n", " \"B\"\n", " \"LSTAT\"\n", " \"SalePrice\"" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# printing the names of the dataframe \n", "\n", "names(df)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Box plot of our target variable; the y-axis shows SalePrice values\n", "\n", "boxplot(df.SalePrice, title = \"Box Plot - SalePrice\", ylabel = \"SalePrice\", legend = false)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Scatter plots of single features against the target.\n", "# scatter(x, y): the feature is passed first (x-axis) and SalePrice second (y-axis), so the labels must follow that order.\n", "\n", "train_plot = scatter(df.CRIM, df.SalePrice, title = \"Scatter Plot CRIM vs SalePrice\", xlabel = \"CRIM\", ylabel = \"SalePrice\", legend = false)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_plot = scatter(df.INDUS, df.SalePrice, title = \"Scatter Plot INDUS vs SalePrice\", xlabel = \"INDUS\", ylabel = \"SalePrice\", legend = false)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_plot = scatter(df.TAX, df.SalePrice, title = \"Scatter Plot TAX vs SalePrice\", xlabel = \"TAX\", ylabel = \"SalePrice\", legend = false)" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

5 rows × 14 columns (omitted printing of 5 columns)

CRIMZNINDUSCHASNOXRMAGEDISRAD
Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.027310.07.070.00.4696.42178.94.96712.0
20.027290.07.070.00.4697.18561.14.96712.0
30.032370.02.180.00.4586.99845.86.06223.0
40.069050.02.180.00.4587.14754.26.06223.0
50.0882912.57.870.00.5246.01266.65.56055.0
" ], "text/latex": [ "\\begin{tabular}{r|cccccccccc}\n", "\t& CRIM & ZN & INDUS & CHAS & NOX & RM & AGE & DIS & RAD & \\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & \\\\\n", "\t\\hline\n", "\t1 & 0.02731 & 0.0 & 7.07 & 0.0 & 0.469 & 6.421 & 78.9 & 4.9671 & 2.0 & $\\dots$ \\\\\n", "\t2 & 0.02729 & 0.0 & 7.07 & 0.0 & 0.469 & 7.185 & 61.1 & 4.9671 & 2.0 & $\\dots$ \\\\\n", "\t3 & 0.03237 & 0.0 & 2.18 & 0.0 & 0.458 & 6.998 & 45.8 & 6.0622 & 3.0 & $\\dots$ \\\\\n", "\t4 & 0.06905 & 0.0 & 2.18 & 0.0 & 0.458 & 7.147 & 54.2 & 6.0622 & 3.0 & $\\dots$ \\\\\n", "\t5 & 0.08829 & 12.5 & 7.87 & 0.0 & 0.524 & 6.012 & 66.6 & 5.5605 & 5.0 & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m5×14 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m CRIM \u001b[0m\u001b[1m ZN \u001b[0m\u001b[1m INDUS \u001b[0m\u001b[1m CHAS \u001b[0m\u001b[1m NOX \u001b[0m\u001b[1m RM \u001b[0m\u001b[1m AGE \u001b[0m\u001b[1m DIS \u001b[0m\u001b[1m\u001b[0m ⋯\n", "\u001b[1m \u001b[0m│\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m\u001b[0m ⋯\n", "─────┼──────────────────────────────────────────────────────────────────────────\n", " 1 │ 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 ⋯\n", " 2 │ 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671\n", " 3 │ 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622\n", " 4 │ 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622\n", " 5 │ 0.08829 12.5 7.87 0.0 0.524 6.012 66.6 5.5605 ⋯\n", "\u001b[36m 6 columns omitted\u001b[0m" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Train/test split: divide the data into roughly 75% training rows and 25% testing rows.\n", "# The split is randomized (the saved run produced 388/118 rows rather than an exact 75/25 cut),\n", "# so seed the RNG first to keep the split, the fitted model and the metrics reproducible.\n", "# NOTE(review): assumes TrainTestSplit draws from Julia's global RNG - confirm against Lathe.\n", "\n", "using Random\n", "using Lathe.preprocess: TrainTestSplit\n", "\n", "Random.seed!(42)\n", "train, test = TrainTestSplit(df, 0.75)\n", "first(train, 5)" ] }, { "cell_type": "code", 
"execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(388, 14)\n" ] } ], "source": [ "# Dimensions of the training set as a (rows, columns) tuple\n", "\n", "size(train) |> println" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(118, 14)\n" ] } ], "source": [ "# Dimensions of the testing set as a (rows, columns) tuple\n", "\n", "size(test) |> println" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}\n", "\n", "SalePrice ~ 1 + CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT\n", "\n", "Coefficients:\n", "────────────────────────────────────────────────────────────────────────────────\n", " Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%\n", "────────────────────────────────────────────────────────────────────────────────\n", "(Intercept) 32.2344 6.18734 5.21 <1e-06 20.068 44.4007\n", "CRIM -0.096813 0.0353805 -2.74 0.0065 -0.166383 -0.0272434\n", "ZN 0.0559225 0.0155536 3.60 0.0004 0.0253389 0.086506\n", "INDUS 0.037159 0.0711401 0.52 0.6017 -0.102726 0.177044\n", "CHAS 2.63139 1.02384 2.57 0.0106 0.618184 4.6446\n", "NOX -12.7104 4.50354 -2.82 0.0050 -21.5658 -3.85495\n", "RM 3.86041 0.495463 7.79 <1e-13 2.88616 4.83465\n", "AGE -0.0029226 0.0157267 -0.19 0.8527 -0.0338464 0.0280012\n", "DIS -1.41053 0.235675 -5.99 <1e-08 -1.87395 -0.947119\n", "RAD 0.285815 0.075898 3.77 0.0002 0.136575 0.435056\n", "TAX -0.0119566 0.00427923 -2.79 0.0055 -0.0203709 -0.00354219\n", "PTRATIO -0.919455 0.154836 -5.94 <1e-08 -1.22391 -0.614998\n", "B 0.0114862 0.00308641 3.72 0.0002 0.00541735 0.0175551\n", "LSTAT -0.580496 0.0639863 -9.07 <1e-17 -0.706314 -0.454678\n", 
"────────────────────────────────────────────────────────────────────────────────" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit a linear regression model on the training set: all 13 features predict the target SalePrice\n", "\n", "fm = @formula(SalePrice ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT)\n", "linearregressor = lm(fm, train)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "118-element Vector{Union{Missing, Float64}}:\n", " 30.257569399329533\n", " 25.102897520234006\n", " 10.213362687396213\n", " 18.30609647011218\n", " 15.918702530891178\n", " 18.285305920340207\n", " 11.723530559085212\n", " 17.367774990939857\n", " 15.245581037686078\n", " 19.327415627467538\n", " 7.4161933158744695\n", " 23.451131696858674\n", " 22.92595266546901\n", " ⋮\n", " 18.773461129422785\n", " 22.759388697901052\n", " 19.0436268251718\n", " 20.536792254149326\n", " 19.82291469224453\n", " 25.6867877444079\n", " 15.067772405729784\n", " 19.393643735031073\n", " 22.371533915466763\n", " 19.451521650198\n", " 2.897763250285564\n", " 21.227743510092843" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Predict SalePrice for the held-out test rows\n", "\n", "y_predicted = predict(linearregressor, test)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "R2 Score0.7483442644093596\n", "MAE 3.269229661186728\n", "RMSE:- 4.548533879113553\n" ] } ], "source": [ "# Evaluate the model: in-sample R2 alongside held-out R2, MAE and RMSE\n", "\n", "y_true = test.SalePrice\n", "\n", "# r2(linearregressor) describes the fit on the training data the model was built from, so label it as such\n", "println(\"R2 Score (train): \", r2(linearregressor))\n", "\n", "# Held-out R2 = 1 - SS_res / SS_tot, computed on the test rows so it is comparable with MAE/RMSE below\n", "ss_res = sum((y_true .- y_predicted) .^ 2)\n", "ss_tot = sum((y_true .- Statistics.mean(y_true)) .^ 2)\n", "println(\"R2 Score (test): \", 1 - ss_res / ss_tot)\n", "\n", "println(\"MAE \", Metrics.mae(y_predicted, y_true))\n", "MSE = Metrics.mse(y_predicted, y_true)\n", "println(\"RMSE:- \", sqrt(MSE))\n", "\n", "\n", "# The model performs poorly here because linear regression does not work well on non-linear data;\n", "# it also relies on several other assumptions:\n", "# Linear relationship.\n", "# Multivariate normality.\n", "# No or little multicollinearity.\n", "# No auto-correlation.\n", "# Homoscedasticity\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.6.2", "language": "julia", "name": "julia-1.6" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.6.2" } }, "nbformat": 4, "nbformat_minor": 4 }