diff --git a/4-Classification/2-Classifiers-1/notebook.ipynb b/4-Classification/2-Classifiers-1/notebook.ipynb
index 30778dee..0be30b73 100644
--- a/4-Classification/2-Classifiers-1/notebook.ipynb
+++ b/4-Classification/2-Classifiers-1/notebook.ipynb
@@ -1,5 +1,811 @@
{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Build Classification Models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " cuisine | \n",
+ " almond | \n",
+ " angelica | \n",
+ " anise | \n",
+ " anise_seed | \n",
+ " apple | \n",
+ " apple_brandy | \n",
+ " apricot | \n",
+ " armagnac | \n",
+ " ... | \n",
+ " whiskey | \n",
+ " white_bread | \n",
+ " white_wine | \n",
+ " whole_grain_wheat_flour | \n",
+ " wine | \n",
+ " wood | \n",
+ " yam | \n",
+ " yeast | \n",
+ " yogurt | \n",
+ " zucchini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " indian | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " indian | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " indian | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " indian | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " indian | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 382 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 cuisine almond angelica anise anise_seed apple \\\n",
+ "0 0 indian 0 0 0 0 0 \n",
+ "1 1 indian 1 0 0 0 0 \n",
+ "2 2 indian 0 0 0 0 0 \n",
+ "3 3 indian 0 0 0 0 0 \n",
+ "4 4 indian 0 0 0 0 0 \n",
+ "\n",
+ " apple_brandy apricot armagnac ... whiskey white_bread white_wine \\\n",
+ "0 0 0 0 ... 0 0 0 \n",
+ "1 0 0 0 ... 0 0 0 \n",
+ "2 0 0 0 ... 0 0 0 \n",
+ "3 0 0 0 ... 0 0 0 \n",
+ "4 0 0 0 ... 0 0 0 \n",
+ "\n",
+ " whole_grain_wheat_flour wine wood yam yeast yogurt zucchini \n",
+ "0 0 0 0 0 0 0 0 \n",
+ "1 0 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 1 0 \n",
+ "\n",
+ "[5 rows x 382 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "cuisines_df = pd.read_csv(\"../data/cleaned_cuisines.csv\")\n",
+ "cuisines_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split, cross_val_score\n",
+ "from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve\n",
+ "from sklearn.svm import SVC\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 indian\n",
+ "1 indian\n",
+ "2 indian\n",
+ "3 indian\n",
+ "4 indian\n",
+ "Name: cuisine, dtype: object"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cuisines_label_df = cuisines_df['cuisine']\n",
+ "cuisines_label_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " almond | \n",
+ " angelica | \n",
+ " anise | \n",
+ " anise_seed | \n",
+ " apple | \n",
+ " apple_brandy | \n",
+ " apricot | \n",
+ " armagnac | \n",
+ " artemisia | \n",
+ " artichoke | \n",
+ " ... | \n",
+ " whiskey | \n",
+ " white_bread | \n",
+ " white_wine | \n",
+ " whole_grain_wheat_flour | \n",
+ " wine | \n",
+ " wood | \n",
+ " yam | \n",
+ " yeast | \n",
+ " yogurt | \n",
+ " zucchini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 380 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " almond angelica anise anise_seed apple apple_brandy apricot \\\n",
+ "0 0 0 0 0 0 0 0 \n",
+ "1 1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 0 0 \n",
+ "\n",
+ " armagnac artemisia artichoke ... whiskey white_bread white_wine \\\n",
+ "0 0 0 0 ... 0 0 0 \n",
+ "1 0 0 0 ... 0 0 0 \n",
+ "2 0 0 0 ... 0 0 0 \n",
+ "3 0 0 0 ... 0 0 0 \n",
+ "4 0 0 0 ... 0 0 0 \n",
+ "\n",
+ " whole_grain_wheat_flour wine wood yam yeast yogurt zucchini \n",
+ "0 0 0 0 0 0 0 0 \n",
+ "1 0 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 1 0 \n",
+ "\n",
+ "[5 rows x 380 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)\n",
+ "cuisines_feature_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy is 0.7881567973311092\n"
+ ]
+ }
+ ],
+ "source": [
+ "lr = LogisticRegression(multi_class='ovr',solver='liblinear')\n",
+ "model = lr.fit(X_train, np.ravel(y_train))\n",
+ "\n",
+ "accuracy = model.score(X_test, y_test)\n",
+ "print (\"Accuracy is {}\".format(accuracy))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ingredients: Index(['black_pepper', 'carrot', 'corn', 'onion', 'pineapple', 'pork',\n",
+ " 'shiitake', 'soy_sauce', 'starch', 'vinegar'],\n",
+ " dtype='object')\n",
+ "cuisine: chinese\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f'ingredients: {X_test.iloc[50][X_test.iloc[50]!=0].keys()}')\n",
+ "print(f'cuisine: {y_test.iloc[50]}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/ray/Desktop/ML-For-Beginners/.venv/lib/python3.12/site-packages/sklearn/base.py:465: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " chinese | \n",
+ " 0.954918 | \n",
+ "
\n",
+ " \n",
+ " korean | \n",
+ " 0.023422 | \n",
+ "
\n",
+ " \n",
+ " japanese | \n",
+ " 0.018495 | \n",
+ "
\n",
+ " \n",
+ " thai | \n",
+ " 0.002817 | \n",
+ "
\n",
+ " \n",
+ " indian | \n",
+ " 0.000348 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0\n",
+ "chinese 0.954918\n",
+ "korean 0.023422\n",
+ "japanese 0.018495\n",
+ "thai 0.002817\n",
+ "indian 0.000348"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test= X_test.iloc[50].values.reshape(-1, 1).T\n",
+ "proba = model.predict_proba(test)\n",
+ "classes = model.classes_\n",
+ "resultdf = pd.DataFrame(data=proba, columns=classes)\n",
+ "\n",
+ "topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])\n",
+ "topPrediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " chinese 0.74 0.74 0.74 234\n",
+ " indian 0.92 0.91 0.92 254\n",
+ " japanese 0.63 0.75 0.68 222\n",
+ " korean 0.80 0.78 0.79 235\n",
+ " thai 0.86 0.75 0.80 254\n",
+ "\n",
+ " accuracy 0.79 1199\n",
+ " macro avg 0.79 0.79 0.79 1199\n",
+ "weighted avg 0.80 0.79 0.79 1199\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "y_pred = model.predict(X_test)\n",
+ "print(classification_report(y_test,y_pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy is 0.8006672226855713\n"
+ ]
+ }
+ ],
+ "source": [
+ "lr = LogisticRegression(multi_class='ovr',solver='liblinear')\n",
+ "model = lr.fit(X_train, np.ravel(y_train))\n",
+ "\n",
+ "accuracy = model.score(X_test, y_test)\n",
+ "print (\"Accuracy is {}\".format(accuracy))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ingredients: Index(['bell_pepper', 'carrot', 'egg', 'pork', 'shiitake', 'soy_sauce',\n",
+ " 'starch', 'vegetable', 'vegetable_oil'],\n",
+ " dtype='object')\n",
+ "cuisine: chinese\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f'ingredients: {X_test.iloc[50][X_test.iloc[50]!=0].keys()}')\n",
+ "print(f'cuisine: {y_test.iloc[50]}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/ray/Desktop/ML-For-Beginners/.venv/lib/python3.12/site-packages/sklearn/base.py:465: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " chinese | \n",
+ " 0.928517 | \n",
+ "
\n",
+ " \n",
+ " japanese | \n",
+ " 0.042258 | \n",
+ "
\n",
+ " \n",
+ " korean | \n",
+ " 0.026577 | \n",
+ "
\n",
+ " \n",
+ " thai | \n",
+ " 0.002424 | \n",
+ "
\n",
+ " \n",
+ " indian | \n",
+ " 0.000223 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0\n",
+ "chinese 0.928517\n",
+ "japanese 0.042258\n",
+ "korean 0.026577\n",
+ "thai 0.002424\n",
+ "indian 0.000223"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test= X_test.iloc[50].values.reshape(-1, 1).T\n",
+ "proba = model.predict_proba(test)\n",
+ "classes = model.classes_\n",
+ "resultdf = pd.DataFrame(data=proba, columns=classes)\n",
+ "\n",
+ "topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])\n",
+ "topPrediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " chinese 0.74 0.69 0.71 243\n",
+ " indian 0.91 0.90 0.90 233\n",
+ " japanese 0.74 0.76 0.75 238\n",
+ " korean 0.82 0.81 0.81 238\n",
+ " thai 0.81 0.84 0.83 247\n",
+ "\n",
+ " accuracy 0.80 1199\n",
+ " macro avg 0.80 0.80 0.80 1199\n",
+ "weighted avg 0.80 0.80 0.80 1199\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "y_pred = model.predict(X_test)\n",
+ "print(classification_report(y_test,y_pred))"
+ ]
+ }
+ ],
"metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
"language_info": {
"codemirror_mode": {
"name": "ipython",
@@ -10,19 +816,10 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": 3
+ "version": "3.12.0"
},
"orig_nbformat": 2
},
"nbformat": 4,
- "nbformat_minor": 2,
- "cells": [
- {
- "source": [
- "# Build Classification Models"
- ],
- "cell_type": "markdown",
- "metadata": {}
- }
- ]
-}
\ No newline at end of file
+ "nbformat_minor": 2
+}