Create gb_ml_model_param_grid.ipynb

model parameter adjustment

Create gb_ml_model_param_grid.ipynb
model parameter adjustment
fde333ce · Sewwandi W.M.C · 3f346e12 · fde333ce
Commit fde333ce authored Mar 23, 2024 by Sewwandi W.M.C
Hide whitespace changes
Inline Side-by-side

Showing with 300 additions and 0 deletions

Function 01/Function 01/Function1/March_Update/SkillPredict/gb_ml_model_param_grid.ipynb ...n1/March_Update/SkillPredict/gb_ml_model_param_grid.ipynb +300 -0

No files found.
--- a/Function 01/Function 01/Function1/March_Update/SkillPredict/gb_ml_model_param_grid.ipynb
+++ b/Function 01/Function 01/Function1/March_Update/SkillPredict/gb_ml_model_param_grid.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Skills Prediction ML Model - GB Classifier with GridSearchCV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.ensemble import GradientBoostingClassifier\n",
+    "from sklearn.ensemble import RandomForestRegressor\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.metrics import accuracy_score, classification_report, mean_squared_error\n",
+    "\n",
+    "# Read the skills dataset; the CSV is decoded as Latin-1\n",
+    "df = pd.read_csv('./Dataset/skills_list_dataset_32.csv', encoding='latin1')\n",
+    "\n",
+    "# Give every distinct label a 1-based integer code and replace the\n",
+    "# 'Label' column with those codes\n",
+    "label_mapping = {\n",
+    "    label: code\n",
+    "    for code, label in enumerate(df['Label'].unique(), start=1)\n",
+    "}\n",
+    "df['Label'] = df['Label'].map(label_mapping)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Keywords_List</th>\n",
+       "      <th>Label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>['React', 'ReactJS', 'Frontend', 'Web Developm...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>['React Developer', 'Web Design', 'Responsive ...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>['JavaScript Library', 'Frontend Engineering',...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>['Frontend Engineering', 'JavaScript Framework...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>['Stateful Components', 'Stateless Components'...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                       Keywords_List  Label\n",
+       "0  ['React', 'ReactJS', 'Frontend', 'Web Developm...      1\n",
+       "1  ['React Developer', 'Web Design', 'Responsive ...      1\n",
+       "2  ['JavaScript Library', 'Frontend Engineering',...      1\n",
+       "3  ['Frontend Engineering', 'JavaScript Framework...      1\n",
+       "4  ['Stateful Components', 'Stateless Components'...      1"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five rows of the label-encoded frame\n",
+    "df.head(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Label Mapping:\n",
+      "React: 1\n",
+      "React Native: 2\n",
+      "Angular: 3\n",
+      "PHP: 4\n",
+      "Java: 5\n",
+      "HTML/HTML5: 6\n",
+      "CSS: 7\n",
+      "Python: 8\n",
+      "Node.js: 9\n",
+      "Full-Stack Development: 10\n",
+      "Mobile App Development (iOS): 11\n",
+      "Mobile App Development (Android): 12\n",
+      "JavaScript: 13\n",
+      "TypeScript: 14\n",
+      "DevOps: 15\n",
+      "QA/Testing: 16\n",
+      "UI: 17\n",
+      "UX: 18\n",
+      "UI/UX: 19\n",
+      "Cloud (AWS, Google, Azure): 20\n",
+      "Graphics Designing: 21\n",
+      "VFX Designing: 22\n",
+      "GitHub/Version Control: 23\n",
+      "Video Editing: 24\n",
+      "Project Management: 25\n",
+      "Problem Solving: 26\n",
+      "Team Collaboration: 27\n",
+      "3D Designing: 28\n",
+      "Leadership: 29\n",
+      "Animation Editing: 30\n",
+      "Time Management: 31\n",
+      "Digital Marketing: 32\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show which integer code was assigned to each skill label\n",
+    "print(\"Label Mapping:\")\n",
+    "for skill_label in label_mapping:\n",
+    "    skill_code = label_mapping[skill_label]\n",
+    "    print(f\"{skill_label}: {skill_code}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hold out 20% of the rows for the final evaluation. Stratify on the label whenever\n",
+    "# every class has at least two rows, so that no skill class ends up missing from the\n",
+    "# test split (a singleton class cannot be stratified, hence the guard).\n",
+    "# NOTE: re-run the following cells after changing the split; saved outputs depend on it.\n",
+    "label_counts = df['Label'].value_counts()\n",
+    "stratify_labels = df['Label'] if label_counts.min() >= 2 else None\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    df['Keywords_List'],\n",
+    "    df['Label'],\n",
+    "    test_size=0.2,\n",
+    "    random_state=42,\n",
+    "    stratify=stratify_labels,\n",
+    ")\n",
+    "\n",
+    "# Hyper-parameter grid for the gradient-boosting step. Keys follow the\n",
+    "# '<pipeline step name>__<parameter>' convention; make_pipeline names the step\n",
+    "# after the lower-cased estimator class.\n",
+    "param_grid = {\n",
+    "    'gradientboostingclassifier__n_estimators': [50, 100, 150, 200, 300],\n",
+    "    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.2, 0.3],\n",
+    "    'gradientboostingclassifier__max_depth': [3, 5, 7, 9, 11, None],\n",
+    "}\n",
+    "\n",
+    "# 5-fold cross-validated grid search over a TF-IDF -> gradient boosting pipeline,\n",
+    "# scored by accuracy. random_state makes the boosted trees reproducible between\n",
+    "# runs; n_jobs=-1 spreads the candidate fits over all CPU cores and does not\n",
+    "# change the scores.\n",
+    "model = GridSearchCV(\n",
+    "    make_pipeline(TfidfVectorizer(), GradientBoostingClassifier(random_state=42)),\n",
+    "    param_grid=param_grid,\n",
+    "    scoring='accuracy',\n",
+    "    cv=5,\n",
+    "    n_jobs=-1,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best Hyperparameters: {'gradientboostingclassifier__learning_rate': 0.2, 'gradientboostingclassifier__max_depth': 3, 'gradientboostingclassifier__n_estimators': 200}\n",
+      "Optimized Accuracy:  0.6296296296296297\n",
+      "Classification Report:\n",
+      "               precision    recall  f1-score   support\n",
+      "\n",
+      "           1       0.17      0.50      0.25         4\n",
+      "           2       0.75      0.60      0.67         5\n",
+      "           3       1.00      0.86      0.92         7\n",
+      "           4       1.00      0.90      0.95        10\n",
+      "           5       1.00      0.67      0.80         3\n",
+      "           6       1.00      0.80      0.89         5\n",
+      "           7       0.00      0.00      0.00         5\n",
+      "           8       0.71      1.00      0.83         5\n",
+      "           9       1.00      1.00      1.00         3\n",
+      "          10       0.33      0.25      0.29         4\n",
+      "          11       0.67      0.40      0.50         5\n",
+      "          12       0.67      1.00      0.80         2\n",
+      "          13       0.00      0.00      0.00         3\n",
+      "          14       0.71      0.83      0.77         6\n",
+      "          15       0.50      0.33      0.40         6\n",
+      "          16       0.80      1.00      0.89         4\n",
+      "          17       0.43      0.60      0.50         5\n",
+      "          18       0.33      0.25      0.29         4\n",
+      "          19       1.00      0.50      0.67         2\n",
+      "          20       0.00      0.00      0.00         0\n",
+      "          21       0.50      1.00      0.67         3\n",
+      "          22       0.75      1.00      0.86         3\n",
+      "          23       1.00      0.57      0.73         7\n",
+      "          24       0.50      1.00      0.67         2\n",
+      "          25       0.67      0.50      0.57         4\n",
+      "          26       0.20      0.33      0.25         3\n",
+      "          27       0.50      0.50      0.50         4\n",
+      "          28       1.00      0.50      0.67         2\n",
+      "          29       0.50      0.50      0.50         2\n",
+      "          30       1.00      0.71      0.83         7\n",
+      "          31       0.25      0.25      0.25         4\n",
+      "          32       0.83      0.83      0.83         6\n",
+      "\n",
+      "    accuracy                           0.63       135\n",
+      "   macro avg       0.62      0.60      0.59       135\n",
+      "weighted avg       0.67      0.63      0.63       135\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\niwar\\anaconda3\\envs\\resume-ranker\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "c:\\Users\\niwar\\anaconda3\\envs\\resume-ranker\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1471: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "c:\\Users\\niwar\\anaconda3\\envs\\resume-ranker\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "c:\\Users\\niwar\\anaconda3\\envs\\resume-ranker\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1471: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "c:\\Users\\niwar\\anaconda3\\envs\\resume-ranker\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n",
+      "c:\\Users\\niwar\\anaconda3\\envs\\resume-ranker\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1471: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, msg_start, len(result))\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run the grid search on the training split\n",
+    "model.fit(X_train, y_train)\n",
+    "\n",
+    "# Report the hyper-parameter combination selected by the search\n",
+    "best_params = model.best_params_\n",
+    "print(f\"Best Hyperparameters: {best_params}\")\n",
+    "\n",
+    "# Predict the held-out test rows with the fitted search object\n",
+    "predictions = model.predict(X_test)\n",
+    "\n",
+    "# Overall accuracy plus per-class precision/recall/F1.\n",
+    "# zero_division=0 reports 0.0 for classes with no predicted or no true samples\n",
+    "# (the same values as before) without emitting an UndefinedMetricWarning for each.\n",
+    "accuracy = accuracy_score(y_test, predictions)\n",
+    "classification_report_result = classification_report(y_test, predictions, zero_division=0)\n",
+    "\n",
+    "print(\"Optimized Accuracy: \", accuracy)\n",
+    "print(\"Classification Report:\\n\", classification_report_result)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "resume-ranker",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}