model

model development

model
model development
3f346e12 · Sewwandi W.M.C · 9ac1a033 · 3f346e12
Commit 3f346e12 authored Mar 23, 2024 by Sewwandi W.M.C
Hide whitespace changes
Inline Side-by-side

Showing with 245 additions and 0 deletions

Function 01/Function 01/Function1/March_Update/SkillPredict/gb_ml_model_dev copy.ipynb ...ion1/March_Update/SkillPredict/gb_ml_model_dev copy.ipynb +245 -0

No files found.
--- a/Function 01/Function 01/Function1/March_Update/SkillPredict/gb_ml_model_dev copy.ipynb
+++ b/Function 01/Function 01/Function1/March_Update/SkillPredict/gb_ml_model_dev copy.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Skills Suggestion ML Model - Gradient Boosting Classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.ensemble import GradientBoostingClassifier\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from joblib import dump\n",
+    "\n",
+    "# Read the skills dataset; the CSV is decoded as Latin-1\n",
+    "df = pd.read_csv(\n",
+    "    './Dataset/skills_list_dataset_32.csv',\n",
+    "    encoding='latin1',\n",
+    ")\n",
+    "\n",
+    "# Number the distinct labels 1..N in order of first appearance,\n",
+    "# then replace the text labels in the frame with those numbers\n",
+    "distinct_labels = df['Label'].unique()\n",
+    "label_mapping = dict(zip(distinct_labels, range(1, len(distinct_labels) + 1)))\n",
+    "df['Label'] = df['Label'].map(label_mapping)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _keywords_to_text(raw_keywords):\n",
+    "    \"\"\"Lowercase a comma-separated keyword string and join its pieces with spaces.\"\"\"\n",
+    "    pieces = raw_keywords.lower().split(',')\n",
+    "    return ' '.join(pieces)\n",
+    "\n",
+    "\n",
+    "# Turn every Keywords_List entry into one lowercase, space-separated string\n",
+    "df['Keywords_List'] = df['Keywords_List'].apply(_keywords_to_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Label Mapping:\n",
+      "React: 1\n",
+      "React Native: 2\n",
+      "Angular: 3\n",
+      "PHP: 4\n",
+      "Java: 5\n",
+      "HTML/HTML5: 6\n",
+      "CSS: 7\n",
+      "Python: 8\n",
+      "Node.js: 9\n",
+      "Full-Stack Development: 10\n",
+      "Mobile App Development (iOS): 11\n",
+      "Mobile App Development (Android): 12\n",
+      "JavaScript: 13\n",
+      "TypeScript: 14\n",
+      "DevOps: 15\n",
+      "QA/Testing: 16\n",
+      "UI: 17\n",
+      "UX: 18\n",
+      "UI/UX: 19\n",
+      "Cloud (AWS, Google, Azure): 20\n",
+      "Graphics Designing: 21\n",
+      "VFX Designing: 22\n",
+      "GitHub/Version Control: 23\n",
+      "Video Editing: 24\n",
+      "Project Management: 25\n",
+      "Problem Solving: 26\n",
+      "Team Collaboration: 27\n",
+      "3D Designing: 28\n",
+      "Leadership: 29\n",
+      "Animation Editing: 30\n",
+      "Time Management: 31\n",
+      "Digital Marketing: 32\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show which number each skill label was encoded as\n",
+    "print(\"Label Mapping:\")\n",
+    "for skill_label in label_mapping:\n",
+    "    print(f\"{skill_label}: {label_mapping[skill_label]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Gradient Boosting Classifier: Model Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training Accuracy: 94.7205%\n",
+      "Validation Accuracy: 47.8261%\n",
+      "Testing Accuracy: 57.3913%\n",
+      "Classification Report:\n",
+      "               precision    recall  f1-score   support\n",
+      "\n",
+      "           1       0.33      0.20      0.25         5\n",
+      "           2       0.50      0.50      0.50         2\n",
+      "           3       1.00      1.00      1.00         1\n",
+      "           4       0.50      1.00      0.67         2\n",
+      "           5       1.00      0.75      0.86        12\n",
+      "           6       0.60      1.00      0.75         3\n",
+      "           7       1.00      0.33      0.50         3\n",
+      "           8       1.00      1.00      1.00         2\n",
+      "           9       1.00      1.00      1.00         5\n",
+      "          10       0.50      0.25      0.33         4\n",
+      "          11       1.00      0.60      0.75         5\n",
+      "          12       1.00      0.67      0.80         6\n",
+      "          13       0.00      0.00      0.00         2\n",
+      "          14       0.20      0.50      0.29         2\n",
+      "          15       0.67      0.33      0.44         6\n",
+      "          16       0.83      0.83      0.83         6\n",
+      "          17       0.00      0.00      0.00         4\n",
+      "          18       0.00      0.00      0.00         1\n",
+      "          19       0.67      1.00      0.80         4\n",
+      "          20       0.07      1.00      0.13         1\n",
+      "          21       0.00      0.00      0.00         1\n",
+      "          22       0.00      0.00      0.00         2\n",
+      "          23       0.50      0.50      0.50         4\n",
+      "          24       1.00      1.00      1.00         3\n",
+      "          25       0.67      1.00      0.80         2\n",
+      "          26       0.25      1.00      0.40         1\n",
+      "          27       0.33      0.33      0.33         3\n",
+      "          28       1.00      0.33      0.50         3\n",
+      "          29       1.00      0.67      0.80         3\n",
+      "          30       0.50      0.14      0.22         7\n",
+      "          31       0.67      0.50      0.57         4\n",
+      "          32       1.00      0.83      0.91         6\n",
+      "\n",
+      "    accuracy                           0.57       115\n",
+      "   macro avg       0.59      0.57      0.53       115\n",
+      "weighted avg       0.69      0.57      0.59       115\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "['regularized_gb_model.joblib']"
+      ]
+     },
+     "execution_count": 111,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.ensemble import GradientBoostingClassifier\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from joblib import dump\n",
+    "\n",
+    "# One seed for both splits and the classifier, so a fresh run repeats the same results\n",
+    "RANDOM_STATE = 42\n",
+    "\n",
+    "# Split the data into training, validation, and testing sets:\n",
+    "# 20% of the rows are held out for testing, then 30% of the remainder for validation\n",
+    "X_train, X_test, y_train, y_test = train_test_split(df['Keywords_List'], df['Label'], test_size=0.2, random_state=RANDOM_STATE)\n",
+    "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=RANDOM_STATE)\n",
+    "\n",
+    "# Fit the TF-IDF vocabulary on the training split only, then reuse it for the other splits\n",
+    "tfidf_vectorizer = TfidfVectorizer()\n",
+    "X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
+    "X_val_tfidf = tfidf_vectorizer.transform(X_val)\n",
+    "X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
+    "\n",
+    "# Train the model with early stopping.\n",
+    "# Note: early stopping does not look at X_val. With n_iter_no_change set, scikit-learn\n",
+    "# sets aside its own validation_fraction of the training data to decide when to stop.\n",
+    "model = GradientBoostingClassifier(\n",
+    "    learning_rate=0.01,\n",
+    "    max_depth=8,  # Adjust max_depth to control tree complexity\n",
+    "    n_estimators=400,\n",
+    "    n_iter_no_change=5,  # Stop if the internal validation score does not improve for 5 iterations\n",
+    "    tol=0.01,  # Tolerance for the early stopping\n",
+    "    random_state=RANDOM_STATE  # Seeds the trees and the internal early-stopping split\n",
+    ")\n",
+    "\n",
+    "model.fit(X_train_tfidf, y_train)\n",
+    "\n",
+    "# Evaluate the model on training set\n",
+    "train_predictions = model.predict(X_train_tfidf)\n",
+    "train_accuracy = accuracy_score(y_train, train_predictions)\n",
+    "\n",
+    "# Print training accuracy\n",
+    "print(f\"Training Accuracy: {round(train_accuracy * 100, 4)}%\")\n",
+    "\n",
+    "# Evaluate the model on validation set\n",
+    "val_predictions = model.predict(X_val_tfidf)\n",
+    "val_accuracy = accuracy_score(y_val, val_predictions)\n",
+    "\n",
+    "# Print validation accuracy\n",
+    "print(f\"Validation Accuracy: {round(val_accuracy * 100, 4)}%\")\n",
+    "\n",
+    "# Make predictions on the test set\n",
+    "test_predictions = model.predict(X_test_tfidf)\n",
+    "\n",
+    "# Evaluate the model on test data\n",
+    "test_accuracy = accuracy_score(y_test, test_predictions)\n",
+    "classification_report_result = classification_report(y_test, test_predictions)\n",
+    "\n",
+    "print(f\"Testing Accuracy: {round(test_accuracy * 100, 4)}%\")\n",
+    "print(\"Classification Report:\\n\", classification_report_result)\n",
+    "\n",
+    "# Save the fitted vectorizer as well: the classifier only accepts features produced\n",
+    "# by this exact vocabulary, so it cannot score raw text without it\n",
+    "dump(tfidf_vectorizer, 'regularized_gb_tfidf_vectorizer.joblib')\n",
+    "\n",
+    "# Save the trained model (kept last so the cell still displays this file name)\n",
+    "dump(model, 'regularized_gb_model.joblib')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "resume-ranker",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}