Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
2023-24-051 Resume_Ranker
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
TMP-2023-24-051
2023-24-051 Resume_Ranker
Commits
3f346e12
Commit
3f346e12
authored
Mar 23, 2024
by
Sewwandi W.M.C
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
model
model development
parent
9ac1a033
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
245 additions
and
0 deletions
+245
-0
Function 01/Function 01/Function1/March_Update/SkillPredict/gb_ml_model_dev copy.ipynb
...ion1/March_Update/SkillPredict/gb_ml_model_dev copy.ipynb
+245
-0
No files found.
Function 01/Function 01/Function1/March_Update/SkillPredict/gb_ml_model_dev copy.ipynb
0 → 100644
View file @
3f346e12
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Skills Suggestion ML Model - Gradient Boosting Classifier"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from joblib import dump\n",
"\n",
"# Read the skills dataset (the CSV is decoded as latin1)\n",
"df = pd.read_csv('./Dataset/skills_list_dataset_32.csv', encoding='latin1')\n",
"\n",
"# Give every distinct label a 1-based integer code, in order of first appearance,\n",
"# then replace the text labels in the frame with those codes\n",
"unique_labels = df['Label'].unique()\n",
"label_mapping = dict(zip(unique_labels, range(1, len(unique_labels) + 1)))\n",
"df['Label'] = df['Label'].map(label_mapping)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"# Normalise Keywords_List: lowercase each entry and turn the comma separators into spaces\n",
"df['Keywords_List'] = df['Keywords_List'].apply(lambda keywords: keywords.lower().replace(',', ' '))"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Label Mapping:\n",
"React: 1\n",
"React Native: 2\n",
"Angular: 3\n",
"PHP: 4\n",
"Java: 5\n",
"HTML/HTML5: 6\n",
"CSS: 7\n",
"Python: 8\n",
"Node.js: 9\n",
"Full-Stack Development: 10\n",
"Mobile App Development (iOS): 11\n",
"Mobile App Development (Android): 12\n",
"JavaScript: 13\n",
"TypeScript: 14\n",
"DevOps: 15\n",
"QA/Testing: 16\n",
"UI: 17\n",
"UX: 18\n",
"UI/UX: 19\n",
"Cloud (AWS, Google, Azure): 20\n",
"Graphics Designing: 21\n",
"VFX Designing: 22\n",
"GitHub/Version Control: 23\n",
"Video Editing: 24\n",
"Project Management: 25\n",
"Problem Solving: 26\n",
"Team Collaboration: 27\n",
"3D Designing: 28\n",
"Leadership: 29\n",
"Animation Editing: 30\n",
"Time Management: 31\n",
"Digital Marketing: 32\n"
]
}
],
"source": [
"# Show which integer code each skill label was assigned\n",
"print(\"Label Mapping:\")\n",
"for skill_name in label_mapping:\n",
"    skill_code = label_mapping[skill_name]\n",
"    print(f\"{skill_name}: {skill_code}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train the Gradient Boosting Classifier"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training Accuracy: 94.7205%\n",
"Validation Accuracy: 47.8261%\n",
"Testing Accuracy: 57.3913%\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 1 0.33 0.20 0.25 5\n",
" 2 0.50 0.50 0.50 2\n",
" 3 1.00 1.00 1.00 1\n",
" 4 0.50 1.00 0.67 2\n",
" 5 1.00 0.75 0.86 12\n",
" 6 0.60 1.00 0.75 3\n",
" 7 1.00 0.33 0.50 3\n",
" 8 1.00 1.00 1.00 2\n",
" 9 1.00 1.00 1.00 5\n",
" 10 0.50 0.25 0.33 4\n",
" 11 1.00 0.60 0.75 5\n",
" 12 1.00 0.67 0.80 6\n",
" 13 0.00 0.00 0.00 2\n",
" 14 0.20 0.50 0.29 2\n",
" 15 0.67 0.33 0.44 6\n",
" 16 0.83 0.83 0.83 6\n",
" 17 0.00 0.00 0.00 4\n",
" 18 0.00 0.00 0.00 1\n",
" 19 0.67 1.00 0.80 4\n",
" 20 0.07 1.00 0.13 1\n",
" 21 0.00 0.00 0.00 1\n",
" 22 0.00 0.00 0.00 2\n",
" 23 0.50 0.50 0.50 4\n",
" 24 1.00 1.00 1.00 3\n",
" 25 0.67 1.00 0.80 2\n",
" 26 0.25 1.00 0.40 1\n",
" 27 0.33 0.33 0.33 3\n",
" 28 1.00 0.33 0.50 3\n",
" 29 1.00 0.67 0.80 3\n",
" 30 0.50 0.14 0.22 7\n",
" 31 0.67 0.50 0.57 4\n",
" 32 1.00 0.83 0.91 6\n",
"\n",
" accuracy 0.57 115\n",
" macro avg 0.59 0.57 0.53 115\n",
"weighted avg 0.69 0.57 0.59 115\n",
"\n"
]
},
{
"data": {
"text/plain": [
"['regularized_gb_model.joblib']"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from joblib import dump\n",
"\n",
"# Hold out 20% of the rows for testing, then carve 30% of the remainder off as a\n",
"# validation set. Both splits are seeded so the partition is reproducible.\n",
"# NOTE(review): the splits are not stratified; consider passing stratify= if every\n",
"# label has enough rows to appear in all three subsets - TODO confirm against the data.\n",
"X_train, X_test, y_train, y_test = train_test_split(df['Keywords_List'], df['Label'], test_size=0.2, random_state=42)\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)\n",
"\n",
"# Fit the TF-IDF vocabulary on the training split only, then reuse it for the other splits\n",
"tfidf_vectorizer = TfidfVectorizer()\n",
"X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
"X_val_tfidf = tfidf_vectorizer.transform(X_val)\n",
"X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
"\n",
"# Gradient boosting with a small learning rate and early stopping.\n",
"# With n_iter_no_change set, scikit-learn holds out an internal fraction of the data\n",
"# passed to fit() (validation_fraction, default 0.1) to decide when to stop; X_val above\n",
"# is not used for early stopping, only for the accuracy reported below.\n",
"model = GradientBoostingClassifier(\n",
"    learning_rate=0.01,\n",
"    max_depth=8,  # Adjust max_depth to control tree complexity\n",
"    n_estimators=400,  # Upper bound; early stopping may use fewer\n",
"    n_iter_no_change=5,  # Stop once the internal validation score stalls for 5 iterations\n",
"    tol=0.01,  # Minimum improvement that counts as progress for early stopping\n",
"    random_state=42  # Seed the internal split and tree building so runs are repeatable\n",
")\n",
"\n",
"model.fit(X_train_tfidf, y_train)\n",
"\n",
"# Accuracy on the data the model was fitted on (optimistic by construction)\n",
"train_predictions = model.predict(X_train_tfidf)\n",
"train_accuracy = accuracy_score(y_train, train_predictions)\n",
"print(f\"Training Accuracy: {round(train_accuracy * 100, 4)}%\")\n",
"\n",
"# Accuracy on the held-out validation split\n",
"val_predictions = model.predict(X_val_tfidf)\n",
"val_accuracy = accuracy_score(y_val, val_predictions)\n",
"print(f\"Validation Accuracy: {round(val_accuracy * 100, 4)}%\")\n",
"\n",
"# Accuracy and per-class metrics on the untouched test split\n",
"test_predictions = model.predict(X_test_tfidf)\n",
"test_accuracy = accuracy_score(y_test, test_predictions)\n",
"classification_report_result = classification_report(y_test, test_predictions)\n",
"\n",
"print(f\"Testing Accuracy: {round(test_accuracy * 100, 4)}%\")\n",
"print(\"Classification Report:\\n\", classification_report_result)\n",
"\n",
"# Persist the fitted vectorizer alongside the classifier: the model was trained on\n",
"# TF-IDF features, so raw text cannot be scored at inference time without it.\n",
"dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')\n",
"\n",
"# Save the trained model (kept last so the cell still displays this file name)\n",
"dump(model, 'regularized_gb_model.joblib')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "resume-ranker",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment