TMP-2023-24-051 / 2023-24-051 Resume_Ranker

Commit ea8cfd2b authored Mar 23, 2024 by Sewwandi W.M.C
Create function2.py

cleaning

Parent: fea03af9
Showing 1 changed file with 388 additions and 0 deletions

Function 01/Function 01/APIv3.0/app/function2.py (new file, 0 → 100644, +388 -0)
import os
from fastapi.responses import FileResponse
import gensim.downloader as api
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from keybert import KeyBERT
import pickle
import time
import ast
from openai import OpenAI
import os
from dotenv import load_dotenv
import re

load_dotenv()

client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def process_function2_data(cv_data_path: str, job_data_path: str):
    # Use the current script file path to determine the project directory
    project_path = os.path.dirname(os.path.abspath(__file__))

    print("Keywords and Synonyms generation started...")

    # Load CV data
    cv_data = pd.read_csv(cv_data_path, encoding='latin1')

    # Load job ad data
    job_ad_data = pd.read_csv(job_data_path, encoding='latin1')
    job_ad_data.columns = job_ad_data.columns.str.strip()

    # Remove empty columns
    cv_data = cv_data.dropna(axis=1, how='all')
    job_ad_data = job_ad_data.dropna(axis=1, how='all')

    cv_data['Other_Data'] = cv_data['Other_Data'].str.lower()
    cv_other_data = cv_data['Other_Data']

    job_ad_data[['Resume 1', 'Resume 2', 'Resume 3']] = job_ad_data[['Resume 1', 'Resume 2', 'Resume 3']].applymap(
        lambda x: x.lower() if pd.notna(x) else x)

    # Tokenization and Lemmatization function
    def tokenize_and_lemmatize(text):
        # Check if the value is NaN
        if pd.isna(text):
            return []

        # Replace '/' with space and then tokenize
        text = text.replace('/', ' ')

        # Remove punctuation and symbols
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove numbers
        text = ''.join([char for char in text if not char.isdigit()])

        # Tokenize the text
        tokens = word_tokenize(text)

        # Lemmatize the tokens
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

        # Remove consecutive single-letter tokens if they occur more than three times
        cleaned_tokens = []
        count_consecutive_single_letter = 0
        for token in tokens:
            if len(token) == 1:
                count_consecutive_single_letter += 1
                if count_consecutive_single_letter <= 2:
                    cleaned_tokens.append(token)
            else:
                count_consecutive_single_letter = 0
                cleaned_tokens.append(token)

        return cleaned_tokens

    # Tokenize and lemmatize CV data
    cv_tokens = cv_other_data.apply(tokenize_and_lemmatize)

    # Tokenize and lemmatize CV data for each resume column
    for i in range(1, 4):
        col_name = f'Resume {i}'
        job_ad_data[f'{col_name} Tokens'] = job_ad_data[col_name].apply(tokenize_and_lemmatize)

    def generate_cleaned_response(user_prompt):
        # Introduce a delay to avoid RateLimitError
        delay = int(os.environ.get("DELAY"))
        time.sleep(delay)

        system_prompt_keywords_path = os.path.join(project_path, 'system_prompt_keywords.txt')

        # Load default prompt from a text file
        with open(system_prompt_keywords_path, "r") as file:
            system_prompt = file.read()

        # Limit the maximum token length to 10000
        max_token_length = 10000
        if len(user_prompt.split()) > max_token_length:
            user_prompt = ' '.join(user_prompt.split()[:max_token_length])

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Organize and assign the assistant's response to a variable
        assistant_response = completion.choices[0].message.content

        # Define a cleaning function using regex to remove '*' or '**' and backticks
        def clean_response(response):
            cleaned_response = re.sub(r'\*+', '', response)
            cleaned_response = re.sub(r'`', '', cleaned_response)
            bracket_text = re.findall(r'\[(.*?)\]', cleaned_response)
            first_bracket_text = bracket_text[0] if bracket_text else ""
            final_cleaned_response = f"[{first_bracket_text}]"
            return final_cleaned_response

        cleaned_assistant_response = clean_response(assistant_response)

        if cleaned_assistant_response == "[]":
            tokens = tokenize_and_lemmatize(assistant_response)
            bracketed_tokens = "[" + ', '.join([f"'{token}'" for token in tokens]) + "]"
            return bracketed_tokens
        else:
            return cleaned_assistant_response

    ## CV Keywords
    # Apply the function to each row in the 'cv_tokens' column
    cv_data['Matched_Keywords'] = cv_tokens.apply(lambda tokens: generate_cleaned_response(' '.join(tokens)))

    def generate_cleaned_response_job(user_prompt):
        # Introduce a delay to avoid RateLimitError
        delay = int(os.environ.get("DELAY"))
        time.sleep(delay)

        system_prompt_keywords_job_path = os.path.join(project_path, 'system_prompt_keywords_job.txt')

        # Load default prompt from a text file
        with open(system_prompt_keywords_job_path, "r") as file:
            system_prompt = file.read()

        # Limit the maximum token length to 10000
        max_token_length = 10000
        if len(user_prompt.split()) > max_token_length:
            user_prompt = ' '.join(user_prompt.split()[:max_token_length])

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Organize and assign the assistant's response to a variable
        assistant_response = completion.choices[0].message.content

        # Define a cleaning function using regex to remove '*' or '**' and backticks
        def clean_response(response):
            cleaned_response = re.sub(r'\*+', '', response)
            cleaned_response = re.sub(r'`', '', cleaned_response)
            bracket_text = re.findall(r'\[(.*?)\]', cleaned_response)
            first_bracket_text = bracket_text[0] if bracket_text else ""
            final_cleaned_response = f"[{first_bracket_text}]"
            return final_cleaned_response

        cleaned_assistant_response = clean_response(assistant_response)

        if cleaned_assistant_response == "[]":
            tokens = tokenize_and_lemmatize(assistant_response)
            bracketed_tokens = "[" + ', '.join([f"'{token}'" for token in tokens]) + "]"
            return bracketed_tokens
        else:
            return cleaned_assistant_response

    ## Job Resumes Keywords
    # Apply the function to each resume column
    for i in range(1, 4):
        # Introduce a delay to avoid RateLimitError
        delay = int(os.environ.get("DELAY"))
        time.sleep(delay)

        col_name_tokens = f'Resume {i} Tokens'
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        job_ad_data[col_name_matched_keywords] = job_ad_data[col_name_tokens].apply(
            lambda tokens: generate_cleaned_response_job(' '.join(tokens)))
        print(f'Processed Resume {i} Matched_Keywords')

    # Apply the function to each matched keywords column
    for i in range(1, 4):
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        col_name_meaningful_keywords = f'Resume {i} Meaningful_Keywords'
        job_ad_data[col_name_meaningful_keywords] = job_ad_data[col_name_matched_keywords]
        print(f'Processed Resume {i} Meaningful_Keywords')

    ## KeyBERT Keywords Extraction - CV Matched_Keywords
    # Function to safely convert string representation of list to list
    def safe_list_conversion(string):
        try:
            return ast.literal_eval(string)
        except (SyntaxError, ValueError):
            return None  # Return None instead of an empty list for easy removal

    # Create a KeyBERT model with adjusted parameters
    kw_model = KeyBERT(model='distilbert-base-nli-mean-tokens')

    # Function to extract keywords using KeyBERT
    def extract_keywords_with_keybert(matched_keywords):
        # Convert the string representation of the list to an actual list
        matched_keywords_list = safe_list_conversion(matched_keywords)

        # If matched_keywords_list is None, return an empty list
        if matched_keywords_list is None:
            return []

        # Join the matched keywords into a single string
        text = ' '.join(matched_keywords_list)

        # Use KeyBERT to extract keywords
        keywords = kw_model.extract_keywords(text, top_n=40, keyphrase_ngram_range=(1, 1), stop_words="english")
        extracted_keywords = [keyword[0] for keyword in keywords]
        return extracted_keywords

    # Apply the function to each row in the 'Matched_Keywords' column
    cv_data['KeyBERT_Extracted_Keywords'] = cv_data['Matched_Keywords'].apply(extract_keywords_with_keybert)

    # Remove rows where 'Matched_Keywords' could not be converted
    cv_data = cv_data[cv_data['KeyBERT_Extracted_Keywords'].apply(len) > 0]

    # Function to create final keywords combining 'Matched_Keywords' and 'KeyBERT_Extracted_Keywords'
    def create_final_keywords(row):
        matched_keywords_list = safe_list_conversion(row['Matched_Keywords'])
        keybert_extracted_keywords = row['KeyBERT_Extracted_Keywords']
        if matched_keywords_list is None:
            return keybert_extracted_keywords
        else:
            final_keywords = list(set(matched_keywords_list + keybert_extracted_keywords))
            return final_keywords

    # Apply the function to each row in the DataFrame
    cv_data['Final_Keywords'] = cv_data.apply(create_final_keywords, axis=1)

    ## KeyBERT Keywords Extraction - Job Resumes Matched_Keywords
    # Apply the function to each matched keywords column for Job Resumes
    for i in range(1, 4):
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        col_name_keybert_extracted_keywords = f'Resume {i} KeyBERT_Extracted_Keywords'
        job_ad_data[col_name_keybert_extracted_keywords] = job_ad_data[col_name_matched_keywords].apply(
            extract_keywords_with_keybert)
        job_ad_data = job_ad_data[job_ad_data[col_name_keybert_extracted_keywords].apply(len) > 0]

    # Function to create final keywords combining 'Matched_Keywords' and 'KeyBERT_Extracted_Keywords'
    def create_final_keywords_job(matched_keywords, keybert_extracted_keywords):
        matched_keywords_list = matched_keywords
        keybert_extracted_keywords = keybert_extracted_keywords
        if matched_keywords_list is None:
            return keybert_extracted_keywords
        else:
            final_keywords = list(set(matched_keywords_list + keybert_extracted_keywords))
            return final_keywords

    # Apply the function to each row in the DataFrame for Job Resumes
    for i in range(1, 4):
        col_name_meaningful_keywords = f'Resume {i} Matched_Keywords'
        col_name_keybert_extracted_keywords = f'Resume {i} KeyBERT_Extracted_Keywords'
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        job_ad_data[col_name_final_keywords] = job_ad_data.apply(
            lambda row: create_final_keywords_job(
                safe_list_conversion(row[col_name_meaningful_keywords]),
                row[col_name_keybert_extracted_keywords]),
            axis=1)

    ### AI-based Synonym Generation - CV Final_Keywords
    # Load Word2Vec model
    w2v_model = api.load('word2vec-google-news-300')

    # Function to generate synonyms using Word2Vec model
    def generate_synonyms(word):
        try:
            # Get synonyms using the Word2Vec model and convert to lowercase
            synonyms = [synonym.lower() for synonym, similarity in w2v_model.most_similar(word, topn=5)]
            return synonyms
        except KeyError:
            # Handle the case where the word is not in the vocabulary
            return []

    # Function to generate synonyms for a list of words
    def generate_synonyms_list(word_list):
        synonyms_list = [generate_synonyms(word) for word in word_list]
        return synonyms_list

    # Apply the function to each row in the DataFrame
    cv_data['Synonyms'] = cv_data['Final_Keywords'].apply(generate_synonyms_list)

    # Function to clean and get the final list of synonyms
    def clean_and_get_final_synonyms(synonyms_list):
        final_synonyms = [synonym for sublist in synonyms_list for synonym in sublist]
        final_synonyms = list(set(final_synonyms))  # Remove duplicates
        final_synonyms = [synonym for synonym in final_synonyms if synonym]  # Remove empty strings
        return final_synonyms

    # Apply the function to each row in the DataFrame and convert to lowercase
    cv_data['Final_Synonyms'] = cv_data['Synonyms'].apply(clean_and_get_final_synonyms)

    ### AI-based Synonym Generation - Job Resumes Final_Keywords
    # Apply the function to each resume column for Job Resumes
    for i in range(1, 4):
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        col_name_synonyms = f'Resume {i} Synonyms'
        job_ad_data[col_name_synonyms] = job_ad_data[col_name_final_keywords].apply(generate_synonyms_list)

        # Function to clean and get the final list of synonyms
        def clean_and_get_final_synonyms(synonyms_list):
            final_synonyms = [synonym for sublist in synonyms_list for synonym in sublist]
            final_synonyms = list(set(final_synonyms))  # Remove duplicates
            final_synonyms = [synonym for synonym in final_synonyms if synonym]  # Remove empty strings
            return final_synonyms

        # Apply the function to each row in the DataFrame
        job_ad_data[col_name_synonyms] = job_ad_data[col_name_synonyms].apply(clean_and_get_final_synonyms)

    ## Final Keywords + Synonyms - CV Data
    # Combine 'Final_Keywords' and 'Final_Synonyms' into 'Final_Keywords_Synonyms' column with uniqueness
    cv_data['Final_Keywords_Synonyms'] = cv_data.apply(
        lambda row: list(set(row['Final_Keywords'] + row['Final_Synonyms'])), axis=1)

    # Save DataFrames to CSV files
    final_cv_keywords_synonyms_path = os.path.join(project_path, 'KeywordFiles', 'final_cv_keywords_synonyms.csv')
    cv_data.to_csv(final_cv_keywords_synonyms_path, index=False)

    ## Final Keywords + Synonyms - Job Resumes Data
    # Apply the function to each resume column for Job Resumes
    for i in range(1, 4):
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        col_name_synonyms = f'Resume {i} Synonyms'
        col_name_combined = f'Resume {i} Final_Keywords_Synonyms'

        def combine_keywords_and_synonyms(row):
            if row[col_name_final_keywords] and row[col_name_synonyms]:
                return list(set(row[col_name_final_keywords] + row[col_name_synonyms]))
            elif row[col_name_final_keywords]:
                return row[col_name_final_keywords]
            elif row[col_name_synonyms]:
                return row[col_name_synonyms]
            else:
                return []

        job_ad_data[col_name_combined] = job_ad_data.apply(combine_keywords_and_synonyms, axis=1)

    # Save the updated dataset to a CSV file for Job Resumes
    final_job_resumes_keywords_synonyms_path = os.path.join(
        project_path, 'KeywordFiles', 'final_job_resumes_keywords_synonyms.csv')
    job_ad_data.to_csv(final_job_resumes_keywords_synonyms_path, index=False)

    print("Keywords and Synonyms saved to CSV files:")
    print(f"Final CV Keywords and Synonyms saved to {final_cv_keywords_synonyms_path}")
    print(f"Final Job Resumes Keywords and Synonyms saved to {final_job_resumes_keywords_synonyms_path}")
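For context, a minimal usage sketch of how this module might be called from the API layer. The CSV file names, the DELAY value, and the import path are assumptions for illustration only; they are not part of this commit. The code above expects the system_prompt_keywords*.txt files and a KeywordFiles/ folder to exist next to function2.py, and a DELAY environment variable to be set.

    # Hypothetical usage sketch (not part of this commit)
    import os

    # The module reads DELAY via os.environ.get and converts it to int; assumed value here
    os.environ.setdefault("DELAY", "1")

    # Assumes function2.py is importable from the app package
    from function2 import process_function2_data

    process_function2_data(
        cv_data_path="cv_data.csv",    # hypothetical CV dataset with an 'Other_Data' column
        job_data_path="job_data.csv",  # hypothetical job-ad dataset with 'Resume 1'..'Resume 3' columns
    )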