Commit ea8cfd2b authored by Sewwandi W.M.C

Create function2.py

cleaning
parent fea03af9
import os
import re
import ast
import time
import string

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api
from keybert import KeyBERT
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

# Point the OpenAI client at a local OpenAI-compatible server (e.g. LM Studio);
# no real API key is needed for a local endpoint.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

# Download the NLTK resources needed for tokenization, stopword removal and lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
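# The per-request delay (in seconds) is read from the DELAY environment
# variable, typically supplied through the .env file loaded above.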

def process_function2_data(cv_data_path: str, job_data_path: str):
    # Use the current script file path to determine the project directory
    project_path = os.path.dirname(os.path.abspath(__file__))
    print("Keywords and Synonyms generation started...")

    # Load CV and job ad data
    cv_data = pd.read_csv(cv_data_path, encoding='latin1')
    job_ad_data = pd.read_csv(job_data_path, encoding='latin1')
    job_ad_data.columns = job_ad_data.columns.str.strip()

    # Remove empty columns
    cv_data = cv_data.dropna(axis=1, how='all')
    job_ad_data = job_ad_data.dropna(axis=1, how='all')

    # Lowercase the free-text columns
    cv_data['Other_Data'] = cv_data['Other_Data'].str.lower()
    cv_other_data = cv_data['Other_Data']
    job_ad_data[['Resume 1', 'Resume 2', 'Resume 3']] = job_ad_data[
        ['Resume 1', 'Resume 2', 'Resume 3']
    ].applymap(lambda x: x.lower() if pd.notna(x) else x)
    # Tokenization and lemmatization helper
    def tokenize_and_lemmatize(text):
        # Check if the value is NaN
        if pd.isna(text):
            return []
        # Replace '/' with space so joined terms like 'python/sql' split cleanly
        text = text.replace('/', ' ')
        # Remove punctuation and symbols
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove numbers
        text = ''.join([char for char in text if not char.isdigit()])
        # Tokenize the text
        tokens = word_tokenize(text)
        # Lemmatize the tokens
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        # Keep at most two consecutive single-letter tokens; drop the rest of a longer run
        cleaned_tokens = []
        count_consecutive_single_letter = 0
        for token in tokens:
            if len(token) == 1:
                count_consecutive_single_letter += 1
                if count_consecutive_single_letter <= 2:
                    cleaned_tokens.append(token)
            else:
                count_consecutive_single_letter = 0
                cleaned_tokens.append(token)
        return cleaned_tokens
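    # Illustrative example (hypothetical input, already lowercased by the caller):
    #   tokenize_and_lemmatize("skilled in python/sql, 5 years of experience")
    #   -> ['skilled', 'python', 'sql', 'year', 'experience']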
    # Tokenize and lemmatize the CV free text
    cv_tokens = cv_other_data.apply(tokenize_and_lemmatize)

    # Tokenize and lemmatize each job-ad resume column
    for i in range(1, 4):
        col_name = f'Resume {i}'
        job_ad_data[f'{col_name} Tokens'] = job_ad_data[col_name].apply(tokenize_and_lemmatize)
    def generate_cleaned_response(user_prompt, prompt_filename="system_prompt_keywords.txt"):
        # Introduce a delay between requests to avoid rate-limit errors
        delay = int(os.environ.get("DELAY", "0"))
        time.sleep(delay)
        # Load the system prompt from a text file in the project directory
        system_prompt_path = os.path.join(project_path, prompt_filename)
        with open(system_prompt_path, "r") as file:
            system_prompt = file.read()
        # Truncate the prompt to at most 10,000 whitespace-separated words
        max_token_length = 10000
        if len(user_prompt.split()) > max_token_length:
            user_prompt = ' '.join(user_prompt.split()[:max_token_length])
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",  # the local server may ignore or remap this name
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        # The assistant's raw reply
        assistant_response = completion.choices[0].message.content

        # Strip markdown emphasis ('*'/'**') and backticks, then keep only the
        # first bracketed span of the reply
        def clean_response(response):
            cleaned_response = re.sub(r'\*+', '', response)
            cleaned_response = re.sub(r'`', '', cleaned_response)
            bracket_text = re.findall(r'\[(.*?)\]', cleaned_response)
            first_bracket_text = bracket_text[0] if bracket_text else ""
            return f"[{first_bracket_text}]"

        cleaned_assistant_response = clean_response(assistant_response)
        # Fall back to plain tokenization when no bracketed list was found
        if cleaned_assistant_response == "[]":
            tokens = tokenize_and_lemmatize(assistant_response)
            return "[" + ', '.join([f"'{token}'" for token in tokens]) + "]"
        return cleaned_assistant_response
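    # The system prompt is expected to make the model answer with a bracketed
    # list such as "['python', 'sql', 'aws']" (an assumption about the prompt
    # files, which are not shown here); everything outside the first [...] is discarded.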
    ## CV Keywords
    # Apply the function to each row of CV tokens
    cv_data['Matched_Keywords'] = cv_tokens.apply(lambda tokens: generate_cleaned_response(' '.join(tokens)))
    def generate_cleaned_response_job(user_prompt):
        # Same pipeline as generate_cleaned_response, but driven by the
        # job-specific system prompt file
        return generate_cleaned_response(user_prompt, prompt_filename="system_prompt_keywords_job.txt")
    ## Job Resumes Keywords
    # Apply the function to each resume column
    for i in range(1, 4):
        # Extra per-column delay on top of the per-call delay inside the function
        delay = int(os.environ.get("DELAY", "0"))
        time.sleep(delay)
        col_name_tokens = f'Resume {i} Tokens'
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        job_ad_data[col_name_matched_keywords] = job_ad_data[col_name_tokens].apply(
            lambda tokens: generate_cleaned_response_job(' '.join(tokens)))
        print(f'Processed Resume {i} Matched_Keywords')
    # Copy the matched keywords into a 'Meaningful_Keywords' column per resume
    for i in range(1, 4):
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        col_name_meaningful_keywords = f'Resume {i} Meaningful_Keywords'
        job_ad_data[col_name_meaningful_keywords] = job_ad_data[col_name_matched_keywords]
        print(f'Processed Resume {i} Meaningful_Keywords')
    ## KeyBERT Keywords Extraction - CV Matched_Keywords
    # Safely convert a string representation of a list to an actual list
    def safe_list_conversion(string_value):
        try:
            return ast.literal_eval(string_value)
        except (SyntaxError, ValueError):
            return None  # Return None instead of an empty list for easy removal
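    # e.g. safe_list_conversion("['python', 'sql']") -> ['python', 'sql'],
    # while a malformed reply such as "python, sql" returns None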
    # Create a KeyBERT model (backed by a sentence-transformers embedding model)
    kw_model = KeyBERT(model='distilbert-base-nli-mean-tokens')

    # Extract keywords with KeyBERT from a stringified keyword list
    def extract_keywords_with_keybert(matched_keywords):
        # Convert the string representation of the list to an actual list
        matched_keywords_list = safe_list_conversion(matched_keywords)
        # If the conversion failed, return an empty list
        if matched_keywords_list is None:
            return []
        # Join the matched keywords into a single string
        text = ' '.join(matched_keywords_list)
        # Use KeyBERT to extract up to 40 single-word keywords
        keywords = kw_model.extract_keywords(text, top_n=40, keyphrase_ngram_range=(1, 1), stop_words="english")
        return [keyword[0] for keyword in keywords]
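    # extract_keywords returns (keyword, score) pairs ranked by similarity
    # between each candidate word and the document embedding; only the
    # keyword strings are kept here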
    # Apply the function to each row in the 'Matched_Keywords' column
    cv_data['KeyBERT_Extracted_Keywords'] = cv_data['Matched_Keywords'].apply(extract_keywords_with_keybert)
    # Drop rows whose 'Matched_Keywords' could not be parsed or yielded no keywords
    cv_data = cv_data[cv_data['KeyBERT_Extracted_Keywords'].apply(len) > 0]
    # Combine 'Matched_Keywords' and 'KeyBERT_Extracted_Keywords' into a
    # deduplicated final keyword list
    def create_final_keywords(row):
        matched_keywords_list = safe_list_conversion(row['Matched_Keywords'])
        keybert_extracted_keywords = row['KeyBERT_Extracted_Keywords']
        if matched_keywords_list is None:
            return keybert_extracted_keywords
        return list(set(matched_keywords_list + keybert_extracted_keywords))

    # Apply the function to each row in the DataFrame
    cv_data['Final_Keywords'] = cv_data.apply(create_final_keywords, axis=1)
    ## KeyBERT Keywords Extraction - Job Resumes Matched_Keywords
    # Extract keywords for each resume column and drop rows with empty results
    for i in range(1, 4):
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        col_name_keybert_extracted_keywords = f'Resume {i} KeyBERT_Extracted_Keywords'
        job_ad_data[col_name_keybert_extracted_keywords] = job_ad_data[col_name_matched_keywords].apply(extract_keywords_with_keybert)
        job_ad_data = job_ad_data[job_ad_data[col_name_keybert_extracted_keywords].apply(len) > 0]
    # Combine matched and KeyBERT-extracted keywords for the job resumes
    def create_final_keywords_job(matched_keywords_list, keybert_extracted_keywords):
        if matched_keywords_list is None:
            return keybert_extracted_keywords
        return list(set(matched_keywords_list + keybert_extracted_keywords))
    # Apply the function to each row in the DataFrame for Job Resumes
    for i in range(1, 4):
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        col_name_keybert_extracted_keywords = f'Resume {i} KeyBERT_Extracted_Keywords'
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        job_ad_data[col_name_final_keywords] = job_ad_data.apply(
            lambda row: create_final_keywords_job(
                safe_list_conversion(row[col_name_matched_keywords]),
                row[col_name_keybert_extracted_keywords]),
            axis=1)
    ### AI-based Synonym Generation - CV Final_Keywords
    # Load the pre-trained Word2Vec model (a large download, roughly 1.6 GB, on first use)
    w2v_model = api.load('word2vec-google-news-300')

    # Generate "synonyms" for a word using the Word2Vec model
    def generate_synonyms(word):
        try:
            # Take the five nearest neighbours in the embedding space, lowercased
            return [synonym.lower() for synonym, similarity in w2v_model.most_similar(word, topn=5)]
        except KeyError:
            # The word is not in the model's vocabulary
            return []

    # Generate synonyms for every word in a list
    def generate_synonyms_list(word_list):
        return [generate_synonyms(word) for word in word_list]
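    # Note: nearest neighbours are distributional, not strict synonyms; for a
    # hypothetical input like 'developer' the list may contain related terms
    # or inflected forms rather than true synonyms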
    # Apply the function to each row in the DataFrame
    cv_data['Synonyms'] = cv_data['Final_Keywords'].apply(generate_synonyms_list)

    # Flatten the per-word synonym lists, dedupe and drop empty strings
    def clean_and_get_final_synonyms(synonyms_list):
        final_synonyms = [synonym for sublist in synonyms_list for synonym in sublist]
        final_synonyms = list(set(final_synonyms))  # Remove duplicates
        final_synonyms = [synonym for synonym in final_synonyms if synonym]  # Remove empty strings
        return final_synonyms
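    # e.g. clean_and_get_final_synonyms([['fast', 'quick'], [], ['quick']])
    # -> ['fast', 'quick'] (order is not guaranteed after set())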
    # Apply the function to each row (the synonyms are already lowercased above)
    cv_data['Final_Synonyms'] = cv_data['Synonyms'].apply(clean_and_get_final_synonyms)
    ### AI-based Synonym Generation - Job Resumes Final_Keywords
    # Generate and clean synonyms for each resume column; the cleaning step has
    # to run inside the loop so every column (not just the last) is flattened
    for i in range(1, 4):
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        col_name_synonyms = f'Resume {i} Synonyms'
        job_ad_data[col_name_synonyms] = job_ad_data[col_name_final_keywords].apply(generate_synonyms_list)
        job_ad_data[col_name_synonyms] = job_ad_data[col_name_synonyms].apply(clean_and_get_final_synonyms)
    ## Final Keywords + Synonyms - CV Data
    # Combine 'Final_Keywords' and 'Final_Synonyms' into a unique 'Final_Keywords_Synonyms' column
    cv_data['Final_Keywords_Synonyms'] = cv_data.apply(
        lambda row: list(set(row['Final_Keywords'] + row['Final_Synonyms'])), axis=1)

    # Save the CV DataFrame to a CSV file (creating the output directory if needed)
    keyword_files_dir = os.path.join(project_path, 'KeywordFiles')
    os.makedirs(keyword_files_dir, exist_ok=True)
    final_cv_keywords_synonyms_path = os.path.join(keyword_files_dir, 'final_cv_keywords_synonyms.csv')
    cv_data.to_csv(final_cv_keywords_synonyms_path, index=False)
    ## Final Keywords + Synonyms - Job Resumes Data
    # Combine keywords and synonyms for each resume column
    for i in range(1, 4):
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        col_name_synonyms = f'Resume {i} Synonyms'
        col_name_combined = f'Resume {i} Final_Keywords_Synonyms'

        def combine_keywords_and_synonyms(row):
            if row[col_name_final_keywords] and row[col_name_synonyms]:
                return list(set(row[col_name_final_keywords] + row[col_name_synonyms]))
            elif row[col_name_final_keywords]:
                return row[col_name_final_keywords]
            elif row[col_name_synonyms]:
                return row[col_name_synonyms]
            else:
                return []

        job_ad_data[col_name_combined] = job_ad_data.apply(combine_keywords_and_synonyms, axis=1)
    # Save the updated job resumes dataset to a CSV file
    final_job_resumes_keywords_synonyms_path = os.path.join(keyword_files_dir, 'final_job_resumes_keywords_synonyms.csv')
    job_ad_data.to_csv(final_job_resumes_keywords_synonyms_path, index=False)

    print("Keywords and Synonyms saved to CSV files:")
    print(f"Final CV Keywords and Synonyms saved to {final_cv_keywords_synonyms_path}")
    print(f"Final Job Resumes Keywords and Synonyms saved to {final_job_resumes_keywords_synonyms_path}")