2023-24-051 Resume_Ranker
Commit ea8cfd2b authored Mar 23, 2024 by Sewwandi W.M.C
Create function2.py
cleaning
parent fea03af9
Showing 1 changed file with 388 additions and 0 deletions
Function 01/Function 01/APIv3.0/app/function2.py (new file, 0 → 100644, +388 −0)
import os
from fastapi.responses import FileResponse
import gensim.downloader as api
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from keybert import KeyBERT
import pickle
import time
import ast
from openai import OpenAI
from dotenv import load_dotenv
import re

load_dotenv()

client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def process_function2_data(cv_data_path: str, job_data_path: str):
    # Use the current script file path to determine the project directory
    project_path = os.path.dirname(os.path.abspath(__file__))

    print("Keywords and Synonyms generation started...")

    # Load CV data
    cv_data = pd.read_csv(cv_data_path, encoding='latin1')

    # Load job ad data
    job_ad_data = pd.read_csv(job_data_path, encoding='latin1')
    job_ad_data.columns = job_ad_data.columns.str.strip()

    # Remove empty columns
    cv_data = cv_data.dropna(axis=1, how='all')
    job_ad_data = job_ad_data.dropna(axis=1, how='all')

    cv_data['Other_Data'] = cv_data['Other_Data'].str.lower()
    cv_other_data = cv_data['Other_Data']

    job_ad_data[['Resume 1', 'Resume 2', 'Resume 3']] = job_ad_data[['Resume 1', 'Resume 2', 'Resume 3']].applymap(
        lambda x: x.lower() if pd.notna(x) else x)

    # Tokenization and Lemmatization function
    def tokenize_and_lemmatize(text):
        # Check if the value is NaN
        if pd.isna(text):
            return []

        # Replace '/' with space and then tokenize
        text = text.replace('/', ' ')

        # Remove punctuation and symbols
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove numbers
        text = ''.join([char for char in text if not char.isdigit()])

        # Tokenize the text
        tokens = word_tokenize(text)

        # Lemmatize the tokens
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

        # Remove consecutive single-letter tokens if they occur more than three times
        cleaned_tokens = []
        count_consecutive_single_letter = 0
        for token in tokens:
            if len(token) == 1:
                count_consecutive_single_letter += 1
                if count_consecutive_single_letter <= 2:
                    cleaned_tokens.append(token)
            else:
                count_consecutive_single_letter = 0
                cleaned_tokens.append(token)

        return cleaned_tokens

    # Tokenize and lemmatize CV data
    cv_tokens = cv_other_data.apply(tokenize_and_lemmatize)

    # Tokenize and lemmatize CV data for each resume column
    for i in range(1, 4):
        col_name = f'Resume {i}'
        job_ad_data[f'{col_name} Tokens'] = job_ad_data[col_name].apply(tokenize_and_lemmatize)

    def generate_cleaned_response(user_prompt):
        # Introduce a delay to avoid RateLimitError
        delay = int(os.environ.get("DELAY"))
        time.sleep(delay)

        system_prompt_keywords_path = os.path.join(project_path, 'system_prompt_keywords.txt')

        # Load default prompt from a text file
        with open(system_prompt_keywords_path, "r") as file:
            system_prompt = file.read()

        # Limit the maximum token length to 10000
        max_token_length = 10000
        if len(user_prompt.split()) > max_token_length:
            user_prompt = ' '.join(user_prompt.split()[:max_token_length])

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Organize and assign the assistant's response to a variable
        assistant_response = completion.choices[0].message.content

        # Define a cleaning function using regex to remove '*' or '**' and backticks
        def clean_response(response):
            cleaned_response = re.sub(r'\*+', '', response)
            cleaned_response = re.sub(r'`', '', cleaned_response)
            bracket_text = re.findall(r'\[(.*?)\]', cleaned_response)
            first_bracket_text = bracket_text[0] if bracket_text else ""
            final_cleaned_response = f"[{first_bracket_text}]"
            return final_cleaned_response

        cleaned_assistant_response = clean_response(assistant_response)

        if cleaned_assistant_response == "[]":
            tokens = tokenize_and_lemmatize(assistant_response)
            bracketed_tokens = "[" + ', '.join([f"'{token}'" for token in tokens]) + "]"
            return bracketed_tokens
        else:
            return cleaned_assistant_response

    ## CV Keywords
    # Apply the function to each row in the 'cv_tokens' column
    cv_data['Matched_Keywords'] = cv_tokens.apply(lambda tokens: generate_cleaned_response(' '.join(tokens)))

    def generate_cleaned_response_job(user_prompt):
        # Introduce a delay to avoid RateLimitError
        delay = int(os.environ.get("DELAY"))
        time.sleep(delay)

        system_prompt_keywords_job_path = os.path.join(project_path, 'system_prompt_keywords_job.txt')

        # Load default prompt from a text file
        with open(system_prompt_keywords_job_path, "r") as file:
            system_prompt = file.read()

        # Limit the maximum token length to 10000
        max_token_length = 10000
        if len(user_prompt.split()) > max_token_length:
            user_prompt = ' '.join(user_prompt.split()[:max_token_length])

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Organize and assign the assistant's response to a variable
        assistant_response = completion.choices[0].message.content

        # Define a cleaning function using regex to remove '*' or '**' and backticks
        def clean_response(response):
            cleaned_response = re.sub(r'\*+', '', response)
            cleaned_response = re.sub(r'`', '', cleaned_response)
            bracket_text = re.findall(r'\[(.*?)\]', cleaned_response)
            first_bracket_text = bracket_text[0] if bracket_text else ""
            final_cleaned_response = f"[{first_bracket_text}]"
            return final_cleaned_response

        cleaned_assistant_response = clean_response(assistant_response)

        if cleaned_assistant_response == "[]":
            tokens = tokenize_and_lemmatize(assistant_response)
            bracketed_tokens = "[" + ', '.join([f"'{token}'" for token in tokens]) + "]"
            return bracketed_tokens
        else:
            return cleaned_assistant_response

    ## Job Resumes Keywords
    # Apply the function to each resume column
    for i in range(1, 4):
        # Introduce a delay to avoid RateLimitError
        delay = int(os.environ.get("DELAY"))
        time.sleep(delay)

        col_name_tokens = f'Resume {i} Tokens'
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        job_ad_data[col_name_matched_keywords] = job_ad_data[col_name_tokens].apply(
            lambda tokens: generate_cleaned_response_job(' '.join(tokens)))
        print(f'Processed Resume {i} Matched_Keywords')

    # Apply the function to each matched keywords column
    for i in range(1, 4):
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        col_name_meaningful_keywords = f'Resume {i} Meaningful_Keywords'
        job_ad_data[col_name_meaningful_keywords] = job_ad_data[col_name_matched_keywords]
        print(f'Processed Resume {i} Meaningful_Keywords')

    ## KeyBERT Keywords Extraction - CV Matched_Keywords
    # Function to safely convert string representation of list to list
    def safe_list_conversion(string):
        try:
            return ast.literal_eval(string)
        except (SyntaxError, ValueError):
            return None  # Return None instead of an empty list for easy removal

    # Create a KeyBERT model with adjusted parameters
    kw_model = KeyBERT(model='distilbert-base-nli-mean-tokens')

    # Function to extract keywords using KeyBERT
    def extract_keywords_with_keybert(matched_keywords):
        # Convert the string representation of the list to an actual list
        matched_keywords_list = safe_list_conversion(matched_keywords)

        # If matched_keywords_list is None, return an empty list
        if matched_keywords_list is None:
            return []

        # Join the matched keywords into a single string
        text = ' '.join(matched_keywords_list)

        # Use KeyBERT to extract keywords
        keywords = kw_model.extract_keywords(text, top_n=40, keyphrase_ngram_range=(1, 1), stop_words="english")
        extracted_keywords = [keyword[0] for keyword in keywords]

        return extracted_keywords

    # Apply the function to each row in the 'Matched_Keywords' column
    cv_data['KeyBERT_Extracted_Keywords'] = cv_data['Matched_Keywords'].apply(extract_keywords_with_keybert)

    # Remove rows where 'Matched_Keywords' could not be converted
    cv_data = cv_data[cv_data['KeyBERT_Extracted_Keywords'].apply(len) > 0]

    # Function to create final keywords combining 'Matched_Keywords' and 'KeyBERT_Extracted_Keywords'
    def create_final_keywords(row):
        matched_keywords_list = safe_list_conversion(row['Matched_Keywords'])
        keybert_extracted_keywords = row['KeyBERT_Extracted_Keywords']

        if matched_keywords_list is None:
            return keybert_extracted_keywords
        else:
            final_keywords = list(set(matched_keywords_list + keybert_extracted_keywords))
            return final_keywords

    # Apply the function to each row in the DataFrame
    cv_data['Final_Keywords'] = cv_data.apply(create_final_keywords, axis=1)

    ## KeyBERT Keywords Extraction - Job Resumes Matched_Keywords
    # Apply the function to each matched keywords column for Job Resumes
    for i in range(1, 4):
        col_name_matched_keywords = f'Resume {i} Matched_Keywords'
        col_name_keybert_extracted_keywords = f'Resume {i} KeyBERT_Extracted_Keywords'
        job_ad_data[col_name_keybert_extracted_keywords] = job_ad_data[col_name_matched_keywords].apply(
            extract_keywords_with_keybert)
        job_ad_data = job_ad_data[job_ad_data[col_name_keybert_extracted_keywords].apply(len) > 0]

    # Function to create final keywords combining 'Matched_Keywords' and 'KeyBERT_Extracted_Keywords'
    def create_final_keywords_job(matched_keywords, keybert_extracted_keywords):
        matched_keywords_list = matched_keywords
        keybert_extracted_keywords = keybert_extracted_keywords

        if matched_keywords_list is None:
            return keybert_extracted_keywords
        else:
            final_keywords = list(set(matched_keywords_list + keybert_extracted_keywords))
            return final_keywords

    # Apply the function to each row in the DataFrame for Job Resumes
    for i in range(1, 4):
        col_name_meaningful_keywords = f'Resume {i} Matched_Keywords'
        col_name_keybert_extracted_keywords = f'Resume {i} KeyBERT_Extracted_Keywords'
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        job_ad_data[col_name_final_keywords] = job_ad_data.apply(
            lambda row: create_final_keywords_job(
                safe_list_conversion(row[col_name_meaningful_keywords]),
                row[col_name_keybert_extracted_keywords]),
            axis=1)

    ### AI-based Synonym Generation - CV Final_Keywords
    # Load Word2Vec model
    w2v_model = api.load('word2vec-google-news-300')

    # Function to generate synonyms using Word2Vec model
    def generate_synonyms(word):
        try:
            # Get synonyms using the Word2Vec model and convert to lowercase
            synonyms = [synonym.lower() for synonym, similarity in w2v_model.most_similar(word, topn=5)]
            return synonyms
        except KeyError:
            # Handle the case where the word is not in the vocabulary
            return []

    # Function to generate synonyms for a list of words
    def generate_synonyms_list(word_list):
        synonyms_list = [generate_synonyms(word) for word in word_list]
        return synonyms_list

    # Apply the function to each row in the DataFrame
    cv_data['Synonyms'] = cv_data['Final_Keywords'].apply(generate_synonyms_list)

    # Function to clean and get the final list of synonyms
    def clean_and_get_final_synonyms(synonyms_list):
        final_synonyms = [synonym for sublist in synonyms_list for synonym in sublist]
        final_synonyms = list(set(final_synonyms))  # Remove duplicates
        final_synonyms = [synonym for synonym in final_synonyms if synonym]  # Remove empty strings
        return final_synonyms

    # Apply the function to each row in the DataFrame and convert to lowercase
    cv_data['Final_Synonyms'] = cv_data['Synonyms'].apply(clean_and_get_final_synonyms)

    ### AI-based Synonym Generation - Job Resumes Final_Keywords
    # Apply the function to each resume column for Job Resumes
    for i in range(1, 4):
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        col_name_synonyms = f'Resume {i} Synonyms'
        job_ad_data[col_name_synonyms] = job_ad_data[col_name_final_keywords].apply(generate_synonyms_list)

        # Function to clean and get the final list of synonyms
        def clean_and_get_final_synonyms(synonyms_list):
            final_synonyms = [synonym for sublist in synonyms_list for synonym in sublist]
            final_synonyms = list(set(final_synonyms))  # Remove duplicates
            final_synonyms = [synonym for synonym in final_synonyms if synonym]  # Remove empty strings
            return final_synonyms

        # Apply the function to each row in the DataFrame
        job_ad_data[col_name_synonyms] = job_ad_data[col_name_synonyms].apply(clean_and_get_final_synonyms)

    ## Final Keywords + Synonyms - CV Data
    # Combine 'Final_Keywords' and 'Final_Synonyms' into 'Final_Keywords_Synonyms' column with uniqueness
    cv_data['Final_Keywords_Synonyms'] = cv_data.apply(
        lambda row: list(set(row['Final_Keywords'] + row['Final_Synonyms'])), axis=1)

    # Save DataFrames to CSV files
    final_cv_keywords_synonyms_path = os.path.join(project_path, 'KeywordFiles', 'final_cv_keywords_synonyms.csv')
    cv_data.to_csv(final_cv_keywords_synonyms_path, index=False)

    ## Final Keywords + Synonyms - Job Resumes Data
    # Apply the function to each resume column for Job Resumes
    for i in range(1, 4):
        col_name_final_keywords = f'Resume {i} Final_Keywords'
        col_name_synonyms = f'Resume {i} Synonyms'
        col_name_combined = f'Resume {i} Final_Keywords_Synonyms'

        def combine_keywords_and_synonyms(row):
            if row[col_name_final_keywords] and row[col_name_synonyms]:
                return list(set(row[col_name_final_keywords] + row[col_name_synonyms]))
            elif row[col_name_final_keywords]:
                return row[col_name_final_keywords]
            elif row[col_name_synonyms]:
                return row[col_name_synonyms]
            else:
                return []

        job_ad_data[col_name_combined] = job_ad_data.apply(combine_keywords_and_synonyms, axis=1)

    # Save the updated dataset to a CSV file for Job Resumes
    final_job_resumes_keywords_synonyms_path = os.path.join(
        project_path, 'KeywordFiles', 'final_job_resumes_keywords_synonyms.csv')
    job_ad_data.to_csv(final_job_resumes_keywords_synonyms_path, index=False)

    print("Keywords and Synonyms saved to CSV files:")
    print(f"Final CV Keywords and Synonyms saved to {final_cv_keywords_synonyms_path}")
    print(f"Final Job Resumes Keywords and Synonyms saved to {final_job_resumes_keywords_synonyms_path}")
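For orientation, the sketch below shows one way this function might be driven. It is not part of this commit: the module import path, the CSV file names, and the DELAY value are assumptions for illustration; the function itself expects a CV dataset with an 'Other_Data' column and a job-ad dataset with 'Resume 1'..'Resume 3' columns, plus system_prompt_keywords.txt and system_prompt_keywords_job.txt next to the script and a KeywordFiles/ output directory.

# Hypothetical driver script (not in this commit) for app/function2.py
import os

from app.function2 import process_function2_data  # assumed import path, based on the file location

if __name__ == "__main__":
    # process_function2_data reads DELAY from the environment to throttle chat-completion calls.
    os.environ.setdefault("DELAY", "1")  # illustrative value

    # Illustrative input paths; the repository's real data files are not shown in this commit.
    process_function2_data(
        cv_data_path="data/cv_data.csv",
        job_data_path="data/job_ad_data.csv",
    )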