# Resume_Parser.py
# Extracts an applicant's name, skills and education from an uploaded .docx CV,
# scores the skills against a required-skills list, and writes results to text files.
# Third-party NLP / resume toolchain used throughout this script.
import os

import docx2txt
import nltk
import spacy
from pyresparser import ResumeParser
from resume_parser import resumeparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy's small English model once at startup (fails fast if not installed).
spacy.load('en_core_web_sm')

# One-time NLTK data downloads; uncomment on a fresh machine.
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

# Result files, opened once in write mode. The original code opened several of
# these twice (and once as "Tweet.txt"), leaking the first handle each time;
# each file is now opened exactly once.
Skills = open("Skills.txt", "w")
Degree = open("Degree.txt", "w")
F_score = open("Final_Score.txt", "w")
Experience = open("Experience.txt", "w")
Skill_point = open("skl_point.txt", "w")


def deleteContent(pfile):
    """Erase the contents of an already-open writable file object."""
    pfile.seek(0)
    pfile.truncate()


# Start every run with empty result files.
deleteContent(Skills)
deleteContent(Degree)
deleteContent(Experience)
deleteContent(Skill_point)
deleteContent(F_score)
def extract_text_from_docx(docx_path):
    """Return the plain text of a .docx file with tabs replaced by spaces.

    Returns None when the document yields no text.
    """
    raw = docx2txt.process(docx_path)
    if not raw:
        return None
    return raw.replace('\t', ' ')
#Extract name
def extract_names(txt):
    """Collect PERSON named entities from *txt* via NLTK's NE chunker.

    Returns a list of full names (leaves of each PERSON subtree joined
    with spaces), in document order.
    """
    names = []
    for sentence in nltk.sent_tokenize(txt):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for subtree in nltk.ne_chunk(tagged):
            if hasattr(subtree, 'label') and subtree.label() == 'PERSON':
                names.append(' '.join(leaf[0] for leaf in subtree.leaves()))
    return names
if __name__ == '__main__':
    # Extract the applicant's name and append it to the final score file.
    text = extract_text_from_docx('C:/Users/User/Desktop/New folder (12)/Applicant_Ranker/UploadedCV/Mahela.docx')
    names = extract_names(text)
    if names:
        with open('Final_Score.txt', 'a') as the_file:
            # First PERSON entity found is assumed to be the applicant's name.
            Name = names[0]
            the_file.write('Name ' + Name + '\n')
        print('Name : ' + Name)
    else:
        # Bug fix: this branch previously printed 'Degree not found !!!',
        # which belongs to the education section, not the name section.
        print('Name not found !!!')
#SKILLS EXTRACTION
# Known skills to match against. Every entry MUST be lowercase because
# extract_skills() compares token.lower() / ngram.lower() against this list;
# the original 'English' entry could therefore never match and is fixed here.
SKILLS_DB = [
    'machine learning', 'angular',
    'data science', 'asp',
    'python', 'ruby', 'c',
    'java', 'swift',
    'mysql', 'php',
    'english', 'objective c'
]
def extract_skills(input_text):
    """Return the set of unigrams, bigrams and trigrams from *input_text*
    whose lowercase form appears in SKILLS_DB.

    Stop words and non-alphabetic tokens are removed before n-grams are built.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(input_text)
    # Bug fix: the original built the second comprehension from word_tokens,
    # silently discarding the stop-word filtering done on the previous line.
    filtered_tokens = [w for w in word_tokens
                       if w.isalpha() and w not in stop_words]
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 2, 3)))
    found_skills = set()
    for token in filtered_tokens:
        if token.lower() in SKILLS_DB:
            found_skills.add(token)
    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB:
            found_skills.add(ngram)
    return found_skills
if __name__ == '__main__':
    # Extract the applicant's skills, write them out, and score them against
    # the required skills via cosine similarity of term-count vectors.
    text = extract_text_from_docx('C:/Users/User/Desktop/New folder (12)/Applicant_Ranker/UploadedCV/Mahela.docx')
    skills = extract_skills(text)
    if skills:
        # Bug fix: use a distinct local name — the original rebound the
        # module-level 'Skills' file handle to a string here.
        skills_text = "\n".join(skills)
        with open('Skills.txt', 'w') as the_file:
            the_file.write(skills_text + '\n')
        print('Skills : \n' + skills_text + '\n')
        # Bug fix: the required-skills file was opened without being closed.
        with open("Programming_Languages.txt") as req_file:
            required = req_file.read()
        sk = CountVectorizer()
        pos_count_matrix = sk.fit_transform([skills_text, required])
        # Off-diagonal entry = similarity between CV skills and requirements,
        # expressed as a percentage with two decimals.
        Skl_match = round(cosine_similarity(pos_count_matrix)[0][1] * 100, 2)
        with open('Final_Score.txt', 'a') as the_file:
            the_file.write('Skills ' + str(Skl_match) + '\n')
        with open('skl_point.txt', 'a') as the_file:
            the_file.write(str(Skl_match))
    #print(skills)
#EDUCATION / DEGREE EXTRACTION
# Substrings identifying educational institutions. Every entry MUST be
# lowercase because extract_education() searches org.lower() for each word;
# the original 'Schola', 'BSc', 'Bachelor Degree' and 'Degree' entries had
# uppercase letters and could therefore never match — fixed here.
RESERVED_WORDS = [
    'school',
    'college',
    'univers',
    'academy',
    'faculty',
    'institute',
    'faculdades',
    'schola',
    'schule',
    'lise',
    'lyceum',
    'lycee',
    'polytechnic',
    'kolej',
    'ünivers',
    'okul',
    'bsc',
    'bachelor degree',
    'degree'
]
def extract_education(input_text):
    """Return the set of ORGANIZATION entities in *input_text* whose name
    contains any of the education keywords in RESERVED_WORDS
    (case-insensitive substring match).
    """
    organizations = []
    for sentence in nltk.sent_tokenize(input_text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for subtree in nltk.ne_chunk(tagged):
            if hasattr(subtree, 'label') and subtree.label() == 'ORGANIZATION':
                organizations.append(' '.join(leaf[0] for leaf in subtree.leaves()))
    return {org for org in organizations
            if any(org.lower().find(word) >= 0 for word in RESERVED_WORDS)}
if __name__ == '__main__':
    # Extract education/degree information and write it to Degree.txt.
    text = extract_text_from_docx('C:/Users/User/Desktop/New folder (12)/Applicant_Ranker/UploadedCV/Mahela.docx')
    education_information = extract_education(text)
    if education_information:
        # Bug fix: use a distinct local name — the original rebound the
        # module-level 'Degree' file handle to a string here.
        degree_text = "\n".join(education_information)
        with open('Degree.txt', 'w') as the_file:
            the_file.write("Degree " + degree_text + '\n')
        print('Degree / Education : \n' + degree_text + '\n')
    else:
        print('Degree not found !!!')
    #print(education_information)
#EXPERIENCE EXTRACTION
# Unfinished draft, deliberately disabled (the whole block is a discarded
# string literal, so it has no runtime effect). It cannot work as written:
# NLTK's ne_chunk never emits an 'EXPERIENCE' label, so the chunk filter
# below can never match, and the pyresparser fallback was left incomplete.
# The garbled merged lines from the original paste have been reconstructed.
"""
EXP_WORDS = [
    'experience',
    'engineer'
]

def extract_experience(input_text):
    experience = []
    for sent in nltk.sent_tokenize(input_text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label') and chunk.label() == 'EXPERIENCE':
                experience.append(' '.join(c[0] for c in chunk.leaves()))
    return experience

if __name__ == '__main__':
    text = extract_text_from_docx('./Sample_CV2.docx')
    experience_information = extract_experience(text)
    print(experience_information)

    # pyresparser fallback (also unfinished):
    data = ResumeParser("Sample_CV.docx").get_extracted_data()
    skills_list = data['skills']
    print("Skills : ")
    with open('Skills.txt', 'a') as the_file:
        the_file.write(str(skills_list) + '\n')
    print(str(skills_list))
"""