# Resume_Parser.py
# Extracts an applicant's name, skills and education from an uploaded .docx CV,
# scores the skills against a required-skills list, and writes results to text files.
# Third-party NLP / resume toolchain used throughout this script.
import os

import docx2txt
import nltk
import spacy
from pyresparser import ResumeParser
from resume_parser import resumeparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy's small English model once at startup (fails fast if not installed).
spacy.load('en_core_web_sm')

# One-time NLTK data downloads; uncomment on a fresh machine.
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

# Result files, opened once in write mode. The original code opened several of
# these twice (and once as "Tweet.txt"), leaking the first handle each time;
# each file is now opened exactly once.
Skills = open("Skills.txt", "w")
Degree = open("Degree.txt", "w")
F_score = open("Final_Score.txt", "w")
Experience = open("Experience.txt", "w")
Skill_point = open("skl_point.txt", "w")


def deleteContent(pfile):
    """Erase the contents of an already-open writable file object."""
    pfile.seek(0)
    pfile.truncate()


# Start every run with empty result files.
deleteContent(Skills)
deleteContent(Degree)
deleteContent(Experience)
deleteContent(Skill_point)
deleteContent(F_score)
def extract_text_from_docx(docx_path):
    """Return the plain text of a .docx file with tabs replaced by spaces.

    Returns None when the document yields no text.
    """
    raw = docx2txt.process(docx_path)
    if not raw:
        return None
    return raw.replace('\t', ' ')
#Extract name
def extract_names(txt):
    """Collect PERSON named entities from *txt* via NLTK's NE chunker.

    Returns a list of full names (leaves of each PERSON subtree joined
    with spaces), in document order.
    """
    names = []
    for sentence in nltk.sent_tokenize(txt):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for subtree in nltk.ne_chunk(tagged):
            if hasattr(subtree, 'label') and subtree.label() == 'PERSON':
                names.append(' '.join(leaf[0] for leaf in subtree.leaves()))
    return names
if __name__ == '__main__':
    # Extract the applicant's name and append it to the final score file.
    text = extract_text_from_docx('C:/Users/User/Desktop/New folder (12)/Applicant_Ranker/UploadedCV/Mahela.docx')
    names = extract_names(text)
    if names:
        with open('Final_Score.txt', 'a') as the_file:
            # First PERSON entity found is assumed to be the applicant's name.
            Name = names[0]
            the_file.write('Name ' + Name + '\n')
        print('Name : ' + Name)
    else:
        # Bug fix: this branch previously printed 'Degree not found !!!',
        # which belongs to the education section, not the name section.
        print('Name not found !!!')
#SKILLS EXTRACTION
# Known skills to match against. Every entry MUST be lowercase because
# extract_skills() compares token.lower() / ngram.lower() against this list;
# the original 'English' entry could therefore never match and is fixed here.
SKILLS_DB = [
    'machine learning', 'angular',
    'data science', 'asp',
    'python', 'ruby', 'c',
    'java', 'swift',
    'mysql', 'php',
    'english', 'objective c'
]
def extract_skills(input_text):
    """Return the set of unigrams, bigrams and trigrams from *input_text*
    whose lowercase form appears in SKILLS_DB.

    Stop words and non-alphabetic tokens are removed before n-grams are built.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(input_text)
    # Bug fix: the original built the second comprehension from word_tokens,
    # silently discarding the stop-word filtering done on the previous line.
    filtered_tokens = [w for w in word_tokens
                       if w.isalpha() and w not in stop_words]
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 2, 3)))
    found_skills = set()
    for token in filtered_tokens:
        if token.lower() in SKILLS_DB:
            found_skills.add(token)
    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB:
            found_skills.add(ngram)
    return found_skills
if __name__ == '__main__':
    # Extract the applicant's skills, write them out, and score them against
    # the required skills via cosine similarity of term-count vectors.
    text = extract_text_from_docx('C:/Users/User/Desktop/New folder (12)/Applicant_Ranker/UploadedCV/Mahela.docx')
    skills = extract_skills(text)
    if skills:
        # Bug fix: use a distinct local name — the original rebound the
        # module-level 'Skills' file handle to a string here.
        skills_text = "\n".join(skills)
        with open('Skills.txt', 'w') as the_file:
            the_file.write(skills_text + '\n')
        print('Skills : \n' + skills_text + '\n')
        # Bug fix: the required-skills file was opened without being closed.
        with open("Programming_Languages.txt") as req_file:
            required = req_file.read()
        sk = CountVectorizer()
        pos_count_matrix = sk.fit_transform([skills_text, required])
        # Off-diagonal entry = similarity between CV skills and requirements,
        # expressed as a percentage with two decimals.
        Skl_match = round(cosine_similarity(pos_count_matrix)[0][1] * 100, 2)
        with open('Final_Score.txt', 'a') as the_file:
            the_file.write('Skills ' + str(Skl_match) + '\n')
        with open('skl_point.txt', 'a') as the_file:
            the_file.write(str(Skl_match))
    #print(skills)
#EDUCATION / DEGREE EXTRACTION
# Substrings identifying educational institutions. Every entry MUST be
# lowercase because extract_education() searches org.lower() for each word;
# the original 'Schola', 'BSc', 'Bachelor Degree' and 'Degree' entries had
# uppercase letters and could therefore never match — fixed here.
RESERVED_WORDS = [
    'school',
    'college',
    'univers',
    'academy',
    'faculty',
    'institute',
    'faculdades',
    'schola',
    'schule',
    'lise',
    'lyceum',
    'lycee',
    'polytechnic',
    'kolej',
    'ünivers',
    'okul',
    'bsc',
    'bachelor degree',
    'degree'
]
def extract_education(input_text):
    """Return the set of ORGANIZATION entities in *input_text* whose name
    contains any of the education keywords in RESERVED_WORDS
    (case-insensitive substring match).
    """
    organizations = []
    for sentence in nltk.sent_tokenize(input_text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for subtree in nltk.ne_chunk(tagged):
            if hasattr(subtree, 'label') and subtree.label() == 'ORGANIZATION':
                organizations.append(' '.join(leaf[0] for leaf in subtree.leaves()))
    return {org for org in organizations
            if any(org.lower().find(word) >= 0 for word in RESERVED_WORDS)}
if __name__ == '__main__':
    # Extract education/degree information and write it to Degree.txt.
    text = extract_text_from_docx('C:/Users/User/Desktop/New folder (12)/Applicant_Ranker/UploadedCV/Mahela.docx')
    education_information = extract_education(text)
    if education_information:
        # Bug fix: use a distinct local name — the original rebound the
        # module-level 'Degree' file handle to a string here.
        degree_text = "\n".join(education_information)
        with open('Degree.txt', 'w') as the_file:
            the_file.write("Degree " + degree_text + '\n')
        print('Degree / Education : \n' + degree_text + '\n')
    else:
        print('Degree not found !!!')
    #print(education_information)
#EXPERIENCE EXTRACTION
# Unfinished draft, deliberately disabled (the whole block is a discarded
# string literal, so it has no runtime effect). It cannot work as written:
# NLTK's ne_chunk never emits an 'EXPERIENCE' label, so the chunk filter
# below can never match, and the pyresparser fallback was left incomplete.
# The garbled merged lines from the original paste have been reconstructed.
"""
EXP_WORDS = [
    'experience',
    'engineer'
]

def extract_experience(input_text):
    experience = []
    for sent in nltk.sent_tokenize(input_text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label') and chunk.label() == 'EXPERIENCE':
                experience.append(' '.join(c[0] for c in chunk.leaves()))
    return experience

if __name__ == '__main__':
    text = extract_text_from_docx('./Sample_CV2.docx')
    experience_information = extract_experience(text)
    print(experience_information)

    # pyresparser fallback (also unfinished):
    data = ResumeParser("Sample_CV.docx").get_extracted_data()
    skills_list = data['skills']
    print("Skills : ")
    with open('Skills.txt', 'a') as the_file:
        the_file.write(str(skills_list) + '\n')
    print(str(skills_list))
"""