Commit bdf13466 authored by Birahavi Kugathasan

Resume analyzer

parent 9f96db15
__pycache__
filename,name,mobile_number,email,company_names,college_name,experience,skills,experience_age,degree,words,primary_score,primary_match,secondary_score,secondary_match,no_of_pages,document_similarity,document_score,Score
resumes/Dhaval_Thakkar_Resume.pdf,Dhaval Thakkar,9191729595,thakkar.dhaval.haresh@gmail.com,['UNIFYND TECHNOLOGIES PVT. LTD'],None,"['UNIFYND TECHNOLOGIES PVT. LTD. | Data Scientist', 'Mumbai, MH, India | June 2018 – Present', '• Led the development of a Templatized OCR Engine with GUI to onboard 2000+ retailers from different malls. The', 'microservice deployed is currently operating at an accuracy of 81%', '• Built a Customer Segmentation model to target customers with relevant coupons, rewards, and content resulting', 'in a 3x increase in revenue and 2x increase in coupon utilization', '• Built a Dynamic Coupon Pricing Engine for malls that led to a 5x increase in coupon consumption on the coupon', 'marketplace', '• Built a Pricing Engine and Customer Segmentation Model for a logistics company which saw a 32% reduction in', 'Customer Attrition and a 12% increase in Repeat Purchase Rate', '• Developed an Automated End to End Reporting system to track KPIs performance for 10 malls that saves 60', 'hours of manual labour each month', 'UNIFYND TECHNOLOGIES PVT. LTD. | Intern Data Scientist Mumbai, MH, India | Sept 2017 - June 2018', '• Built a Smart Cryptocurrency trading platform which used social data and historical prices to optimize current', 'portfolio. Boosted overall profit from the portfolio by 30%', '• Worked with Product and Marketing teams to identify the power users of an app which resulted in 43% increase in', 'activity and a 65% increase in revenue from these users', 'ZIFF, INC | Deep Learning Intern', 'Provo, UT, USA | May 2017 – Aug 2017', '• Demonstrated competency in Hyperparameter Optimization, Image Augmentation and Learning Rate decay', 'strategies using the Keras Library', '• Deployed a Multi-Class Image classifier microservice written on Flask as a container on AWS EC2 using Docker']","['System', 'Github', 'Numpy', 'Mysql', 'Css', 'Data analytics', 'R', 'Segmentation', 'Logistics', 'Scrapy', 'Content', 'Keras', 'Engineering', 'Security', 'Machine learning', 'Docker', 'Testing', 'Reporting', 'Aws', 'Analytics', 'Anaconda', 'Sql', 'Html', 'Algorithms', 'Operating systems', 'Marketing', 'Flask', 'Kpis', 'Pandas', 'Python', 'Networking']",5.5,['Bachelor of Engineering'],350,44,"['ocr', 'aws', 'python', 'gcp']",42,"['data', 'ocr', 'science']",1,26,50.0,162.0
resumes/Santhosh_Narayanan.pdf,SANTHOSH NARAYANAN,417-6755,santhosn@usc.edu,None,None,"['on an EC2 server supported by S3 and RDS.', '\uf0a7 Maintained AWS infrastructure for institute’s annual technical festival website, by hosting the website', 'on an EC2 Ubuntu server.', 'K J Somaiya Inst. of Engg. & I.T – Penetration tester', 'December 2016 – January 2016', '\uf0a7 Conducted penetration testing for institute’s online admission and examination portal.', '\uf0a7 Performed authentication checks, access control checks, per screen checks (XSS, SQL injection.).', '\uf0a7 Delivered error free application, incorporating patches for the respective bugs using ASP.NET']","['Java', 'Computer science', 'System', 'Spyder', 'Numpy', 'Scheduling', 'Mysql', 'Css', 'Scrapy', 'Oracle', 'Certification', 'Schedule', 'Keras', 'Ubuntu', 'Engineering', 'Website', 'Php', 'Security', 'Testing', 'Aws', 'Access', 'Sql', 'Html', 'Wordpress', 'Technical', 'Javascript', 'Jupyter', 'Purchasing', 'Flask', 'Programming', 'Pandas', 'Matplotlib', 'Python', 'Lan']",,None,367,22,"['python', 'aws']",14,['science'],1,5,50.0,91.0
About the job
Borneo.io is building the next-generation ML-powered data privacy platform for hyper-growth companies. The Data Scientist role is at the core of Borneo's engineering. You will be building models, manipulating big data, and working with APIs essential to the Borneo product.
We are growing fast and expanding our data science family with outstanding minds and diverse personalities.
As a Data Scientist at Borneo, you’ll have the opportunity to:
Work with some of the largest data sets used by some of the leading global technology companies
Help build a predictive product and inform features at the ground level.
Lead the way in leveraging unstructured data for predictive modeling and anomaly detection, and drive privacy compliance.
Responsibilities:
Identify and automate data collection processes
Dive into complex data sets to analyze trends and identify opportunities for improvement.
Build predictive models and machine-learning algorithms
Present information using data visualization techniques
Propose solutions and strategies to business challenges
Take a data-driven approach to decision making
Requirements:
5-8 years of relevant experience, B2B startup experience preferred
Proven experience as a Data Scientist or Data Analyst
Experience in building ML models and deploying them to production
A solid understanding of data science fundamentals, statistical techniques, NLP algorithms
Ability to understand research papers and create quick proofs of concept relevant to the product
Expert at implementing quick prototypes that show business value
Experience with programming languages such as Node.js/Python/JavaScript and cloud technologies such as AWS/GCP/K8s
import pandas as pd
import os
from scripts.processing import document_processing
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
# Skill lists matched against each resume's text. Note that matching is
# unigram-based (see find_match in scripts/processing.py), so multi-word
# entries such as 'Machine Learning' will not match single-word tokens.
skills = {
    "primary": ['Python', 'Machine Learning', 'node.js', 'AWS', 'Kubernetes',
                'NLP', 'GCP', 'predictive', 'OCR'],
    "secondary": ['data', 'science', 'modeling', 'anomaly', 'privacy',
                  'visualization', 'OCR'],
}
def document_score(df):
    # Page score: 1 page is ideal, 2 pages acceptable, anything else penalised
    df.loc[df['no_of_pages'] == 1, ['page_score']] = 100
    df.loc[df['no_of_pages'] == 2, ['page_score']] = 60
    df.loc[(df['no_of_pages'] > 2) |
           (df['no_of_pages'] == 0), ['page_score']] = 30

    # Word score: 200-400 words is ideal, 400-600 acceptable, anything else penalised
    df.loc[(df['words'] >= 200) & (df['words'] < 400),
           ['word_score']] = 100
    df.loc[(df['words'] >= 400) & (df['words'] < 600),
           ['word_score']] = 70
    df.loc[((df['words'] > 0) & (df['words'] < 200)) |
           (df['words'] >= 600) | (df['words'].isnull()),
           ['word_score']] = 40

    # Average the two scores and scale to a 0-50 range
    df['document_score'] = (df['page_score'] + df['word_score']) * 0.25
    df.drop(['word_score', 'page_score'], axis=1, inplace=True)
    return df
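
# Example (values from Candidates_score.csv above): a one-page resume with
# 350 words falls in the ideal bands, so page_score = 100, word_score = 100,
# and document_score = (100 + 100) * 0.25 = 50.0.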
if __name__ == '__main__':
    resume_dir = 'resumes/'
    jd_file = 'Job_description.txt'

    list_of_resumes = os.listdir(resume_dir)

    df = pd.DataFrame()
    for file in tqdm(list_of_resumes):
        result = document_processing(os.path.join(resume_dir, file), skills, jd_file)
        candidate = result.skills_match()
        df = pd.concat([df, candidate], ignore_index=True)

    df = document_score(df)

    # Final score (max 330): primary (100) + secondary (100)
    # + document quality (50) + JD similarity (80)
    df['Score'] = (df['primary_score'] + df['secondary_score']
                   + df['document_score'] + df['document_similarity'])
    df = df.sort_values('Score', ascending=False)
    df = df.reset_index(drop=True)

    print(df)
    df.to_csv('Candidates_score.csv', index=False)
tqdm==4.61.2
pdf2image==1.16.0
pandas==1.3.0
pyresparser==1.0.6
pytesseract==0.3.8
texthero==1.1.0
numpy==1.21.0
requests==2.25.1
cleantext==1.1.3
Pillow==8.3.1
scikit_learn==0.24.2
import pandas as pd
import numpy as np
import pytesseract
import os
import glob
import texthero as hero
from pyresparser import ResumeParser
from pyresparser.utils import extract_text
from PIL import Image
from pdf2image import convert_from_path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class document_processing:

    def __init__(self, resume, skills, job_desc):
        # Read the job description text from the path passed in
        with open(job_desc, 'r', encoding='utf-8', errors='ignore') as file:
            self.job_desc = file.read()

        self.resume = resume
        self.skills = skills
    def extract_resume(self):
        filepath = self.resume
        extension = '.' + filepath.split('.')[-1]

        # Structured fields (name, email, skills, ...) plus the raw text
        resume_ner = ResumeParser(filepath).get_extracted_data()
        resume_txt = extract_text(filepath, extension=extension)

        return resume_ner, resume_txt
    def ocr_text(self):
        filepath = self.resume

        # Clear out images left over from any previous run
        os.makedirs('temp', exist_ok=True)
        files = glob.glob('temp/*')
        for f in files:
            os.remove(f)

        # Store all the pages of the PDF in a variable
        pages = convert_from_path(filepath, 500)

        # Counter to name the image of each PDF page
        image_counter = 1

        # Iterate through all the pages stored above
        for page in pages:
            # PDF page n -> page_n.jpg
            filename = "page_" + str(image_counter) + ".jpg"

            # Save the image of the page to disk
            page.save('temp/' + filename, 'JPEG')

            # Increment the counter to update the filename
            image_counter = image_counter + 1

        ########## OCR ##########
        # Total number of pages saved above
        filelimit = image_counter - 1
        text_op = ''
        count = 0

        # Iterate from 1 to total number of pages
        for i in range(1, filelimit + 1):
            filename = "temp/page_" + str(i) + ".jpg"

            # Recognize the text in the image as a string using pytesseract
            text = str(pytesseract.image_to_string(Image.open(filename)))
            text = text.replace('-\n', '')

            # Append the processed text
            text_op += text
            count += 1

        # Finally, write the processed text to a file
        with open('out_text.txt', 'w') as f:
            f.write(text_op)

        return text_op, count
    def find_unigram(self, df, column):
        # Split a text column into one lower-cased word per row
        unigrams = (df[column].str.lower()
                              .str.replace(r'[^a-z\s]', '', regex=True)
                              .str.split(expand=True)
                              .stack()).reset_index(drop=True)
        unigrams = hero.clean(unigrams)
        un_df = pd.DataFrame(unigrams, columns=['text'])
        return un_df
    def find_match(self, source, match):
        # Remove the null values and clean the skill list
        match = match.dropna().reset_index(drop=True)
        match.columns = ['text']
        match['text'] = hero.clean(match['text'])

        # Total number of skills we are looking for
        max_val = len(match)

        # Find the skills that appear in the resume
        df = pd.merge(source, match, on='text')
        df = df.drop_duplicates().reset_index(drop=True)

        # Score is the percentage of the skill list found in the resume
        match_skills = len(df)
        if match_skills == 0:
            lst_skills = []
            score = 0
        else:
            lst_skills = df['text'].tolist()
            score = int((match_skills / max_val) * 100)

        return score, lst_skills
    def fill_data(self, source, target, column):
        # Copy a single parsed field into row 0 of the details dataframe
        source.loc[0, column] = str(target[column])
        return source
    def resume_cosine_score(self, text):
        # Clean the job description the same way as the resume text
        jd_txt = pd.Series(self.job_desc)
        jd_txt = hero.clean(jd_txt)[0]

        # Cosine similarity between the two bag-of-words vectors,
        # scaled to a maximum of 80
        text_list = [text, jd_txt]
        cv = CountVectorizer()
        count_matrix = cv.fit_transform(text_list)
        match_percentage = cosine_similarity(count_matrix)[0][1] * 80
        match_percentage = round(match_percentage, 2)

        return match_percentage
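
    # Example: a raw cosine similarity of 0.325 between the resume and the job
    # description yields round(0.325 * 80, 2) = 26.0 -- the document_similarity
    # reported for the first resume in Candidates_score.csv above.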
    def skills_match(self):
        skills = self.skills

        # Load data from pyresparser
        pyres_data, pyres_text = self.extract_resume()
        self.data = pyres_data
        self.text = pyres_text

        ocr_ser = pd.Series(pyres_text)
        cleaned_words = hero.clean(ocr_ser)

        # Main dataframe for manipulation: one cleaned word per row
        main_df = pd.DataFrame(cleaned_words[0].split(), columns=['text'])
        self.clean_data = main_df
        words = len(main_df)

        columns = ['filename', 'name', 'mobile_number', 'email', 'company_names',
                   'college_name', 'experience', 'skills', 'experience_age',
                   'degree', 'words',
                   'primary_score', 'primary_match',
                   'secondary_score', 'secondary_match',
                   'no_of_pages', 'document_similarity']
        details = pd.DataFrame(columns=columns)

        # Primary and secondary skill match scores
        pri_score, pri_match = self.find_match(main_df, pd.DataFrame(skills['primary']))
        sec_score, sec_match = self.find_match(main_df, pd.DataFrame(skills['secondary']))

        # Document similarity score against the job description
        doc_sim = self.resume_cosine_score(cleaned_words[0])

        # Fill the details dataframe
        details.loc[0, 'filename'] = self.resume
        details = self.fill_data(details, pyres_data, 'name')
        details = self.fill_data(details, pyres_data, 'mobile_number')
        details = self.fill_data(details, pyres_data, 'email')
        details = self.fill_data(details, pyres_data, 'company_names')
        details = self.fill_data(details, pyres_data, 'college_name')
        details = self.fill_data(details, pyres_data, 'degree')
        details = self.fill_data(details, pyres_data, 'experience')
        details = self.fill_data(details, pyres_data, 'skills')
        details.loc[0, 'words'] = words

        if pyres_data['no_of_pages'] is None:
            details.loc[0, 'no_of_pages'] = 0
        else:
            details = self.fill_data(details, pyres_data, 'no_of_pages')

        details.loc[0, 'primary_score'] = pri_score
        details.loc[0, 'primary_match'] = str(pri_match)
        details.loc[0, 'secondary_score'] = sec_score
        details.loc[0, 'secondary_match'] = str(sec_match)
        details.loc[0, 'document_similarity'] = int(doc_sim)

        if pyres_data['total_experience'] > 0:
            details.loc[0, 'experience_age'] = pyres_data['total_experience']
        else:
            details.loc[0, 'experience_age'] = np.nan

        details['no_of_pages'] = details['no_of_pages'].astype(int)

        return details
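
# Minimal standalone usage sketch ('resumes/sample.pdf' is a placeholder
# filename, not part of this commit):
#
#   skills = {"primary": ['Python', 'AWS'], "secondary": ['data']}
#   doc = document_processing('resumes/sample.pdf', skills, 'Job_description.txt')
#   row = doc.skills_match()   # one-row DataFrame of parsed fields and scores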