#!flask/bin/python
from flask import Flask, jsonify, abort, make_response, request
from flask_cors import CORS
import concurrent.futures
import collections
import json
import os
import re
from datetime import datetime
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus.reader import lin, toolbox
from nltk.corpus.reader.wordnet import WordNetError
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from printer import Printer
import time
import shutil
from gensim.models import Word2Vec
from gensim import models
import pickle
import slate3k as slate
import base64
import pdfplumber
#====================== RANKER STARTS ==============================
TOP_WORD_LIMIT_FOR_API = 1000
basePath = "E:/FINAL SEMESTER/Research/FINAL/PROJECT"
modelPath = basePath + "/model/w2v-model-V_300-MC_1-W_8-E_25.model"
# modelPath = basePath + "/model/GoogleNews-vectors-negative300.bin"
tokensPath = basePath + "/histogram/tokens/all-words-bin-without-pos"
personality_score = "E:/FINAL SEMESTER/Research/FINAL/PROJECT/rajeev/Applicant_ranker-Copy"
model = Word2Vec.load(modelPath)
# model = models.KeyedVectors.load_word2vec_format(modelPath, binary=True)
# final_score = 0
# personality_scores_file = final_score(personality_score)
print("Model loaded.")
modelTokens = []
with open(tokensPath, 'rb') as pickle_file:
    modelTokens = set(pickle.load(pickle_file))
print("Model Tokens loaded.", len(modelTokens))
lemmatizer = WordNetLemmatizer()
linePrinter = Printer()
cachedStopWords = stopwords.words("english")
listToken = []
MIN_CHAR_IN_WORD = 2
uniqueIdentifier = "uniqueidentifierusedtoidentify"
exceptions = { "c++": "cplusplus", "c#": "csharp", ".net": "dotnet" }
def get_wordnet_pos(treebank_tag):
    # Map Penn Treebank tags to WordNet POS tags for the lemmatizer.
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v'
    elif treebank_tag.startswith('N') and treebank_tag.endswith('S'):
        return 'n'
    elif treebank_tag.startswith('R'):
        return 'r'
    else:
        # All other tags map to 'x', which is not a valid WordNet POS;
        # the caller catches the resulting error and keeps the raw token.
        return 'x'
def preprocessSentence(sentence):
    sentence = str(sentence).lower()
    sentence = sentence.replace('{html}', "")
    for key, value in exceptions.items():
        sentence = re.sub(re.escape(key), value + uniqueIdentifier, sentence)
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url = re.sub(r'http\S+', '', cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    mergedData = " ".join(tokens)
    for key, value in exceptions.items():
        mergedData = re.sub(value + uniqueIdentifier, key, mergedData)
    return mergedData
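# A rough illustration of what preprocessSentence is expected to produce
# (the sample text is a placeholder, not taken from the original project):
#   preprocessSentence("Senior C++ developer, 5 years of <b>experience</b>")
#   -> "senior c++ developer years of experience"
# The placeholder round-trip through `exceptions` keeps "c++", "c#" and ".net"
# intact even though the RegexpTokenizer would otherwise strip the punctuation.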
def getNMostFromDict(wordCounts, N):
    # Keep only words known to the word2vec vocabulary, ranked by frequency.
    data = []
    for word, count in wordCounts.items():
        if word in modelTokens:
            data.append((count, word))
    data.sort(key=lambda x: -x[0])
    return [word for count, word in data[:N]]
def runCodeForLine(line, preprocessedWords):
    err = 0
    sentencesInLine = sent_tokenize(line)
    for sentence in sentencesInLine:
        # print(sentence)
        sentence = sentence.lower()
        sentence = preprocessSentence(sentence)
        words = word_tokenize(sentence)
        posTagWords = pos_tag(words)
        wordIndex = 0
        # Visit every token in the sentence.
        while wordIndex < len(words):
            token = words[wordIndex].lower()
            if token and len(token) >= MIN_CHAR_IN_WORD:
                if token not in cachedStopWords:
                    try:
                        nltk_pos_tag = get_wordnet_pos(posTagWords[wordIndex][1])
                        preprocessedWords[lemmatizer.lemmatize(token, pos=nltk_pos_tag)] += 1
                    except Exception:
                        # Unknown POS ('x') or lemmatizer failure: count the raw token.
                        preprocessedWords[token] += 1
                        err += 1
            wordIndex += 1
    return err
def calculateScore(topWordsInResume, topWordsInRequirements):
    # For every requirement word, take its best similarity against the resume
    # words, then average those maxima and scale the result to 0-100.
    if not topWordsInRequirements:
        return 0
    score = 0
    for requirement in topWordsInRequirements:
        topScore = 0
        for word in topWordsInResume:
            if requirement in model and word in model:
                topScore = max(topScore, model.similarity(requirement, word))
        score += topScore
    # Cast to a built-in float so the result stays JSON serializable.
    return float(score) / len(topWordsInRequirements) * 100
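# Worked example of the scoring above (similarity values are made up for
# illustration): with two requirement words whose best matches in the resume
# score 0.8 and 0.6, the result is (0.8 + 0.6) / 2 * 100 = 70.0.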
def doWorker(qualifications):
    # Build a word -> frequency map of lemmatized, stop-word-filtered tokens.
    preprocessedWords = collections.defaultdict(int)
    runCodeForLine(qualifications, preprocessedWords)
    return preprocessedWords
def calculatScore(topWordsInResume, topWordsInRequirements, personality_skills_score):
    # Variant of calculateScore that also folds a personality score into the result.
    score = 0
    for requirement in topWordsInRequirements:
        for word in topWordsInResume:
            if requirement in model and word in model:
                score += model.similarity(requirement, word)
    total = score + personality_skills_score
    return total / (TOP_WORD_LIMIT_FOR_API * TOP_WORD_LIMIT_FOR_API) * 100
#====================== RANKER ENDS ================================
app = Flask(__name__)
CORS(app)
@app.route('/status', methods=['GET'])
def get_status():
    return jsonify({'status': 'running'})
@app.errorhandler(404)
def not_found(error):
    return make_response(jsonify({'error': 'Not found'}), 404)
@app.route('/rank', methods=['POST'])
def create_task():
    data = request.get_json()
    requirementsContent = ""
    if data and data.get("requirments") and data["requirments"].get("data"):
        requirementsContent = data["requirments"]["data"]
    requirements = doWorker(requirementsContent)
    top20WordsInRequirements = getNMostFromDict(requirements, TOP_WORD_LIMIT_FOR_API)
    resumeRanks = []
    if data and data.get("resumes"):
        for resume in data["resumes"]:
            if resume:
                resumeName = resume["name"]
                resumeData = resume["data"]
                preprocessedWords = doWorker(resumeData)
                top20WordsInResume = getNMostFromDict(preprocessedWords, TOP_WORD_LIMIT_FOR_API)
                score = calculateScore(top20WordsInResume, top20WordsInRequirements)
                resumeRanks.append((score, resumeName))
    response = {}
    if resumeRanks:
        resumeRanks.sort(key=lambda x: -x[0])
        currRank = 1
        for index in range(len(resumeRanks)):
            rank = resumeRanks[index]
            # Resumes with equal scores share the same rank.
            if index > 0 and rank[0] != resumeRanks[index - 1][0]:
                currRank += 1
            response[rank[1]] = {
                "resumeName": rank[1],
                "rank": currRank,
                "score": round(rank[0], 4),
            }
    return jsonify(response)
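# Example request body for POST /rank (field names follow the checks above;
# the text values are placeholders):
# {
#   "requirments": { "data": "Looking for a Python developer with Flask experience" },
#   "resumes": [
#     { "name": "alice.pdf", "data": "Python and Flask developer ..." },
#     { "name": "bob.pdf", "data": "Java backend engineer ..." }
#   ]
# }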
@app.route('/resume-submission', methods=['POST'])
def upload_resumes():
    try:
        if 'file' not in request.files:
            print('No file part')
            return make_response(jsonify({'error': 'No resume file found'}), 400)
        if 'requirements' not in request.files:
            print('No requirements')
            return make_response(jsonify({'error': 'No requirements file found'}), 400)
        requirementsContent = request.files['requirements'].read().decode("utf-8")
        # print(requirementsContent)
        requirements = doWorker(requirementsContent)
        top20WordsInRequirements = getNMostFromDict(requirements, TOP_WORD_LIMIT_FOR_API)
        resumeRanks = []
        uploaded_files = request.files.getlist("file")
        for uploadedFile in uploaded_files:
            # print(uploadedFile)
            with pdfplumber.open(uploadedFile) as pdf:
                first_page = pdf.pages[0]
                resumeData = first_page.extract_text()
            preprocessedWords = doWorker(resumeData)
            # resumeData = slate.PDF(uploadedFile)
            resumeName = uploadedFile.filename
            # preprocessedWords = doWorker(resumeData[0])
            top20WordsInResume = getNMostFromDict(preprocessedWords, TOP_WORD_LIMIT_FOR_API)
            score = calculateScore(top20WordsInResume, top20WordsInRequirements)
            resumeRanks.append((score, resumeName))
        response = []
        if resumeRanks:
            resumeRanks.sort(key=lambda x: -x[0])
            currRank = 0
            for index in range(len(resumeRanks)):
                rank = resumeRanks[index]
                # if index > 0 and rank[0] != resumeRanks[index-1][0]:
                currRank += 1
                response.append({
                    "resumeName": rank[1],
                    "rank": currRank,
                    "score": round(rank[0], 4),
                })
        return jsonify(response)
    except Exception:
        return make_response(jsonify({'error': 'An error occurred'}), 500)
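# A curl sketch for POST /resume-submission (file names are placeholders; the
# form field names "file" and "requirements" match request.files above, and
# port 5000 is Flask's default for app.run):
#   curl -X POST http://localhost:5000/resume-submission \
#        -F "requirements=@job-description.txt" \
#        -F "file=@alice.pdf" -F "file=@bob.pdf"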
if __name__ == '__main__':
    app.run(host='localhost')
# For URL query parameters, use request.args.
# search = request.args.get("search")
# page = request.args.get("page")
# For posted form input, use request.form.
# email = request.form.get('email')
# password = request.form.get('password')
# For JSON posted with content type application/json, use request.get_json().
# data = request.get_json()