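"""Resume-ranking script: cleans and lemmatizes a requirements file and a folder
of resumes, keeps each document's 20 most frequent in-vocabulary words, and scores
every resume by the summed Word2Vec similarity between its top words and the
requirements' top words."""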
import concurrent.futures
import collections
import json
import os
import re
from datetime import datetime
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus.reader import lin, toolbox
from nltk.corpus.reader.wordnet import WordNetError
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from printer import Printer
import time
import shutil
from gensim.models import Word2Vec
import pickle
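
# Machine-specific paths to the test data, trained Word2Vec model and pickled
# token vocabulary; adjust basePath when running on another machine.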
basePath = "E:/FINAL SEMESTER/Research/FINAL/PROJECT"
resumesPath = basePath + "/test-data/resume"
requirmentPath = basePath + "/test-data/requirements.txt"
modelPath = basePath + "/model/w2v-model-V_300-MC_1-W_8-E_25.model"
tokensPath = basePath + "/histogram/tokens/all-words-bin-without-pos"
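
# Load the trained model and the set of tokens known to it; getNMostFromDict()
# only keeps words that appear in this set.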
model = Word2Vec.load(modelPath)
modelTokens = []
with open(tokensPath, 'rb+') as pickle_file:
    modelTokens = set(pickle.load(pickle_file))
lemmatizer = WordNetLemmatizer()
linePrinter = Printer()
cachedStopWords = stopwords.words("english")
listToken = []
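
# Tokens shorter than MIN_CHAR_IN_WORD characters are ignored. Terms such as
# "c++", "c#" and ".net" would be mangled by the \w+ tokenizer, so they are
# temporarily rewritten with a unique placeholder suffix before cleaning and
# restored afterwards (see preprocessSentence).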
MIN_CHAR_IN_WORD = 2
uniqueIdentifier = "uniqueidentifierusedtoidentify"
exceptions = { "c++": "cplusplus", "c#": "csharp", ".net": "dotnet" }
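

# preprocessSentence: lower-cases a sentence, maps the exception terms to safe
# placeholders, strips HTML tags, URLs and digits, tokenizes on word characters,
# and returns the cleaned, space-joined text with the exception terms restored.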
def preprocessSentence(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace('{html}', "")
    for key, value in exceptions.items():
        sentence = re.sub(re.escape(key), value + uniqueIdentifier, sentence)
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url = re.sub(r'http\S+', '', cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    mergedData = " ".join(tokens)
    for key, value in exceptions.items():
        mergedData = re.sub(value + uniqueIdentifier, key, mergedData)
    return mergedData
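

# getNMostFromDict: returns the N most frequent words from a word -> count
# mapping, considering only words present in the model vocabulary.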
def getNMostFromDict(wordCounts, N):
    data = []
    for word, count in wordCounts.items():
        if word in modelTokens:
            data.append((count, word))
    data.sort(key=lambda x: -x[0])
    return [word for count, word in data[:N]]
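

# runCodeForLine: splits a line into sentences, cleans each one, and counts every
# lemmatized, non-stopword token of sufficient length into preprocessedWords.
# Returns the number of tokens that failed to lemmatize.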
def runCodeForLine(line, preprocessedWords):
    err = 0
    sentencesInLine = sent_tokenize(line)
    for sentence in sentencesInLine:
        sentence = preprocessSentence(sentence)
        words = word_tokenize(sentence)
        wordIndex = 0
        while wordIndex < len(words):
            # Count every sufficiently long, non-stopword token after lemmatization
            token = words[wordIndex].lower()
            wordIndex += 1
            if token and len(token) >= MIN_CHAR_IN_WORD:
                if token not in cachedStopWords:
                    try:
                        lemWord = lemmatizer.lemmatize(token)
                        preprocessedWords[lemWord] += 1
                    except:
                        err += 1
    return err
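

# calculateScore: sums the pairwise Word2Vec similarities between the resume's
# top words and the requirements' top words (model.similarity assumes an older
# gensim release; on gensim 4.x the equivalent call is model.wv.similarity).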
def calculateScore(topWordsInResume, topWordsInRequirements):
    score = 0
    for requirement in topWordsInRequirements:
        for word in topWordsInResume:
            score += model.similarity(requirement, word)
    return score
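

# doWorker: builds a word-frequency table for one file; when requireRanking is
# set, it also scores the file against the requirements' top words and prints
# the result.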
def doWorker(inputFileName, requireRanking=False, topWordsInRequirements=[], resumeName=""):
    inputFile = open(inputFileName, "r")
    corpusLines = inputFile.readlines()
    preprocessedWords = collections.defaultdict(int)
    err = 0
    for eachLine in corpusLines:
        err += runCodeForLine(eachLine, preprocessedWords)
    inputFile.close()
    if requireRanking:
        # DO_RANKING
        top20WordsInResume = getNMostFromDict(preprocessedWords, 20)
        score = calculateScore(top20WordsInResume, topWordsInRequirements)
        print("RESUME: ", resumeName, " | SCORE: ", score)
    return preprocessedWords
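

# Driver: compute the requirements' top-20 words once, then score every file in
# the resumes folder against them.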
startTime = datetime.now()
print("Ranking started : ", startTime)
requirements = doWorker(requirmentPath, False)
top20WordsInRequirements = getNMostFromDict(requirements, 20)
FileNames = []
all_files = os.listdir(resumesPath)
for inputFileName in all_files:
    fileName = resumesPath + "/" + inputFileName
    if os.path.isfile(fileName):
        FileNames.append((fileName, inputFileName))
totalFiles = len(FileNames)
print("Total Resumes : ", totalFiles)
for filePath, resumeName in FileNames:
    doWorker(filePath, True, top20WordsInRequirements, resumeName)