Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
2020_21 J-25
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
3
Merge Requests
3
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
2020_21 J-25
2020_21 J-25
Commits
32fd4d32
Commit
32fd4d32
authored
Jul 09, 2021
by
Jeyasumangala Rasanayagam
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload New File
parent
a9259197
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
129 additions
and
0 deletions
+129
-0
IT17160162/ranker.py
IT17160162/ranker.py
+129
-0
No files found.
IT17160162/ranker.py
0 → 100644
View file @
32fd4d32
import
concurrent.futures
import
collections
import
json
import
os
import
re
from
datetime
import
datetime
from
nltk.corpus
import
stopwords
from
nltk.corpus
import
wordnet
from
nltk.corpus.reader
import
lin
,
toolbox
from
nltk.corpus.reader.wordnet
import
WordNetError
from
nltk.stem
import
WordNetLemmatizer
from
nltk.tag
import
pos_tag
from
nltk.tokenize
import
sent_tokenize
from
nltk.tokenize
import
word_tokenize
from
nltk.tokenize
import
RegexpTokenizer
from
printer
import
Printer
import
time
import
shutil
from
gensim.models
import
Word2Vec
import
pickle
# --- Module-level configuration and shared resources ----------------------
# All inputs live under one fixed project directory (Windows path).
basePath = "E:/FINAL SEMESTER/Research/FINAL/PROJECT"
resumesPath = basePath + "/test-data/resume"                          # directory of resume text files
requirmentPath = basePath + "/test-data/requirements.txt"             # job-requirements document
modelPath = basePath + "/model/w2v-model-V_300-MC_1-W_8-E_25.model"   # trained Word2Vec model file
tokensPath = basePath + "/histogram/tokens/all-words-bin-without-pos" # pickled vocabulary tokens

# Word2Vec model used later for word-to-word similarity scoring.
model = Word2Vec.load(modelPath)

# Vocabulary known to the model; getNMostFromDict only keeps words in this set.
modelTokens = []
with open(tokensPath, 'rb+') as pickle_file:
    # NOTE(review): pickle.load on a trusted local file — never use on untrusted data.
    modelTokens = set(pickle.load(pickle_file))

lemmatizer = WordNetLemmatizer()              # reduces tokens to lemmas before counting
linePrinter = Printer()                       # project-local helper; not used elsewhere in this view
cachedStopWords = stopwords.words("english")  # hoisted once; membership is tested per token
listToken = []                                # NOTE(review): appears unused in this file
MIN_CHAR_IN_WORD = 2                          # tokens shorter than this are ignored

# Marker appended to protected terms so the \w+ tokenizer cannot split them.
uniqueIdentifier = "uniqueidentifierusedtoidentify"
# Terms whose punctuation the tokenizer would destroy; they are temporarily
# rewritten to an alphanumeric placeholder and restored after tokenization.
exceptions = {"c++": "cplusplus", "c#": "csharp", ".net": "dotnet"}
def preprocessSentence(sentence):
    """Normalize one sentence for counting: lowercase, drop HTML markup,
    URLs and digits, and keep only word-character tokens.

    Terms such as "c++" / "c#" / ".net" (see ``exceptions``) are shielded
    behind an alphanumeric placeholder so the tokenizer does not destroy
    them, then restored afterwards.

    Returns the cleaned sentence as a single space-joined string.
    """
    text = str(sentence).lower().replace('{html}', "")

    # Protect punctuation-bearing terms so they survive the \w+ tokenizer.
    for term, placeholder in exceptions.items():
        text = re.sub(re.escape(term), placeholder + uniqueIdentifier, text)

    # Strip markup, links, and digits in successive passes.
    text = re.sub('<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub('[0-9]+', '', text)

    # Keep only runs of word characters, rejoined with single spaces.
    cleaned = " ".join(RegexpTokenizer(r'\w+').tokenize(text))

    # Restore the protected terms to their original spelling.
    for term, placeholder in exceptions.items():
        cleaned = re.sub(placeholder + uniqueIdentifier, term, cleaned)

    return cleaned
def getNMostFromDict(counts, N):
    """Return the N most frequent words from *counts* that are in the
    Word2Vec model's vocabulary (``modelTokens``).

    Parameters
    ----------
    counts : mapping of word -> occurrence count.
    N : number of top words to return.

    Returns
    -------
    List of up to N words, most frequent first.  Ties keep their original
    (insertion) order because the descending sort is stable.

    NOTE(review): the parameter was previously named ``dict``, which
    shadowed the builtin; every caller in this file passes it
    positionally, so the rename is safe.
    """
    # Only words the model knows can be scored by calculateScore later.
    ranked = [(count, word) for word, count in counts.items() if word in modelTokens]
    # Stable sort on negated count preserves insertion order among ties.
    ranked.sort(key=lambda pair: -pair[0])
    return [word for _, word in ranked[:N]]
def runCodeForLine(line, preprocessedWords):
    """Tokenize one line of text and accumulate lemma frequencies.

    Each sentence is cleaned via preprocessSentence, split into words,
    and every sufficiently long non-stopword token is lemmatized and
    counted in *preprocessedWords* (a mapping of lemma -> count, mutated
    in place).

    Returns the number of tokens that failed to lemmatize.
    """
    err = 0
    for sentence in sent_tokenize(line):
        sentence = preprocessSentence(sentence)
        words = word_tokenize(sentence)
        # BUG FIX: the original loop ran while wordIndex < len(words) - 1,
        # which silently dropped the last word of every sentence.
        for token in words:
            token = token.lower()  # defensive; preprocessSentence already lowercases
            if not token or len(token) < MIN_CHAR_IN_WORD:
                continue
            if token in cachedStopWords:
                continue
            try:
                lemWord = lemmatizer.lemmatize(token)
                preprocessedWords[lemWord] += 1
            except Exception:  # was a bare except:; keep best-effort counting
                err += 1
    return err
def calculateScore(topWordsInResume, topWordsInRequirements):
    """Sum pairwise Word2Vec similarities between every requirement word
    and every resume word; a higher total means a closer match.

    Iteration order (requirements outer, resume words inner) matches the
    original nested loops, so float accumulation order is unchanged.
    """
    return sum(
        model.similarity(requirement, word)
        for requirement in topWordsInRequirements
        for word in topWordsInResume
    )
def doWorker(inputFileName, requireRanking=False, topWordsInRequirements=None, resumeName=""):
    """Build a lemma-frequency histogram for one text file and optionally
    rank it against the requirement keywords.

    Parameters
    ----------
    inputFileName : path of the text file to process.
    requireRanking : when True, score the file against
        *topWordsInRequirements* and print the result.
    topWordsInRequirements : top requirement keywords; only read when
        requireRanking is True.  The previous mutable default ``[]`` (an
        unsafe idiom, though never mutated here) is replaced with ``None``
        and normalized below — behavior is unchanged.
    resumeName : display name used in the ranking printout.

    Returns the ``defaultdict(int)`` mapping lemma -> count.
    """
    if topWordsInRequirements is None:
        topWordsInRequirements = []
    preprocessedWords = collections.defaultdict(int)
    err = 0
    # FIX: `with` guarantees the handle is closed even if processing raises
    # (the original open()/close() pair leaked the file on exceptions).
    # NOTE(review): encoding stays platform-default, as before — consider
    # an explicit encoding="utf-8".
    with open(inputFileName, "r") as inputFile:
        for eachLine in inputFile.readlines():
            err += runCodeForLine(eachLine, preprocessedWords)
    if requireRanking:
        # DO_RANKING
        top20WordsInResume = getNMostFromDict(preprocessedWords, 20)
        score = calculateScore(top20WordsInResume, topWordsInRequirements)
        print("RESUME: ", resumeName, " | SCORE: ", score)
    return preprocessedWords
# --- Driver: rank every resume against the requirements document ----------
startTime = datetime.now()
print("Ranking started : ", startTime)

# Histogram the requirements file once and keep its top 20 keywords.
requirements = doWorker(requirmentPath, False)
top20WordsInRequirements = getNMostFromDict(requirements, 20)

# Collect (full path, display name) for every regular file in the resume dir.
FileNames = [
    (resumesPath + "/" + entry, entry)
    for entry in os.listdir(resumesPath)
    if os.path.isfile(resumesPath + "/" + entry)
]
totalFiles = len(FileNames)
print("Total Resumes : ", totalFiles)

# Score and print each resume in turn.
for filePath, resumeName in FileNames:
    doWorker(filePath, True, top20WordsInRequirements, resumeName)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment