Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
2020_21 J-25
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
3
Merge Requests
3
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
2020_21 J-25
2020_21 J-25
Commits
fc37de49
Commit
fc37de49
authored
Jul 09, 2021
by
Jeyasumangala Rasanayagam
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
source file added
parent
9b8b3402
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
50 additions
and
0 deletions
+50
-0
IT17160162/gensim-w2v.py
IT17160162/gensim-w2v.py
+50
-0
No files found.
IT17160162/gensim-w2v.py
0 → 100644
View file @
fc37de49
import
concurrent.futures
import
json
import
os
import
time
from
datetime
import
datetime
from
printer
import
Printer
from
nltk.tokenize
import
word_tokenize
from
gensim.models
import
Word2Vec
# --- Word2Vec hyperparameters ------------------------------------------------
# Passed as ``min_count``: words seen fewer times than this get no embedding.
# With a value of 1 every word in the corpus is kept.
MIN_WORD_COUNT_FOR_EMBEDDING = 1
# Dimensionality of each word vector.
VECTOR_SIZE = 300
# Context window size handed to Word2Vec as ``window``.
EMBEDDING_WINDOW = 8
# Number of training passes over the corpus.
EPOCHS = 25

# --- Filesystem layout -------------------------------------------------------
# Every path is a plain string ending in "/", so pieces can be appended directly.
basePath = "E:/FINAL SEMESTER/Research/FINAL/PROJECT/"
inputPath = f"{basePath}preprocessed-corpus/"
outputPath = f"{basePath}model/"
# Corpus file that is fed to the embedding step.
corpusPath = f"{inputPath}merged-corpus-withoutPOS-1619336556.txt"
def doWorker(corpusPath):
    """Train a skip-gram Word2Vec model on a text corpus and save it to disk.

    Every line of the corpus file is tokenized with NLTK's ``word_tokenize``
    and used as one training sentence. Hyperparameters are read from the
    module-level constants ``MIN_WORD_COUNT_FOR_EMBEDDING``, ``VECTOR_SIZE``,
    ``EMBEDDING_WINDOW`` and ``EPOCHS``; their values are also encoded in the
    output file name so different runs do not overwrite each other.

    The model is written to ``model/<file name>`` relative to the current
    working directory.

    Args:
        corpusPath: Path of the plain-text corpus file to train on.

    Returns:
        None. The trained model is only persisted to disk.
    """
    # Create the output directory up front so a missing folder cannot make the
    # save fail after the whole (potentially long) training run has finished.
    # NOTE(review): the module also defines ``outputPath`` but this function
    # saves relative to the working directory -- confirm which one is intended.
    os.makedirs("model/", exist_ok=True)

    # The context manager closes the file handle even if tokenizing raises;
    # iterating the handle avoids holding a second full copy of the corpus
    # (the ``readlines()`` list) next to the tokenized sentences.
    # NOTE(review): the file is opened with the platform default encoding, as
    # before -- confirm the corpus encoding and pass ``encoding=`` explicitly.
    with open(corpusPath, "r") as inputFile:
        sentences = [word_tokenize(line) for line in inputFile]

    # sg=1 selects the skip-gram training algorithm.
    # NOTE(review): ``size`` and ``iter`` are the gensim 3.x keyword names;
    # gensim 4.x renamed them to ``vector_size`` and ``epochs`` -- confirm the
    # installed version before upgrading.
    model = Word2Vec(
        sentences,
        min_count=MIN_WORD_COUNT_FOR_EMBEDDING,
        size=VECTOR_SIZE,
        workers=3,
        window=EMBEDDING_WINDOW,
        sg=1,
        iter=EPOCHS,
    )

    fileName = (
        "w2v-model-V_" + str(VECTOR_SIZE)
        + "-MC_" + str(MIN_WORD_COUNT_FOR_EMBEDDING)
        + "-W_" + str(EMBEDDING_WINDOW)
        + "-E_" + str(EPOCHS)
        + ".model"
    )
    model.save("model/" + fileName)
# Whole-second Unix timestamp of this run, kept as a string.
fileUniqueId = f"{int(time.time())}"

# Record wall-clock time on either side of the training call so the total
# duration can be reported at the end.
startTime = datetime.now()
print("Embedding started", startTime)

doWorker(corpusPath)

endTime = datetime.now()
print("Embedding Finished", endTime)
print(
    "\nDuration : ",
    endTime - startTime,
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment