source file added

parent 9b8b3402
import concurrent.futures
import json
import os
import time
from datetime import datetime
from printer import Printer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
# --- Filesystem layout -------------------------------------------------------
# Every path is derived from the project root; each directory string keeps its
# trailing slash so a file name can be appended directly.
basePath = "E:/FINAL SEMESTER/Research/FINAL/PROJECT/"
inputPath = "{}preprocessed-corpus/".format(basePath)
outputPath = "{}model/".format(basePath)
corpusPath = "{}merged-corpus-withoutPOS-1619336556.txt".format(inputPath)

# --- Word2Vec hyper-parameters -----------------------------------------------
# Passed to Word2Vec as min_count; per the gensim documentation, words whose
# total frequency is below this value are ignored, so 1 keeps every word.
MIN_WORD_COUNT_FOR_EMBEDDING = 1
# Dimensionality of each word vector.
VECTOR_SIZE = 300
# Passed to Word2Vec as window (context window size).
EMBEDDING_WINDOW = 8
# Number of training passes over the corpus.
EPOCHS = 25
def doWorker(corpusPath):
    """Train a Word2Vec model on a line-oriented corpus and save it to disk.

    Every line of the file at ``corpusPath`` is tokenized with
    ``word_tokenize`` and used as one training sentence. The model is
    configured from the module-level constants ``MIN_WORD_COUNT_FOR_EMBEDDING``,
    ``VECTOR_SIZE``, ``EMBEDDING_WINDOW`` and ``EPOCHS``; those values are also
    encoded in the saved file name.

    Args:
        corpusPath: Path of the text corpus to read.

    Returns:
        None. The trained model is written under the relative ``model/``
        directory as a side effect.
    """
    # The context manager guarantees the corpus file is closed, even when
    # tokenization raises; the file is streamed line by line instead of being
    # loaded with readlines() first.
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used -- confirm it matches the encoding of the corpus file.
    with open(corpusPath, "r") as inputFile:
        sentences = [word_tokenize(line) for line in inputFile]

    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim
    # 4.x renamed them to `vector_size` and `epochs` -- confirm the installed
    # version before upgrading.
    model = Word2Vec(
        sentences,
        min_count=MIN_WORD_COUNT_FOR_EMBEDDING,
        size=VECTOR_SIZE,
        workers=3,
        window=EMBEDDING_WINDOW,
        sg=1,  # per the gensim docs, 1 selects skip-gram rather than CBOW
        iter=EPOCHS,
    )

    fileName = "w2v-model-V_{}-MC_{}-W_{}-E_{}.model".format(
        VECTOR_SIZE,
        MIN_WORD_COUNT_FOR_EMBEDDING,
        EMBEDDING_WINDOW,
        EPOCHS,
    )
    # NOTE(review): the target is relative to the current working directory,
    # while the module-level `outputPath` constant is never used -- confirm
    # which location is intended.
    model.save("model/" + fileName)
# Script driver: runs the embedding job once and reports how long it took.

# Identifier built from the current epoch time in whole seconds; it is not
# referenced again in the visible part of this script.
fileUniqueId = str(int(time.time()))
# Capture the start time before the training call.
startTime = datetime.now()
print("Embedding started", startTime)
# Tokenize the corpus, train the Word2Vec model and save it.
doWorker(corpusPath)
# Capture the end time right after the training call returns.
endTime = datetime.now()
print("Embedding Finished", endTime)
# Subtracting the two datetimes yields the elapsed time as a timedelta.
print("\nDuration : ", endTime - startTime)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment