Commit df8ceba1 authored by ThushanSandeepa

Merge remote-tracking branch 'origin/IT19114736' into IT19408316

parents 07324af0 da86d1ac
from flask import Flask, request, url_for, redirect, render_template
from flask_cors import CORS
import werkzeug
import summerise.summary as summarizeed
import json
import textract
from pptx import Presentation
import os
app = Flask(__name__)
CORS(app)
@app.route('/summerize', methods=['GET', 'POST'])
def summerize():
file = request.files['file']
ratio = float(request.form['ratio'])
filename = werkzeug.utils.secure_filename(file.filename)
print("\nReceived image File name : " + file.filename)
file.save('upload/' + filename)
f, file_extension = os.path.splitext('upload/' + filename)
print(file_extension)
if file_extension == '.docx':
text = textract.process('upload/' + filename)
arr = str(text).replace("\\n", "")
arr = arr.replace("\\t", "")
arr = arr.replace("\\", "")
prs = Presentation()
lyt = prs.slide_layouts[0] # choosing a slide layout
for x in range(0, 3):
if x == 2:
slide = prs.slides.add_slide(lyt) # adding a slide
title = slide.shapes.title # assigning a title
subtitle = slide.placeholders[1] # placeholder for subtitle
subtitle.text = arr
else:
slide = prs.slides.add_slide(lyt) # adding a slide
title = slide.shapes.title # assigning a title
subtitle = slide.placeholders[1] # placeholder for subtitle
title.text = "ignore" # title
subtitle.text = "ignore" # subtitle
prs.save("upload/slide3.pptx") # saving file
print('file saved')
res = summarizeed.create_sumall('upload/slide3.pptx', ratio)
else:
res = summarizeed.create_sumall('upload/' + filename, ratio)
rr = []
for r in res[0]:
rr.append(r[0].replace('"', ''))
return_str = '{ "result" : ['
for i in range(len(rr)):
if i == len(rr) - 1:
return_str += '"' + rr[i] + '"'
else:
return_str += '"' + rr[i] + '"' + ','
return_str += ']}'
print(return_str)
return json.loads(return_str)
if __name__ == '__main__':
app.run(host="0.0.0.0", port=5005, debug=True)
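A minimal client sketch for the /summerize route above, assuming the service is reachable on localhost:5005; the file name and ratio value are illustrative placeholders.
import requests

# 'sample.pptx' and the ratio are placeholders; the route expects a 'file' part
# and a 'ratio' form field.
with open('sample.pptx', 'rb') as fh:
    resp = requests.post(
        'http://localhost:5005/summerize',
        files={'file': fh},
        data={'ratio': '1.2'},
    )
print(resp.json()['result'])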
import glob
from pptx import Presentation
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import textract
import os.path
nltk.download('stopwords')
nltk.download('punkt')
def create_sumall(abc, ratio):
if abc:
filename = abc
stop_word = ['is', 'a', 'and', 'the']
# Function to create Text summarization
def create_summ(text):
stopWords = set(stopwords.words("english"))
words = word_tokenize(text)
freqTable = dict()
for word in words:
word = word.lower()
if word in stopWords:
continue
if word in freqTable:
freqTable[word] += 1
else:
freqTable[word] = 1
sentences = sent_tokenize(text)
sentenceValue = dict()
for sentence in sentences:
for word, freq in freqTable.items():
if word in sentence.lower():
if sentence in sentenceValue:
sentenceValue[sentence] += freq
else:
sentenceValue[sentence] = freq
sumValues = 0
for sentence in sentenceValue:
sumValues += sentenceValue[sentence]
lensenvalu = len(sentenceValue)
if lensenvalu == 0:
lensenvalu = 1
average = int(sumValues / lensenvalu)
else:
average = int(sumValues / lensenvalu)
summary = ''
for sentence in sentences:
if (sentence in sentenceValue) and (sentenceValue[sentence] > (
ratio * average)):
summary += " " + sentence
return summary
def read_full_pptxe(filename):
sentences = []
b = []
a = 0
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
s = create_summ(shape.text.replace("\n", " "))
s = str(s)
if (len(s)) >= 1:
f = ["Slide " + str(a) + "-" + s]
sentences.append(f)
return sentences
def read_full_docx(filename):
sentences = []
text = textract.process(filename)
temp = text.decode('utf-8', errors='ignore').split(".")
for t in temp:
sentences.append(t.replace("\n", " "))
return sentences
extension = os.path.splitext(filename)[1]
if extension == '.docx':
read_full_docx(filename)
else:
read_full_pptxe(filename)
def Convert(string):
li = list(string.split(" "))
return li
def Convert2(string):
li = list(string.split("\n"))
return li
def read_slide3(filename):
a = 1
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
if a == 4 and shape.shape_id == 3:
s3 = str(shape.text)
return s3
def read_full_pptx(filename, sss):
numberslide = []
numberslide.append(sss)
a = 0
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
if shape.shape_id != 2:
s = shape.text.replace("\n", " ")
s = str(s)
if (len(s)) >= 20 and a != 3 and a != 1 and a != 2:
lo_1 = [a for a in new_l1 if a in s.lower()]
f_lo_l = round((len(lo_1) / len_of_l1) * 100)
if f_lo_l >= 50:
f = "Slide " + str(a)
numberslide.append(f)
return numberslide
loooo = Convert2(read_slide3(filename))
abc = []
for i in loooo:
l1 = Convert(i.lower())
new_l1 = [w for w in l1 if w not in stop_word]
len_of_l1 = len(new_l1)
read_full_pptx(filename, i)
abc.append(read_full_pptx(filename, i))
return (read_full_pptxe(filename), abc)
else:
print('error')
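An illustrative call to create_sumall, assuming a presentation exists at the placeholder path; the function returns the per-slide summaries together with the matching-slide groups.
# 'upload/lecture.pptx' is a placeholder path; ratio 1.2 is an arbitrary example value.
summaries, matched_slides = create_sumall('upload/lecture.pptx', 1.2)
for entry in summaries:
    print(entry[0])  # each entry is a one-element list like ["Slide 4-<summarised text>"]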
# import libraries
import json
import os
import audio_gen as topic_gen
import bert as bert
import werkzeug
from flask import Flask, request, send_file
from flask_cors import CORS
from nltk.corpus import stopwords
s = set(stopwords.words('english'))
app = Flask(__name__)
CORS(app)
download_file = ''
# Topics API
@app.route('/topic', methods=['GET', 'POST'])
def topic():
videofile = request.files['video']
filename = werkzeug.utils.secure_filename(videofile.filename)
print("\nReceived video file name : " + videofile.filename)
videofile.save('upload/' + filename)
global download_file
download_file = 'upload/' + str(filename).replace('.mp4', '.txt')
text_list_from_video, all_text = topic_gen.split_video_file('upload/' + filename)
# Writing to a file
file1 = open(download_file, 'w')
file1.writelines(all_text)
file1.close()
topic_list = []
for index in text_list_from_video:
temp_topic = bert.get_topics_new(index[1])
filtered_topics = [elem for elem in temp_topic if elem not in s]
topic_list.append(filtered_topics[0])
return_json = '[ '
for i, topic in enumerate(topic_list):
if i == len(topic_list) - 1:
return_json += '{ "index" : "' + str(i) + '", "topic" : "' + str(topic) + '", "time_frame" : "' + str(
i * 240) + ' to end" } ]'
else:
return_json += '{ "index" : "' + str(i) + '", "topic" : "' + str(topic) + '", "time_frame" : "' + str(
i * 240) + ' to ' + str((i + 1) * 240) + ' seconds"} ,'
print(return_json)
return json.loads(return_json)
# Transcript API
@app.route('/transcript', methods=['GET', 'POST'])
def transcript():
global download_file
doc = download_file
return send_file(doc, as_attachment=True)
if __name__ == "__main__":
app.run(host="0.0.0.0", port=1100, debug=True)
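A minimal client sketch for the two routes above, assuming the service runs on localhost:1100; 'lecture.mp4' is an illustrative file name.
import requests

# The /topic route expects a multipart 'video' part and returns a list of
# {index, topic, time_frame} objects.
with open('lecture.mp4', 'rb') as fh:
    resp = requests.post('http://localhost:1100/topic', files={'video': fh})
for item in resp.json():
    print(item['index'], item['topic'], item['time_frame'])

# The transcript written during /topic can then be downloaded from /transcript.
with open('lecture_transcript.txt', 'wb') as out:
    out.write(requests.get('http://localhost:1100/transcript').content)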
import moviepy.editor as mp
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from moviepy.editor import VideoFileClip
import os
import text_gen
def convert_video_to_audio(filename):
clip = mp.VideoFileClip(r"" + filename)
audio_file_name = str(filename).split('/')[-1].replace('.mp4', '.wav')
clip.audio.write_audiofile(r"audio_input/" + audio_file_name)
return text_gen.convert_audio_to_text("audio_input/" + audio_file_name)
def split_video_file(filename):
return_list = []
all_text = ''
required_video_file = filename
files = os.listdir('video_input')
for filename in files:
os.remove('video_input/' + filename)
total_length = VideoFileClip(required_video_file).duration
# print(total_length)
no_of_slices = int(total_length / 240) + 1
time_grid = []
for i in range(0, no_of_slices):
time_grid.append(i * 240)
for i in range(no_of_slices):
if i == len(time_grid) - 1:
# ffmpeg_extract_subclip(required_video_file, time_grid[i], total_length - time_grid[i],
# targetname='videos/' + str(i) + ".mp4")
pass
else:
ffmpeg_extract_subclip(required_video_file, time_grid[i], time_grid[i + 1],
targetname='video_input/' + str(i) + ".mp4")
text = convert_video_to_audio('video_input/' + str(i) + ".mp4")
all_text += text + ' '
return_list.append([i, text])
return return_list, all_text
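An illustrative driver for split_video_file, assuming an mp4 exists at the placeholder path and that the video_input and audio_input folders exist in the working directory.
# 'upload/lecture.mp4' is a placeholder; each segment pairs a chunk index with its transcript.
segments, full_text = split_video_file('upload/lecture.mp4')
for index, text in segments:
    print('segment', index, ':', text[:80])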
import nltk
import question_generator as q_gen
from bertopic import BERTopic
from nltk.corpus import words
model = BERTopic(verbose=True)
def get_topics(file):
topics_outputs = []
docs = []
with open(file) as file:
for line in file:
docs.append(line.rstrip())
topics, probabilities = model.fit_transform(docs)
print(model.get_topic_freq())
print('done')
print(model.get_topics())
for i in model.get_topic(0):
if i[0] in words.words():
pass
else:
print(i[0])
topics_outputs.append(i[0])
return topics_outputs
def get_topics_new(text):
topics = q_gen.get_keywords(text, q_gen.summarizer(text))
return topics
from textwrap3 import wrap
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
import numpy as np
import nltk
# nltk.download('punkt')
# nltk.download('brown')
# nltk.download('wordnet')
# nltk.download('stopwords')
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
import pke
import traceback
from flashtext import KeywordProcessor
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = summary_model.to(device)
def set_seed(seed: int):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
def postprocesstext(content):
final = ""
for sent in sent_tokenize(content):
sent = sent.capitalize()
final = final + " " + sent
return final
# text summarizing
def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
text = text.strip().replace("\n", " ")
text = "summarize: " + text
max_len = 512
encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True,
return_tensors="pt").to(device)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
outs = model.generate(input_ids=input_ids,
attention_mask=attention_mask,
early_stopping=True,
num_beams=3,
num_return_sequences=1,
no_repeat_ngram_size=2,
min_length=75,
max_length=300)
dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
summary = dec[0]
summary = postprocesstext(summary)
summary = summary.strip()
return summary
def get_nouns_multipartite(content):
out = []
try:
extractor = pke.unsupervised.MultipartiteRank()
extractor.load_document(input=content)
# not contain punctuation marks or stopwords as candidates.
pos = {'PROPN', 'NOUN'}
stoplist = list(string.punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')
extractor.candidate_selection(pos=pos, stoplist=stoplist)
# 4. build the Multipartite graph and rank candidates using random walk,
# alpha controls the weight adjustment mechanism, see TopicRank for
# threshold/method parameters.
extractor.candidate_weighting(alpha=1.1,
threshold=0.75,
method='average')
keyphrases = extractor.get_n_best(n=15)
for val in keyphrases:
out.append(val[0])
except:
out = []
traceback.print_exc()
return out
def get_keywords(originaltext, summarytext):
keywords = get_nouns_multipartite(originaltext)
# print("keywords unsummarized: ", keywords)
keyword_processor = KeywordProcessor()
for keyword in keywords:
keyword_processor.add_keyword(keyword)
keywords_found = keyword_processor.extract_keywords(summarytext)
keywords_found = list(set(keywords_found))
# print("keywords_found in summarized: ", keywords_found)
important_keywords = []
for keyword in keywords:
if keyword in keywords_found:
important_keywords.append(keyword)
return important_keywords[:4]
question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = question_model.to(device)
def get_question(context, answer, model, tokenizer):
text = "context: {} answer: {}".format(context, answer)
encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False, truncation=True,
return_tensors="pt").to(device)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
outs = model.generate(input_ids=input_ids,
attention_mask=attention_mask,
early_stopping=True,
num_beams=5,
num_return_sequences=1,
no_repeat_ngram_size=2,
max_length=72)
dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
Question = dec[0].replace("question:", "")
Question = Question.strip()
return Question
def generate_questions_and_answers(text):
set_seed(42)
summarized_text = summarizer(text, summary_model, summary_tokenizer)
imp_keywords = get_keywords(text, summarized_text)
question_and_answer_list = []
for answer in imp_keywords:
ques = get_question(summarized_text, answer, question_model, question_tokenizer)
question_and_answer_list.append([ques, answer.capitalize()])
return question_and_answer_list
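An illustrative call to generate_questions_and_answers; the sample paragraph is a placeholder standing in for real lecture text.
# Placeholder input text; real lecture transcripts would be passed in instead.
sample_text = (
    "Supervised learning trains a model on labelled examples so it can map "
    "new inputs to known outputs, while unsupervised learning groups "
    "unlabelled data by similarity, for example through clustering."
)
for question, answer in generate_questions_and_answers(sample_text):
    print(question, '->', answer)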
pip install --ignore-installed nltk pywsd scikit-learn flask Flask-Cors PyPDF2 textwrap3 transformers pke-tool flashtext sentence_transformers spacy pydot bertopic pandas rake-nltk protobuf==3.20.0 moviepy SpeechRecognition
python -m spacy download en_core_web_sm
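The scripts also fetch several NLTK corpora, either at import time or in commented-out download calls; if preferred, the same data can be pulled once up front (the corpus list below simply mirrors the nltk.download calls that appear in the code).
python -m nltk.downloader punkt stopwords wordnet brown words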
import glob
import math
import os.path
import nltk
import textract
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from pptx import Presentation
def create_sumall(abc, ratio):
if abc:
filename = abc
stop_word = ['is', 'a', 'and', 'the']
# Function to create Text summarization
def create_summ(text):
stopWords = set(stopwords.words("english"))
words = word_tokenize(text)
freqTable = dict()
for word in words:
word = word.lower()
if word in stopWords:
continue
if word in freqTable:
freqTable[word] += 1
else:
freqTable[word] = 1
sentences = sent_tokenize(text)
sentenceValue = dict()
for sentence in sentences:
for word, freq in freqTable.items():
if word in sentence.lower():
if sentence in sentenceValue:
sentenceValue[sentence] += freq
else:
sentenceValue[sentence] = freq
sumValues = 0
for sentence in sentenceValue:
sumValues += sentenceValue[sentence]
lensenvalu = len(sentenceValue)
if lensenvalu == 0:
lensenvalu = 1
average = int(sumValues / lensenvalu)
else:
average = int(sumValues / lensenvalu)
summary = ''
for sentence in sentences:
if (sentence in sentenceValue) and (sentenceValue[sentence] > (
ratio * average)):
summary += " " + sentence
return summary
def read_full_pptxe(filename):
sentences = []
b = []
a = 0
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
s = create_summ(shape.text.replace("\n", " "))
s = str(s)
if (len(s)) >= 1:
f = ["Slide " + str(a) + "-" + s]
sentences.append(f)
return sentences
def read_full_docx(filename):
sentences = []
text = textract.process(filename)
temp = text.decode('utf-8', errors='ignore').split(".")
for t in temp:
sentences.append(t.replace("\n", " "))
return sentences
extension = os.path.splitext(filename)[1]
if extension == '.docx':
read_full_docx(filename)
else:
read_full_pptxe(filename)
def Convert(string):
li = list(string.split(" "))
return li
def Convert2(string):
li = list(string.split("\n"))
return li
def read_slide3(filename):
a = 1
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
if a == 4 and shape.shape_id == 3:
s3 = str(shape.text)
return s3
def read_full_pptx(filename, sss):
numberslide = []
numberslide.append(sss)
a = 0
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
if shape.shape_id != 2:
s = shape.text.replace("\n", " ")
s = str(s)
if (len(s)) >= 20 and a != 3 and a != 1 and a != 2:
lo_1 = [a for a in new_l1 if a in s.lower()]
f_lo_l = round((len(lo_1) / len_of_l1) * 100)
if f_lo_l >= 50:
f = "Slide " + str(a)
numberslide.append(f)
return numberslide
loooo = Convert2(read_slide3(filename))
abc = []
for i in loooo:
l1 = Convert(i.lower())
new_l1 = [w for w in l1 if w not in stop_word]
len_of_l1 = len(new_l1)
read_full_pptx(filename, i)
abc.append(read_full_pptx(filename, i))
return (read_full_pptxe(filename), abc)
else:
print('error')
from flask import Flask, request, url_for, redirect, render_template
from flask_cors import CORS
import werkzeug
import topics_find.summary as summarizeed
import json
import textract
from pptx import Presentation
import os
app = Flask(__name__)
CORS(app)
@app.route('/summerize', methods=['GET', 'POST'])
def summerize():
file = request.files['file']
ratio = float(request.form['ratio'])
filename = werkzeug.utils.secure_filename(file.filename)
print("\nReceived image File name : " + file.filename)
file.save('upload/' + filename)
f, file_extension = os.path.splitext('upload/' + filename)
print(file_extension)
if file_extension == '.docx':
text = textract.process('upload/' + filename)
arr = str(text).replace("\\n", "")
arr = arr.replace("\\t", "")
arr = arr.replace("\\", "")
prs = Presentation()
lyt = prs.slide_layouts[0] # choosing a slide layout
for x in range(0, 3):
if x == 2:
slide = prs.slides.add_slide(lyt) # adding a slide
title = slide.shapes.title # assigning a title
subtitle = slide.placeholders[1] # placeholder for subtitle
subtitle.text = arr
else:
slide = prs.slides.add_slide(lyt) # adding a slide
title = slide.shapes.title # assigning a title
subtitle = slide.placeholders[1] # placeholder for subtitle
title.text = "ignore" # title
subtitle.text = "ignore" # subtitle
prs.save("upload/slide3.pptx") # saving file
print('file saved')
res = summarizeed.create_sumall('upload/slide3.pptx', ratio)
else:
res = summarizeed.create_sumall('upload/' + filename, ratio)
rr = []
for r in res[0]:
rr.append(r[0].replace('"', ''))
return_str = '{ "result" : ['
for i in range(len(rr)):
if i == len(rr) - 1:
return_str += '"' + rr[i] + '"'
else:
return_str += '"' + rr[i] + '"' + ','
return_str += ']}'
print(return_str)
return json.loads(return_str)
if __name__ == '__main__':
app.run(host="0.0.0.0", port=5005, debug=True)
import os
import subprocess
import sys
import speech_recognition as sr
PYTHONIOENCODING = "UTF-8"
FOLDER_AUDIO = "audio_input"
FOLDER_TEXT = "text_output"
LANGUAGE = "en-US"
def convert_audio_to_text(filename):
r = sr.Recognizer()
with sr.AudioFile(filename) as source:
audio = r.record(source)
try:
command = r.recognize_google(audio, language='en-IN', show_all=True)
print(command["alternative"][0]["transcript"])
return command["alternative"][0]["transcript"]
except:
return 'did not convert'
# convert_audio_to_text('audio_input/3.wav')
{'alternative': [{'transcript': 'type of diffusion is an IR module that making waves right now open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion took 256 180 hours and hours to train at market price that 612 and pigments', 'confidence': 0.88088202}, {'transcript': 'type of diffusion is an IR module that making waves right now open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion took 256 150 hours and hours to train at market price that 612 and pigments'}, {'transcript': 'type of diffusion is an IR module that making waves right now open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion took 256 180 days and hours to train at market price that 612 and pigments'}, {'transcript': 'type of diffusion is an IR module that making waves right now kitchen open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion took 256 150 hours and hours to train at market price that 612 and pigments'}, {'transcript': 'type of diffusion is an IR module that making waves right now open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion Tuk 256 150 hours and hours to train at market price that 612 and pigments'}], 'final': True}
{'alternative': [{'transcript': 'two baby chicks headphone very own eggs when they would ask returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined the final babies', 'confidence': 0.87500781}, {'transcript': 'two baby chicks headphone very own eggs when they would ask that returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined to find her babies'}, {'transcript': 'water to baby chicks headphone very own eggs when they would ask that returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined the final babies'}, {'transcript': 'two baby chicks headphone very own eggs when they would ask returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined to find her babies'}, {'transcript': 'water to baby chicks headphone very own eggs when they would ask returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined the final babies'}], 'final': True}
we will start with supervised learning and unsupervised learning is the machine talking about using the data which is well labelled let's marry right then what is the telling ok low-income and other companies of this well it's a low-income marriage male and living in Android with some number of children no like you know what is this you are giving some attributes and the corresponding outcome with positive and that you're giving it will look at the patterns and identify ok if I get which type of combination will be the day is it right one thank you to another example is so it pays in the email spam filter problem we have the date of receipt of email with over text within every evening we also know which of these moon so when you are having condition with its like that only thing is you are giving just the data and the outcome to find out is it a problem with each and every moving into an excellent so what is the answer 100 emails in other two machine is just looking at it and see what is this what is this is it is very simple process of partition in set of data object in the subject you are breaking how to cancel the items in a manner that you are maximized in Singh similarity between within one and you heard maximize the gift of fighter in separate that and it has Max we have to maximum put it into something that you don't know how many subject you will get to you will get then you can licence ok
security that Security at your home or anything else you can think who are the people involved who would you who would you attributes security in your home who is the person who is responsible for security usually at home in your pin it can be different from house to house parents ok very good so parents are you should I am in your mother most of the time asking you know that did you lock the door before going to sleep so that I want to be protected from teams very good if you don't think so it's just that we want to learn what is security in information system security in information security at home means to protect our property and personal and we have property things that we need protection from so these new words in other subjects have a very clear idea about what is computer security is already told me some of this was not trying to protect information systems that that we keep hold of three things this is the important what are the 3 things integrity and availability and it's called confidential guys no matter which part of the world you leaving wherever you go security is always always about these three things what are the three things easy to remember confidentiality integrity and availability ok so I will go through this time one by one and explain the meaning of every time ok but for you to remember very easily remember these three letters very familiar acronym of America right Central Intelligence Agency no no that but confidentiality integrity availability you should remember get so you have this we want this we want we want for what one of the student send me a message saying we want our valuable things and you know personal items protect just that information system resources should be protected what are the resources we want the hardware to be protected we want this software to be protected we want to be protected and the data that is stored in the hardware and not only that don't forget you have you know service computers and everything but none of these things are useful if we don't connect internet and connectivity is very so we want to protect this one also which is called telecommunication and we want to make sure that this is also protected not only this not only this not only the software inside we also want to make sure whatever the data that is going here and coming in is also protect that is clear so now all we need to learn about these three things what do we mean by confidentiality integrity and that's what we want to learn today is not happen after that your time to talk not my type what does the thief usually do after please talk to me guys otherwise you know I'm already losing interest I am looking at my phone good still valuable things stealing is not there to have a cup of tea electricity line so that
supervised learning is a learning English speak paint machine ok talking about I told you machine learning using the data which is well labelled what do you mean by that mean something is old
from flask import Flask, request, render_template, send_file
from flask_cors import CORS
import werkzeug
import cv2
import note_generator.note as note_gen
import topics_find.audio_gen as topic_gen
import note_generator.write_word as writer
import topics_find.bert as bert
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
app = Flask(__name__)
CORS(app)
video_file_name = ''
pptx_file_name = ''
@app.route('/')
def index():
return render_template('index.html')
@app.route('/upload_action', methods=['GET', 'POST'])
def upload_action():
lecture_video = request.files['lecture_video']
filename_v = werkzeug.utils.secure_filename(lecture_video.filename)
print("\nReceived image File name : " + lecture_video.filename)
lecture_video.save('upload/' + filename_v)
lecture_ppt = request.files['lecture_ppt']
filename = werkzeug.utils.secure_filename(lecture_ppt.filename)
print("\nReceived image File name : " + lecture_ppt.filename)
lecture_ppt.save('upload/' + filename)
global video_file_name
global pptx_file_name
video_file_name = filename_v
pptx_file_name = filename
return render_template('upload.html', video_file_name=video_file_name, pptx_file_name=pptx_file_name)
@app.route('/generate_topics', methods=['GET', 'POST'])
def generate_topics():
global video_file_name
global pptx_file_name
text_list_from_video, all_text = topic_gen.split_video_file('upload/' + video_file_name)
topic_list = []
for index in text_list_from_video:
topic_list.append([index[0], index[1], bert.get_topics_new(index[1])])
return render_template('topics.html', topic_list=topic_list)
@app.route('/generate_short_note', methods=['GET', 'POST'])
def generate_short_note():
global video_file_name
global pptx_file_name
text_from_pptx = note_gen.generate_note('upload/' + pptx_file_name)
text_list_from_video, all_text = topic_gen.split_video_file('upload/' + video_file_name)
writer.create_doc()
writer.write_note('short note from lecture video :- ')
for i in text_list_from_video:
writer.write_note(i[1])
writer.write_note('short note from lecture slide (pptx) :- ')
writer.write_note(text_from_pptx)
writer.save_note('upload/' + pptx_file_name.split('.')[0] + '.docx')
doc_filename = pptx_file_name.split('.')[0] + '.docx'
print(doc_filename)
return render_template('short_notes.html', filename=doc_filename)
@app.route('/short_note/<name>')
def short_note(name):
doc = 'upload/' + name
print('request', doc)
return send_file(doc, as_attachment=True)
if __name__ == '__main__':
app.run(host="0.0.0.0", port=5200, debug=True)
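A minimal client sketch for the upload step above, assuming the app runs on localhost:5200; both file names are placeholders.
import requests

# 'lecture.mp4' and 'slides.pptx' are placeholders; the route expects
# 'lecture_video' and 'lecture_ppt' multipart parts.
with open('lecture.mp4', 'rb') as vid, open('slides.pptx', 'rb') as ppt:
    resp = requests.post(
        'http://localhost:5200/upload_action',
        files={'lecture_video': vid, 'lecture_ppt': ppt},
    )
print(resp.status_code)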
.html {
height: 100%;
}
* {box-sizing: border-box;}
.body {
margin: 0;
height: 100%;
font-family: Arial, Helvetica, sans-serif;
background-image: url( ../images/bg.jpg )
}
.body_login {
height:50%;
width: 50%;
padding: 10px;
margin: 60px auto;
font-family: Arial, Helvetica, sans-serif;
background-image: url( ../images/bg.jpg )
}
.header {
overflow: hidden;
background-color: #e28743;
padding: 5px 10px;
}
.header a {
float: left;
color: White;
text-align: center;
padding: 12px;
text-decoration: none;
font-size: 18px;
line-height: 25px;
border-radius: 4px;
}
.header a.logo {
font-size: 25px;
font-weight: bold;
}
.header a:hover {
background-color: #76b5c5;
color: black;
}
.header a.active {
background-color: #76b5c5;
color: white;
}
.header-right {
float: right;
}
@media screen and (max-width: 500px) {
.header a {
float: none;
display: block;
text-align: left;
}
.header-right {
float: none;
}
}
.global-container{
height:100%;
display: flex;
align-items: center;
justify-content: center;
float: left;
width: 100%;
}
.login-form {
opacity: 0.9;
width: 340px;
margin: 50px auto;
font-size: 15px;
}
.login-form form {
margin-bottom: 15px;
background: #f7f7f7;
box-shadow: 0px 2px 2px rgba(0, 0, 0, 0.3);
padding: 30px;
}
.login-form h2 {
margin: 0 0 15px;
}
.form-control, .btn {
min-height: 38px;
border-radius: 2px;
}
.btn {
font-size: 15px;
font-weight: bold;
}
form{
padding-top: 10px;
font-size: 14px;
margin-top: 30px;
}
.card-title{ font-weight:300; }
.card{opacity: 0.9;}
.effect7{
position:relative;
-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
}
.btn{
font-size: 14px;
margin-top:20px;
}
.login-form{
width:700px;
margin:20px;
}
.sign-up{
text-align:center;
padding:20px 0 0;
}
.alert{
margin-bottom:-30px;
font-size: 13px;
margin-top:20px;
}
.modal-dialog {
max-width: 800px;
margin: 30px auto;
}
.modal-body {
position:relative;
padding:0px;
}
.close {
position:absolute;
right:-30px;
top:0;
z-index:999;
font-size:2rem;
font-weight: normal;
color:#fff;
opacity:1;
}
.custom-file-uploader {
position: relative;
}
.custom-file-uploader input[type='file'] {
display: block;
position: absolute;
top: 0;
right: 0;
bottom: 0;
left: 0;
z-index: 5;
width: 100%;
height: 100%;
opacity: 0;
cursor: default;
}
/* Always set the map height explicitly to define the size of the div
* element that contains the map. */
#map {
height: 100%;
}
/* Optional: Makes the sample page fill the window. */
html,
body {
height: 100%;
margin: 0;
padding: 0;
}
const citymap = {
chicago: {
center: { lat: 6.9061, lng: 79.9696 },
population: 100,
}
};
function initMap() {
// Create the map.
const map = new google.maps.Map(document.getElementById("map"), {
zoom: 15,
center: { lat: 6.9061, lng: 79.9696 },
mapTypeId: "terrain",
});
// Construct the circle for each value in citymap.
// Note: We scale the area of the circle based on the population.
for (const city in citymap) {
// Add the circle for this city to the map.
const cityCircle = new google.maps.Circle({
strokeColor: "#FF0000",
strokeOpacity: 0.8,
strokeWeight: 2,
fillColor: "#FF0000",
fillOpacity: 0.35,
map,
center: citymap[city].center,
radius: Math.sqrt(citymap[city].population) * 100,
});
}
}
document.querySelectorAll(".drop-zone__input").forEach((inputElement) => {
const dropZoneElement = inputElement.closest(".drop-zone");
dropZoneElement.addEventListener("click", (e) => {
inputElement.click();
});
inputElement.addEventListener("change", (e) => {
if (inputElement.files.length) {
updateThumbnail(dropZoneElement, inputElement.files[0]);
}
});
dropZoneElement.addEventListener("dragover", (e) => {
e.preventDefault();
dropZoneElement.classList.add("drop-zone--over");
});
["dragleave", "dragend"].forEach((type) => {
dropZoneElement.addEventListener(type, (e) => {
dropZoneElement.classList.remove("drop-zone--over");
});
});
dropZoneElement.addEventListener("drop", (e) => {
e.preventDefault();
if (e.dataTransfer.files.length) {
inputElement.files = e.dataTransfer.files;
updateThumbnail(dropZoneElement, e.dataTransfer.files[0]);
}
dropZoneElement.classList.remove("drop-zone--over");
});
});
/**
* Updates the thumbnail on a drop zone element.
*
* @param {HTMLElement} dropZoneElement
* @param {File} file
*/
function updateThumbnail(dropZoneElement, file) {
let thumbnailElement = dropZoneElement.querySelector(".drop-zone__thumb");
// First time - remove the prompt
if (dropZoneElement.querySelector(".drop-zone__prompt")) {
dropZoneElement.querySelector(".drop-zone__prompt").remove();
}
// First time - there is no thumbnail element, so lets create it
if (!thumbnailElement) {
thumbnailElement = document.createElement("div");
thumbnailElement.classList.add("drop-zone__thumb");
dropZoneElement.appendChild(thumbnailElement);
}
thumbnailElement.dataset.label = file.name;
// Show thumbnail for image files
if (file.type.startsWith("image/")) {
const reader = new FileReader();
reader.readAsDataURL(file);
reader.onload = () => {
thumbnailElement.style.backgroundImage = `url('${reader.result}')`;
};
} else {
thumbnailElement.style.backgroundImage = null;
}
}
<!DOCTYPE html>
<html lang="en">
<head>
<title>Index</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
</head>
<style>
html {
height: 100%;
}
* {box-sizing: border-box;}
body {
margin: 0;
height: 100%;
font-family: Arial, Helvetica, sans-serif;
background-image: url({{ url_for('static', filename='images/bg.jpg') }})
}
.header {
overflow: hidden;
background-color: #970103;
padding: 5px 10px;
}
.header a {
float: left;
color: White;
text-align: center;
padding: 12px;
text-decoration: none;
font-size: 18px;
line-height: 25px;
border-radius: 4px;
}
.header a.logo {
font-size: 25px;
font-weight: bold;
}
.header a:hover {
background-color: #8a8a8a;
color: black;
}
.header a.active {
background-color: #0b0b0b;
color: white;
}
.header-right {
float: right;
}
@media screen and (max-width: 500px) {
.header a {
float: none;
display: block;
text-align: left;
}
.header-right {
float: none;
}
}
.global-container{
margin-top: 20px;
display: flex;
align-items: center;
justify-content: center;
float: left;
width: 100%;
}
form{
padding-top: 10px;
font-size: 14px;
margin-top: 30px;
margin-left: 50px;
margin-right: 50px;
}
.card-title{ font-weight:300; }
.card{opacity: 0.95;}
.effect7{
position:relative;
-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
}
.login-form{
width:1175px;
margin:20px;
}
.drop-zone {
max-width: 300px;
height: 300px;
padding: 25px;
display: flex;
align-items: center;
justify-content: center;
text-align: center;
font-family: "Quicksand", sans-serif;
font-weight: 500;
font-size: 20px;
cursor: pointer;
color: #cccccc;
border: 4px dashed #345BDE;
border-radius: 10px;
}
.drop-zone--over {
border-style: solid;
}
.drop-zone__input {
display: none;
}
.drop-zone__thumb {
width: 100%;
height: 100%;
border-radius: 10px;
overflow: hidden;
background-color: #cccccc;
background-size: cover;
position: relative;
}
.drop-zone__thumb::after {
content: attr(data-label);
position: absolute;
bottom: 0;
left: 0;
width: 100%;
padding: 5px 0;
color: #ffffff;
background: rgba(0, 0, 0, 0.75);
font-size: 14px;
text-align: center;
}
</style>
<body>
<div class="global-container">
<br>
<br>
<br>
<div class="card login-form effect7">
<div class="card-body">
<center>
<h3>Please Upload Lecture and Slides</h3>
</center>
<form action="upload_action" method="post" enctype="multipart/form-data">
<div class="form-group">
<h5>Please Select Lecture</h5>
<input type="file" name="lecture_video" class="form" accept="video/mp4,video/x-m4v,video/*"
required>
</div>
<div class="form-group">
<h5>Please Select Slides</h5>
<input type="file" name="lecture_ppt" class="form" accept=".ppt, .pptx" required>
</div>
<div class="form-group">
<button type="submit" class="btn btn-primary btn-block">upload</button>
</div>
</form>
</div>
</div>
</div>
</body>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<title>Short Note</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
</head>
<style>
html {
height: 100%;
}
* {box-sizing: border-box;}
body {
margin: 0;
height: 100%;
font-family: Arial, Helvetica, sans-serif;
background-image: url({{ url_for('static', filename='images/bg.jpg') }})
}
.header {
overflow: hidden;
background-color: #970103;
padding: 5px 10px;
}
.header a {
float: left;
color: White;
text-align: center;
padding: 12px;
text-decoration: none;
font-size: 18px;
line-height: 25px;
border-radius: 4px;
}
.header a.logo {
font-size: 25px;
font-weight: bold;
}
.header a:hover {
background-color: #8a8a8a;
color: black;
}
.header a.active {
background-color: #0b0b0b;
color: white;
}
.header-right {
float: right;
}
@media screen and (max-width: 500px) {
.header a {
float: none;
display: block;
text-align: left;
}
.header-right {
float: none;
}
}
.global-container{
margin-top: 20px;
display: flex;
align-items: center;
justify-content: center;
float: left;
width: 100%;
}
form{
padding-top: 10px;
font-size: 14px;
margin-top: 50px;
margin-left: 50px;
margin-right: 50px;
}
.card-title{ font-weight:300; }
.card{
opacity: 0.95;
}
.card-body{
margin-top: 100px;
}
.effect7{
position:relative;
-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
}
.login-form{
width:1175px;
margin:20px;
}
</style>
<body>
<div class="global-container">
<br>
<br>
<br>
<div class="card login-form effect7">
<div class="card-body">
<center>
<h2>Short Note Ready</h2>
<a href="/short_note/{{filename}}" style="color:red;">Download as doc file</a>
</center>
</div>
</div>
</div>
</body>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<title>Topics</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
</head>
<style>
html {
height: 100%;
}
* {box-sizing: border-box;}
body {
margin: 0;
height: 100%;
font-family: Arial, Helvetica, sans-serif;
background-image: url({{ url_for('static', filename='images/bg.jpg') }})
}
.header {
overflow: hidden;
background-color: #970103;
padding: 5px 10px;
}
.header a {
float: left;
color: White;
text-align: center;
padding: 12px;
text-decoration: none;
font-size: 18px;
line-height: 25px;
border-radius: 4px;
}
.header a.logo {
font-size: 25px;
font-weight: bold;
}
.header a:hover {
background-color: #8a8a8a;
color: black;
}
.header a.active {
background-color: #0b0b0b;
color: white;
}
.header-right {
float: right;
}
@media screen and (max-width: 500px) {
.header a {
float: none;
display: block;
text-align: left;
}
.header-right {
float: none;
}
}
.global-container{
margin-top: 20px;
display: flex;
align-items: center;
justify-content: center;
float: left;
width: 100%;
}
form{
padding-top: 10px;
font-size: 14px;
margin-top: 50px;
margin-left: 50px;
margin-right: 50px;
}
.card-title{ font-weight:300; }
.card{
opacity: 0.95;
}
.card-body{
margin-top: 100px;
}
.effect7{
position:relative;
-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
}
.login-form{
width:1175px;
margin:20px;
}
</style>
<body>
<div class="global-container">
<br>
<br>
<br>
<div class="card login-form effect7">
<div class="card-body">
<center>
<h2>Topics Ready</h2>
</center>
<table class="table" id="table">
<thead>
<tr>
<th>Index</th>
<th>Key Points ( Topics )</th>
</tr>
</thead>
<tbody>
{% for row in topic_list %}
<tr>
<td>{{row[0]}}</td>
<td>{{row[2]}}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</body>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<title>Upload</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
</head>
<style>
html {
height: 100%;
}
* {box-sizing: border-box;}
body {
margin: 0;
height: 100%;
font-family: Arial, Helvetica, sans-serif;
background-image: url({{ url_for('static', filename='images/bg.jpg') }})
}
.header {
overflow: hidden;
background-color: #970103;
padding: 5px 10px;
}
.header a {
float: left;
color: White;
text-align: center;
padding: 12px;
text-decoration: none;
font-size: 18px;
line-height: 25px;
border-radius: 4px;
}
.header a.logo {
font-size: 25px;
font-weight: bold;
}
.header a:hover {
background-color: #8a8a8a;
color: black;
}
.header a.active {
background-color: #0b0b0b;
color: white;
}
.header-right {
float: right;
}
@media screen and (max-width: 500px) {
.header a {
float: none;
display: block;
text-align: left;
}
.header-right {
float: none;
}
}
.global-container{
margin-top: 20px;
display: flex;
align-items: center;
justify-content: center;
float: left;
width: 100%;
}
form{
padding-top: 10px;
font-size: 14px;
margin-top: 50px;
margin-left: 50px;
margin-right: 50px;
}
.card-title{ font-weight:300; }
.card{
opacity: 0.95;
}
.card-body{
margin-top: 100px;
}
.effect7{
position:relative;
-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
}
.login-form{
width:1175px;
margin:20px;
}
</style>
<body>
<div class="global-container">
<br>
<br>
<br>
<div class="card login-form effect7">
<div class="card-body">
<center>
<h2>File uploaded successfully</h2>
<h4 style="color:red;">Note generation and topic analysing will take some time</h4>
</center>
<form>
<h4>Lecture Video File : {{video_file_name}}</h4>
<h4>Lecture Slide File (pptx) : {{pptx_file_name}}</h4>
<div class="form-group">
<a href="/generate_short_note" class="btn btn-primary btn-block">Generate Short Note</a>
</div>
<div class="form-group">
<a href="/generate_topics" class="btn btn-primary btn-block">Analyse Topics</a>
</div>
</form>
</div>
</div>
</div>
</body>
</html>
import moviepy.editor as mp
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from moviepy.editor import VideoFileClip
import os
from topics_find import text_gen
def convert_video_to_audio(filename):
clip = mp.VideoFileClip(r"" + filename)
audio_file_name = str(filename).split('/')[-1].replace('.mp4', '.wav')
clip.audio.write_audiofile(r"topics_find/audio_input/" + audio_file_name)
return text_gen.convert_audio_to_text("topics_find/audio_input/" + audio_file_name)
def split_video_file(filename):
return_list = []
all_text = ''
required_video_file = filename
files = os.listdir('topics_find/video_input')
for filename in files:
os.remove('topics_find/video_input/' + filename)
total_length = VideoFileClip(required_video_file).duration
print(total_length)
no_of_slices = int(total_length / 50) + 1
time_grid = []
for i in range(0, no_of_slices):
time_grid.append(i * 50)
for i in range(no_of_slices):
if i == len(time_grid) - 1:
# ffmpeg_extract_subclip(required_video_file, time_grid[i], total_length - time_grid[i],
# targetname='videos/' + str(i) + ".mp4")
pass
else:
ffmpeg_extract_subclip(required_video_file, time_grid[i], time_grid[i + 1],
targetname='topics_find/video_input/' + str(i) + ".mp4")
text = convert_video_to_audio('topics_find/video_input/' + str(i) + ".mp4")
all_text += text + ' '
return_list.append([i, text])
return return_list, all_text
import nltk
import topics_find.question_generator as q_gen
# nltk.download('words')
# from bertopic import BERTopic
from nltk.corpus import words
# model = BERTopic(verbose=True)
def get_topics(file):
topics_outputs = []
docs = []
with open(file) as file:
for line in file:
docs.append(line.rstrip())
topics, probabilities = model.fit_transform(docs)
#
print(model.get_topic_freq())
#
print('done')
#
# model.get_topic_freq().head(11)
print(model.get_topics())
for i in model.get_topic(0):
if i[0] in words.words():
pass
else:
print(i[0])
topics_outputs.append(i[0])
return topics_outputs
def get_topics_new(text):
topics = q_gen.get_keywords(text, q_gen.summarizer(text))
print(topics)
return topics
from textwrap3 import wrap
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
import numpy as np
import nltk
# nltk.download('punkt')
# nltk.download('brown')
# nltk.download('wordnet')
# nltk.download('stopwords')
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
import pke
import traceback
from flashtext import KeywordProcessor
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = summary_model.to(device)
def set_seed(seed: int):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
def postprocesstext(content):
final = ""
for sent in sent_tokenize(content):
sent = sent.capitalize()
final = final + " " + sent
return final
def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
text = text.strip().replace("\n", " ")
text = "summarize: " + text
# print (text)
max_len = 512
encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True,
return_tensors="pt").to(device)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
outs = model.generate(input_ids=input_ids,
attention_mask=attention_mask,
early_stopping=True,
num_beams=3,
num_return_sequences=1,
no_repeat_ngram_size=2,
min_length=75,
max_length=300)
dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
summary = dec[0]
summary = postprocesstext(summary)
summary = summary.strip()
return summary
def get_nouns_multipartite(content):
out = []
try:
extractor = pke.unsupervised.MultipartiteRank()
extractor.load_document(input=content)
# not contain punctuation marks or stopwords as candidates.
pos = {'PROPN', 'NOUN'}
# pos = {'PROPN','NOUN'}
stoplist = list(string.punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')
extractor.candidate_selection(pos=pos, stoplist=stoplist)
# 4. build the Multipartite graph and rank candidates using random walk,
# alpha controls the weight adjustment mechanism, see TopicRank for
# threshold/method parameters.
extractor.candidate_weighting(alpha=1.1,
threshold=0.75,
method='average')
keyphrases = extractor.get_n_best(n=15)
for val in keyphrases:
out.append(val[0])
except:
out = []
traceback.print_exc()
return out
def get_keywords(originaltext, summarytext):
keywords = get_nouns_multipartite(originaltext)
print("keywords unsummarized: ", keywords)
keyword_processor = KeywordProcessor()
for keyword in keywords:
keyword_processor.add_keyword(keyword)
keywords_found = keyword_processor.extract_keywords(summarytext)
keywords_found = list(set(keywords_found))
print("keywords_found in summarized: ", keywords_found)
important_keywords = []
for keyword in keywords:
if keyword in keywords_found:
important_keywords.append(keyword)
return important_keywords[:1]
question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = question_model.to(device)
def get_question(context, answer, model, tokenizer):
text = "context: {} answer: {}".format(context, answer)
encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False, truncation=True,
return_tensors="pt").to(device)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
outs = model.generate(input_ids=input_ids,
attention_mask=attention_mask,
early_stopping=True,
num_beams=5,
num_return_sequences=1,
no_repeat_ngram_size=2,
max_length=72)
dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
Question = dec[0].replace("question:", "")
Question = Question.strip()
return Question
def generate_questions_and_answers(text):
set_seed(42)
summarized_text = summarizer(text, summary_model, summary_tokenizer)
imp_keywords = get_keywords(text, summarized_text)
question_and_answer_list = []
for answer in imp_keywords:
ques = get_question(summarized_text, answer, question_model, question_tokenizer)
question_and_answer_list.append([ques, answer.capitalize()])
return question_and_answer_list
# xxx = """Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
# Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
# system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
# rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,
# Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
# transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly
# believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but
# the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising
# that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency."""
# print(generate_questions_and_answers(xxx))
#
# x = generate_questions_and_answers(xxx)
#
# for i in x:
# print(i[0])
# print(i[1])
import glob
from pptx import Presentation
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import textract
import os.path
def create_sumall(abc, ratio):
if abc:
filename = abc
stop_word = ['is', 'a', 'and', 'the']
# Function to create Text summarization
def create_summ(text):
stopWords = set(stopwords.words("english"))
words = word_tokenize(text)
freqTable = dict()
for word in words:
word = word.lower()
if word in stopWords:
continue
if word in freqTable:
freqTable[word] += 1
else:
freqTable[word] = 1
sentences = sent_tokenize(text)
sentenceValue = dict()
for sentence in sentences:
for word, freq in freqTable.items():
if word in sentence.lower():
if sentence in sentenceValue:
sentenceValue[sentence] += freq
else:
sentenceValue[sentence] = freq
sumValues = 0
for sentence in sentenceValue:
sumValues += sentenceValue[sentence]
lensenvalu = len(sentenceValue)
if lensenvalu == 0:
lensenvalu = 1
average = int(sumValues / lensenvalu)
else:
average = int(sumValues / lensenvalu)
summary = ''
for sentence in sentences:
if (sentence in sentenceValue) and (sentenceValue[sentence] > (
ratio * average)):
summary += " " + sentence
return summary
def read_full_pptxe(filename):
sentences = []
b = []
a = 0
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
s = create_summ(shape.text.replace("\n", " "))
s = str(s)
if (len(s)) >= 1:
f = ["Slide " + str(a) + "-" + s]
sentences.append(f)
return sentences
def read_full_docx(filename):
sentences = []
text = textract.process(filename)
temp = text.decode('utf-8', errors='ignore').split(".")
for t in temp:
sentences.append(t.replace("\n", " "))
return sentences
extension = os.path.splitext(filename)[1]
if extension == '.docx':
read_full_docx(filename)
else:
read_full_pptxe(filename)
def Convert(string):
li = list(string.split(" "))
return li
def Convert2(string):
li = list(string.split("\n"))
return li
def read_slide3(filename):
a = 1
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
if a == 4 and shape.shape_id == 3:
s3 = str(shape.text)
return s3
def read_full_pptx(filename, sss):
numberslide = []
numberslide.append(sss)
a = 0
for eachfile in glob.glob(filename):
prs = Presentation(eachfile)
for slide in prs.slides:
a = a + 1
for shape in slide.shapes:
if hasattr(shape, "text"):
if shape.shape_id != 2:
s = shape.text.replace("\n", " ")
s = str(s)
if (len(s)) >= 20 and a != 3 and a != 1 and a != 2:
lo_1 = [a for a in new_l1 if a in s.lower()]
f_lo_l = round((len(lo_1) / len_of_l1) * 100)
if f_lo_l >= 50:
f = "Slide " + str(a)
numberslide.append(f)
return numberslide
loooo = Convert2(read_slide3(filename))
abc = []
for i in loooo:
l1 = Convert(i.lower())
new_l1 = [w for w in l1 if w not in stop_word]
len_of_l1 = len(new_l1)
read_full_pptx(filename, i)
abc.append(read_full_pptx(filename, i))
return (read_full_pptxe(filename), abc)
else:
print('error')
from flask import Flask, request, url_for, redirect, render_template
from flask_cors import CORS
import werkzeug
import topics_find.summary as summarizeed
import json
import textract
from pptx import Presentation
import os
app = Flask(__name__)
CORS(app)
@app.route('/summerize', methods=['GET', 'POST'])
def summerize():
file = request.files['file']
ratio = float(request.form['ratio'])
filename = werkzeug.utils.secure_filename(file.filename)
print("\nReceived image File name : " + file.filename)
file.save('upload/' + filename)
f, file_extension = os.path.splitext('upload/' + filename)
print(file_extension)
if file_extension == '.docx':
text = textract.process('upload/' + filename)
arr = str(text).replace("\\n", "")
arr = arr.replace("\\t", "")
arr = arr.replace("\\", "")
prs = Presentation()
lyt = prs.slide_layouts[0] # choosing a slide layout
for x in range(0, 3):
if x == 2:
slide = prs.slides.add_slide(lyt) # adding a slide
title = slide.shapes.title # assigning a title
subtitle = slide.placeholders[1] # placeholder for subtitle
subtitle.text = arr
else:
slide = prs.slides.add_slide(lyt) # adding a slide
title = slide.shapes.title # assigning a title
subtitle = slide.placeholders[1] # placeholder for subtitle
title.text = "ignore" # title
subtitle.text = "ignore" # subtitle
prs.save("upload/slide3.pptx") # saving file
print('file saved')
res = summarizeed.create_sumall('upload/slide3.pptx', ratio)
else:
res = summarizeed.create_sumall('upload/' + filename, ratio)
rr = []
for r in res[0]:
rr.append(r[0].replace('"', ''))
return_str = '{ "result" : ['
for i in range(len(rr)):
if i == len(rr) - 1:
return_str += '"' + rr[i] + '"'
else:
return_str += '"' + rr[i] + '"' + ','
return_str += ']}'
print(return_str)
return json.loads(return_str)
if __name__ == '__main__':
app.run(host="0.0.0.0", port=5005, debug=True)
import speech_recognition as sr
import subprocess
import os
import sys
PYTHONIOENCODING = "UTF-8"
FOLDER_AUDIO = "audio_input"
FOLDER_TEXT = "text_output"
LANGUAGE = "en-US"
# print("starting...")
#
# if not os.path.isdir(FOLDER_AUDIO):
# os.mkdir(FOLDER_AUDIO)
#
# if not os.path.isdir(FOLDER_TEXT):
# os.mkdir(FOLDER_TEXT)
#
# paths = [os.path.join(FOLDER_AUDIO, nome) for nome in os.listdir(FOLDER_AUDIO)]
# files = [arq for arq in paths if os.path.isfile(arq)]
# wav_files = [arq for arq in files if arq.lower().endswith(".wav")]
#
# for filename in wav_files:
# r = sr.Recognizer()
# with sr.AudioFile(filename) as source:
# audio = r.record(source)
#
# command = r.recognize_google(audio, language='en-IN', show_all=True)
# print(command)
#
# print("running file {}".format(filename))
#
# filefinal = filename.split("audio_input/")[1].split(".wav")[0]
# filefinal = '{}/{}.txt'.format(FOLDER_TEXT, filefinal)
# with open(filefinal, 'w') as arq:
# arq.write(str(command))
#
# print("create a new file {}".format(filefinal))
#
# print("finish")
def convert_audio_to_text(filename):
r = sr.Recognizer()
with sr.AudioFile(filename) as source:
audio = r.record(source)
try:
command = r.recognize_google(audio, language='en-IN', show_all=True)
print(command["alternative"][0]["transcript"])
return command["alternative"][0]["transcript"]
except:
return 'did not convert'
# convert_audio_to_text('audio_input/3.wav')