Merge remote-tracking branch 'origin/IT19114736' into IT19408316

df8ceba1 · ThushanSandeepa · 07324af0 · da86d1ac · df8ceba1 · df8ceba1
Commit df8ceba1 authored Nov 16, 2022 by ThushanSandeepa
41 changed files
--- a/Backend/Python/__pycache__/summary.cpython-38.pyc
+++ b/Backend/Python/__pycache__/summary.cpython-38.pyc
--- a/Backend/Python/controllers/api.py
+++ b/Backend/Python/controllers/api.py
+from flask import Flask, request, url_for, redirect, render_template
+from flask_cors import CORS
+import werkzeug
+import summerise.summary as summarizeed
+import json
+import textract
+from pptx import Presentation
+import os
+
+
+app = Flask(__name__)  # application object that the @app.route decorator below registers the view on
+CORS(app)  # flask_cors applied to the whole app with no options, i.e. its defaults
+
+
+@app.route('/summerize', methods=['GET', 'POST'])
+def summerize():
+    file = request.files['file']
+    ratio = float(request.form['ratio'])
+    filename = werkzeug.utils.secure_filename(file.filename)
+    print("\nReceived image File name : " + file.filename)
+    file.save('upload/' + filename)
+
+    f, file_extension = os.path.splitext('upload/' + filename)
+    print(file_extension)
+
+    if file_extension == '.docx':
+        text = textract.process('upload/' + filename)
+        arr = str(text).replace("\\n", "")
+        arr = arr.replace("\\t", "")
+        arr = arr.replace("\\", "")
+
+        prs = Presentation()
+
+        lyt = prs.slide_layouts[0]  # choosing a slide layout
+
+        for x in range(0, 3):
+
+            if x == 2:
+                slide = prs.slides.add_slide(lyt)  # adding a slide
+                title = slide.shapes.title  # assigning a title
+                subtitle = slide.placeholders[1]  # placeholder for subtitle
+                subtitle.text = arr
+            else:
+                slide = prs.slides.add_slide(lyt)  # adding a slide
+                title = slide.shapes.title  # assigning a title
+                subtitle = slide.placeholders[1]  # placeholder for subtitle
+                title.text = "ignore"  # title
+                subtitle.text = "ignore"  # subtitle
+        prs.save("upload/slide3.pptx")  # saving file
+        print('file saved')
+        res = summarizeed.create_sumall('upload/slide3.pptx', ratio)
+    else:
+        res = summarizeed.create_sumall('upload/' + filename, ratio)
+
+    rr = []
+    for r in res[0]:
+        rr.append(r[0].replace('"', ''))
+
+    return_str = '{ "result" : ['
+
+    for i in range(len(rr)):
+        if i == len(rr) - 1:
+            return_str += '"' + rr[i] + '"'
+        else:
+            return_str += '"' + rr[i] + '"' + ','
+    return_str += ']}'
+
+    print(return_str)
+
+    return json.loads(return_str)
+
+
+if __name__ == '__main__':  # start the server only when this file is executed as a script
+    app.run(host="0.0.0.0", port=5005, debug=True)  # NOTE(review): debug=True while listening on every interface exposes the interactive debugger; confirm this is development-only
--- a/Backend/Python/modules/summary.py
+++ b/Backend/Python/modules/summary.py
+import glob
+from pptx import Presentation
+import math
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize, sent_tokenize
+import textract
+
+import os.path
+import nltk
+nltk.download('stopwords')  # executed on import; create_summ below reads stopwords.words("english")
+nltk.download('punkt')  # presumably the tokenizer data behind word_tokenize/sent_tokenize; verify against the installed NLTK version
+
+
+def create_sumall(abc, ratio):
+    if abc:
+        filename = abc
+        stop_word = ['is', 'a', 'and', 'the']
+
+        # Function to create Text summarization
+        def create_summ(text):
+            stopWords = set(stopwords.words("english"))
+            words = word_tokenize(text)
+            freqTable = dict()
+            for word in words:
+                word = word.lower()
+                if word in stopWords:
+                    continue
+                if word in freqTable:
+                    freqTable[word] += 1
+                else:
+                    freqTable[word] = 1
+
+            sentences = sent_tokenize(text)
+            sentenceValue = dict()
+
+            for sentence in sentences:
+                for word, freq in freqTable.items():
+                    if word in sentence.lower():
+                        if sentence in sentenceValue:
+                            sentenceValue[sentence] += freq
+                        else:
+                            sentenceValue[sentence] = freq
+
+            sumValues = 0
+            for sentence in sentenceValue:
+                sumValues += sentenceValue[sentence]
+
+            lensenvalu = len(sentenceValue)
+            if lensenvalu == 0:
+                lensenvalu = 1
+                average = int(sumValues / lensenvalu)
+            else:
+                average = int(sumValues / lensenvalu)
+
+            summary = ''
+            for sentence in sentences:
+                if (sentence in sentenceValue) and (sentenceValue[sentence] > (
+                        ratio * average)):
+                    summary += " " + sentence
+
+            return summary
+
+        def read_full_pptxe(filename):
+            sentences = []
+            b = []
+            a = 0
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            s = create_summ(shape.text.replace("\n", " "))
+                            s = str(s)
+                            if (len(s)) >= 1:
+                                f = ["Slide " + str(a) + "-" + s]
+                                sentences.append(f)
+            return sentences
+
+        def read_full_docx(filename):
+            sentences = []
+            text = textract.process(filename)
+            temp = text.split(".")
+            for t in temp:
+                sentences.append(t.replace("\n", " "))
+            return sentences
+
+        extension = os.path.splitext(filename)[1]
+
+        if extension == 'docx':
+            read_full_docx(filename)
+        else:
+            read_full_pptxe(filename)
+
+        def Convert(string):
+            li = list(string.split(" "))
+            return li
+
+        def Convert2(string):
+            li = list(string.split("\n"))
+            return li
+
+        def read_slide3(filename):
+            a = 1
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            if a == 4 and shape.shape_id == 3:
+                                s3 = str(shape.text)
+                                return s3
+
+        def read_full_pptx(filename, sss):
+            numberslide = []
+            numberslide.append(sss)
+            a = 0
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            if shape.shape_id != 2:
+                                s = shape.text.replace("\n", " ")
+                                s = str(s)
+                                if (len(s)) >= 20 and a != 3 and a != 1 and a != 2:
+                                    lo_1 = [a for a in new_l1 if a in s.lower()]
+                                    f_lo_l = round((len(lo_1) / len_of_l1) * 100)
+                                    if f_lo_l >= 50:
+                                        f = "Slide " + str(a)
+                                        numberslide.append(f)
+
+            return numberslide
+
+        loooo = Convert2(read_slide3(filename))
+
+        abc = []
+        for i in loooo:
+            l1 = Convert(i.lower())
+            new_l1 = [w for w in l1 if w not in stop_word]
+            len_of_l1 = len(new_l1)
+            read_full_pptx(filename, i)
+            abc.append(read_full_pptx(filename, i))
+
+        return (read_full_pptxe(filename), abc)
+    else:
+        print('error')
--- a/Backend/upload/IT2040_Lecture01_2018.pptx
+++ b/Backend/upload/IT2040_Lecture01_2018.pptx
--- a/Indexing/.DS_Store
+++ b/Indexing/.DS_Store
--- a/Indexing/LDA.ipynb
+++ b/Indexing/LDA.ipynb
--- a/Indexing/__pycache__/audio_gen.cpython-38.pyc
+++ b/Indexing/__pycache__/audio_gen.cpython-38.pyc
--- a/Indexing/__pycache__/bert.cpython-38.pyc
+++ b/Indexing/__pycache__/bert.cpython-38.pyc
--- a/Indexing/__pycache__/question_generator.cpython-38.pyc
+++ b/Indexing/__pycache__/question_generator.cpython-38.pyc
--- a/Indexing/__pycache__/text_gen.cpython-38.pyc
+++ b/Indexing/__pycache__/text_gen.cpython-38.pyc
--- a/Indexing/api.py
+++ b/Indexing/api.py
+# import libraries
+import json
+import os
+
+import audio_gen as topic_gen
+import bert as bert
+import werkzeug
+from flask import Flask, request, send_file
+from flask_cors import CORS
+from nltk.corpus import stopwords
+
+s = set(stopwords.words('english'))  # English stop words; topic() discards candidate topics found in this set
+app = Flask(__name__)  # application object the two route decorators below register on
+CORS(app)  # flask_cors applied to the whole app with no options, i.e. its defaults
+
+download_file = ''  # path of the transcript written by the latest /topic call and served by /transcript; shared by all requests
+
+# Topics API
+@app.route('/topic', methods=['GET', 'POST'])
+def topic():
+    imagefile = request.files['video']
+
+    filename = werkzeug.utils.secure_filename(imagefile.filename)
+    print("\nReceived image File name : " + imagefile.filename)
+    imagefile.save('upload/' + filename)
+    global download_file
+    download_file = 'upload/' + str(filename).replace('.mp4', '.txt')
+
+    text_list_from_video, all_text = topic_gen.split_video_file('upload/' + filename)
+
+    # Writing to a file
+    file1 = open(download_file, 'w')
+    file1.writelines(all_text)
+    file1.close()
+
+    topic_list = []
+    for index in text_list_from_video:
+        temp_topic = bert.get_topics_new(index[1])
+
+        filtered_topics = [elem for elem in temp_topic if elem not in s]
+
+        topic_list.append(filtered_topics[0])
+
+    return_json = '[ '
+
+    for i, topic in enumerate(topic_list):
+        if i == len(topic_list) - 1:
+            return_json += '{ "index" : "' + str(i) + '", "topic" : "' + str(topic) + '", "time_frame" : "' + str(
+                i * 240) + ' to end" } ]'
+        else:
+            return_json += '{ "index" : "' + str(i) + '", "topic" : "' + str(topic) + '", "time_frame" : "' + str(
+                i * 240) + ' to ' + str((i + 1) * 240) + ' seconds"} ,'
+
+    print(return_json)
+
+    return json.loads(return_json)
+
+# Transcript API
+@app.route('/transcript', methods=['GET', 'POST'])
+def transcript():
+    global download_file
+    doc = download_file
+    return send_file(doc, as_attachment=True)
+
+
+if __name__ == "__main__":  # start the server only when this file is executed as a script
+    app.run(host="0.0.0.0", port=1100, debug=True)  # NOTE(review): debug=True while listening on every interface exposes the interactive debugger; confirm this is development-only
--- a/Indexing/audio_gen.py
+++ b/Indexing/audio_gen.py
+import moviepy.editor as mp
+from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
+from moviepy.editor import VideoFileClip
+import os
+import text_gen
+
+
+def convert_video_to_audio(filename):
+    clip = mp.VideoFileClip(r"" + filename)
+    audio_file_name = str(filename).split('/')[-1].replace('.mp4', '.wav')
+
+    clip.audio.write_audiofile(r"audio_input/" + audio_file_name)
+
+    return text_gen.convert_audio_to_text("audio_input/" + audio_file_name)
+
+
+def split_video_file(filename):
+    return_list = []
+    all_text = ''
+
+    required_video_file = filename
+
+    files = os.listdir('video_input')
+
+    for filename in files:
+        os.remove('video_input/' + filename)
+
+    total_length = VideoFileClip(required_video_file).duration
+    # print(total_length)
+
+    no_of_slices = int(total_length / 240) + 1
+
+    time_grid = []
+
+    for i in range(0, no_of_slices):
+        time_grid.append(i * 240)
+
+    for i in range(no_of_slices):
+        if i == len(time_grid) - 1:
+            # ffmpeg_extract_subclip(required_video_file, time_grid[i], total_length - time_grid[i],
+            #                        targetname='videos/' + str(i) + ".mp4")
+            pass
+        else:
+            ffmpeg_extract_subclip(required_video_file, time_grid[i], time_grid[i + 1],
+                                   targetname='video_input/' + str(i) + ".mp4")
+            text = convert_video_to_audio('video_input/' + str(i) + ".mp4")
+            all_text += text + ' '
+            return_list.append([i, text])
+
+    return return_list, all_text
--- a/Indexing/audio_input/0.wav
+++ b/Indexing/audio_input/0.wav
--- a/Indexing/bert.py
+++ b/Indexing/bert.py
+import nltk
+import question_generator as q_gen
+
+
+
+from bertopic import BERTopic
+from nltk.corpus import words
+
+model = BERTopic(verbose=True)  # module-level BERTopic instance created on import and fitted by get_topics()
+
+
+def get_topics(file):
+    topics_outputs = []
+
+    docs = []
+    with open(file) as file:
+        for line in file:
+            docs.append(line.rstrip())
+
+    topics, probabilities = model.fit_transform(docs)
+    
+    print(model.get_topic_freq())
+    
+    print('done')
+    
+
+    print(model.get_topics())
+
+    for i in model.get_topic(0):
+        if i[0] in words.words():
+            pass
+        else:
+            print(i[0])
+            topics_outputs.append(i[0])
+
+    return topics_outputs
+
+
+def get_topics_new(text):
+    topics = q_gen.get_keywords(text, q_gen.summarizer(text))
+    return topics
--- a/Indexing/question_generator.py
+++ b/Indexing/question_generator.py
+from textwrap3 import wrap
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+import random
+import numpy as np
+import nltk
+# nltk.download('punkt')
+# nltk.download('brown')
+# nltk.download('wordnet')
+# nltk.download('stopwords')
+from nltk.corpus import wordnet as wn
+from nltk.tokenize import sent_tokenize
+from nltk.corpus import stopwords
+import string
+import pke
+import traceback
+from flashtext import KeywordProcessor
+
+summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')  # loaded on import; default model argument of summarizer()
+summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')  # tokenizer loaded from the same 't5-base' name
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # CUDA when torch reports it as available, otherwise CPU
+summary_model = summary_model.to(device)  # move the model to the selected device
+
+
+def set_seed(seed: int):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def postprocesstext(content):
+    final = ""
+    for sent in sent_tokenize(content):
+        sent = sent.capitalize()
+        final = final + " " + sent
+    return final
+
+# text summarizing
+def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
+    text = text.strip().replace("\n", " ")
+    text = "summarize: " + text
+    max_len = 512
+    encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True,
+                                     return_tensors="pt").to(device)
+
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+
+    outs = model.generate(input_ids=input_ids,
+                          attention_mask=attention_mask,
+                          early_stopping=True,
+                          num_beams=3,
+                          num_return_sequences=1,
+                          no_repeat_ngram_size=2,
+                          min_length=75,
+                          max_length=300)
+
+    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+    summary = dec[0]
+    summary = postprocesstext(summary)
+    summary = summary.strip()
+
+    return summary
+
+
+def get_nouns_multipartite(content):
+    out = []
+    try:
+        extractor = pke.unsupervised.MultipartiteRank()
+        extractor.load_document(input=content)
+        #    not contain punctuation marks or stopwords as candidates.
+        pos = {'PROPN', 'NOUN'}
+        
+        stoplist = list(string.punctuation)
+        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+        stoplist += stopwords.words('english')
+        extractor.candidate_selection(pos=pos, stoplist=stoplist)
+        # 4. build the Multipartite graph and rank candidates using random walk,
+        #    alpha controls the weight adjustment mechanism, see TopicRank for
+        #    threshold/method parameters.
+        extractor.candidate_weighting(alpha=1.1,
+                                      threshold=0.75,
+                                      method='average')
+        keyphrases = extractor.get_n_best(n=15)
+
+        for val in keyphrases:
+            out.append(val[0])
+    except:
+        out = []
+        traceback.print_exc()
+
+    return out
+
+
+def get_keywords(originaltext, summarytext):
+    keywords = get_nouns_multipartite(originaltext)
+    # print("keywords unsummarized: ", keywords)
+    keyword_processor = KeywordProcessor()
+    for keyword in keywords:
+        keyword_processor.add_keyword(keyword)
+
+    keywords_found = keyword_processor.extract_keywords(summarytext)
+    keywords_found = list(set(keywords_found))
+    # print("keywords_found in summarized: ", keywords_found)
+
+    important_keywords = []
+    for keyword in keywords:
+        if keyword in keywords_found:
+            important_keywords.append(keyword)
+
+    return important_keywords[:4]
+
+
+question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')  # loaded on import; passed to get_question() by generate_questions_and_answers()
+question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')  # tokenizer loaded from the same checkpoint name
+question_model = question_model.to(device)  # same device as the summary model
+
+
+def get_question(context, answer, model, tokenizer):
+    text = "context: {} answer: {}".format(context, answer)
+    encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False, truncation=True,
+                                     return_tensors="pt").to(device)
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+
+    outs = model.generate(input_ids=input_ids,
+                          attention_mask=attention_mask,
+                          early_stopping=True,
+                          num_beams=5,
+                          num_return_sequences=1,
+                          no_repeat_ngram_size=2,
+                          max_length=72)
+
+    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+
+    Question = dec[0].replace("question:", "")
+    Question = Question.strip()
+    return Question
+
+
+def generate_questions_and_answers(text):
+    set_seed(42)
+    summarized_text = summarizer(text, summary_model, summary_tokenizer)
+    imp_keywords = get_keywords(text, summarized_text)
+    question_and_answer_list = []
+    for answer in imp_keywords:
+        ques = get_question(summarized_text, answer, question_model, question_tokenizer)
+        question_and_answer_list.append([ques, answer.capitalize()])
+
+    return question_and_answer_list
+
--- a/Indexing/readme.txt
+++ b/Indexing/readme.txt
+
+
+pip install --ignore-installed nltk pywsd scikit-learn flask Flask-Cors PyPDF2 textwrap3 transformers pke-tool flashtext sentence_transformers spacy pydot bertopic pandas rake-nltk protobuf==3.20.0 moviepy SpeechRecognition
+
+python -m spacy download en_core_web_sm
\ No newline at end of file
--- a/Indexing/summary.py
+++ b/Indexing/summary.py
+import glob
+import math
+import os.path
+
+import nltk
+import textract
+from nltk.corpus import stopwords
+from nltk.tokenize import sent_tokenize, word_tokenize
+from pptx import Presentation
+
+
+def create_sumall(abc, ratio):
+    if abc:
+        filename = abc
+        stop_word = ['is', 'a', 'and', 'the']
+
+        # Function to create Text summarization
+        def create_summ(text):
+            stopWords = set(stopwords.words("english"))
+            words = word_tokenize(text)
+            freqTable = dict()
+            for word in words:
+                word = word.lower()
+                if word in stopWords:
+                    continue
+                if word in freqTable:
+                    freqTable[word] += 1
+                else:
+                    freqTable[word] = 1
+
+            sentences = sent_tokenize(text)
+            sentenceValue = dict()
+
+            for sentence in sentences:
+                for word, freq in freqTable.items():
+                    if word in sentence.lower():
+                        if sentence in sentenceValue:
+                            sentenceValue[sentence] += freq
+                        else:
+                            sentenceValue[sentence] = freq
+
+            sumValues = 0
+            for sentence in sentenceValue:
+                sumValues += sentenceValue[sentence]
+
+            lensenvalu = len(sentenceValue)
+            if lensenvalu == 0:
+                lensenvalu = 1
+                average = int(sumValues / lensenvalu)
+            else:
+                average = int(sumValues / lensenvalu)
+
+            summary = ''
+            for sentence in sentences:
+                if (sentence in sentenceValue) and (sentenceValue[sentence] > (
+                        ratio * average)):
+                    summary += " " + sentence
+
+            return summary
+
+        def read_full_pptxe(filename):
+            sentences = []
+            b = []
+            a = 0
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            s = create_summ(shape.text.replace("\n", " "))
+                            s = str(s)
+                            if (len(s)) >= 1:
+                                f = ["Slide " + str(a) + "-" + s]
+                                sentences.append(f)
+            return sentences
+
+        def read_full_docx(filename):
+            sentences = []
+            text = textract.process(filename)
+            temp = text.split(".")
+            for t in temp:
+                sentences.append(t.replace("\n", " "))
+            return sentences
+
+        extension = os.path.splitext(filename)[1]
+
+        if extension == 'docx':
+            read_full_docx(filename)
+        else:
+            read_full_pptxe(filename)
+
+        def Convert(string):
+            li = list(string.split(" "))
+            return li
+
+        def Convert2(string):
+            li = list(string.split("\n"))
+            return li
+
+        def read_slide3(filename):
+            a = 1
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            if a == 4 and shape.shape_id == 3:
+                                s3 = str(shape.text)
+                                return s3
+
+        def read_full_pptx(filename, sss):
+            numberslide = []
+            numberslide.append(sss)
+            a = 0
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            if shape.shape_id != 2:
+                                s = shape.text.replace("\n", " ")
+                                s = str(s)
+                                if (len(s)) >= 20 and a != 3 and a != 1 and a != 2:
+                                    lo_1 = [a for a in new_l1 if a in s.lower()]
+                                    f_lo_l = round((len(lo_1) / len_of_l1) * 100)
+                                    if f_lo_l >= 50:
+                                        f = "Slide " + str(a)
+                                        numberslide.append(f)
+
+            return numberslide
+
+        loooo = Convert2(read_slide3(filename))
+
+        abc = []
+        for i in loooo:
+            l1 = Convert(i.lower())
+            new_l1 = [w for w in l1 if w not in stop_word]
+            len_of_l1 = len(new_l1)
+            read_full_pptx(filename, i)
+            abc.append(read_full_pptx(filename, i))
+
+        return (read_full_pptxe(filename), abc)
+    else:
+        print('error')
--- a/Indexing/test_api.py
+++ b/Indexing/test_api.py
+from flask import Flask, request, url_for, redirect, render_template
+from flask_cors import CORS
+import werkzeug
+import topics_find.summary as summarizeed
+import json
+import textract
+from pptx import Presentation
+import os
+
+app = Flask(__name__)  # application object that the @app.route decorator below registers the view on
+CORS(app)  # flask_cors applied to the whole app with no options, i.e. its defaults
+
+
+@app.route('/summerize', methods=['GET', 'POST'])
+def summerize():
+    file = request.files['file']
+    ratio = float(request.form['ratio'])
+    filename = werkzeug.utils.secure_filename(file.filename)
+    print("\nReceived image File name : " + file.filename)
+    file.save('upload/' + filename)
+
+    f, file_extension = os.path.splitext('upload/' + filename)
+    print(file_extension)
+
+    if file_extension == '.docx':
+        text = textract.process('upload/' + filename)
+        arr = str(text).replace("\\n", "")
+        arr = arr.replace("\\t", "")
+        arr = arr.replace("\\", "")
+
+        prs = Presentation()
+
+        lyt = prs.slide_layouts[0]  # choosing a slide layout
+
+        for x in range(0, 3):
+
+            if x == 2:
+                slide = prs.slides.add_slide(lyt)  # adding a slide
+                title = slide.shapes.title  # assigning a title
+                subtitle = slide.placeholders[1]  # placeholder for subtitle
+                subtitle.text = arr
+            else:
+                slide = prs.slides.add_slide(lyt)  # adding a slide
+                title = slide.shapes.title  # assigning a title
+                subtitle = slide.placeholders[1]  # placeholder for subtitle
+                title.text = "ignore"  # title
+                subtitle.text = "ignore"  # subtitle
+        prs.save("upload/slide3.pptx")  # saving file
+        print('file saved')
+        res = summarizeed.create_sumall('upload/slide3.pptx', ratio)
+    else:
+        res = summarizeed.create_sumall('upload/' + filename, ratio)
+
+    rr = []
+    for r in res[0]:
+        rr.append(r[0].replace('"', ''))
+
+    return_str = '{ "result" : ['
+
+    for i in range(len(rr)):
+        if i == len(rr) - 1:
+            return_str += '"' + rr[i] + '"'
+        else:
+            return_str += '"' + rr[i] + '"' + ','
+    return_str += ']}'
+
+    print(return_str)
+
+    return json.loads(return_str)
+
+
+if __name__ == '__main__':  # start the server only when this file is executed as a script
+    app.run(host="0.0.0.0", port=5005, debug=True)  # NOTE(review): debug=True while listening on every interface exposes the interactive debugger; confirm this is development-only
--- a/Indexing/text_gen.py
+++ b/Indexing/text_gen.py
+import os
+import subprocess
+import sys
+
+import speech_recognition as sr
+
+PYTHONIOENCODING = "UTF-8"  # NOTE(review): plain module constant; assigning it here does not set the environment variable of the same name
+
+FOLDER_AUDIO = "audio_input"  # not referenced by convert_audio_to_text below
+FOLDER_TEXT = "text_output"  # not referenced by convert_audio_to_text below
+LANGUAGE = "en-US"  # not used below: convert_audio_to_text passes language='en-IN' itself
+
+
+
+def convert_audio_to_text(filename):
+    r = sr.Recognizer()
+    with sr.AudioFile(filename) as source:
+        audio = r.record(source)
+
+    try:
+        command = r.recognize_google(audio, language='en-IN', show_all=True)
+        print(command["alternative"][0]["transcript"])
+        return command["alternative"][0]["transcript"]
+
+    except:
+        return 'did not convert'
+
+# convert_audio_to_text('audio_input/3.wav')
--- a/Indexing/text_output/3.txt
+++ b/Indexing/text_output/3.txt
+{'alternative': [{'transcript': 'type of diffusion is an IR module that making waves right now open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion took 256 180 hours and hours to train at market price that 612 and pigments', 'confidence': 0.88088202}, {'transcript': 'type of diffusion is an IR module that making waves right now open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion took 256 150 hours and hours to train at market price that 612 and pigments'}, {'transcript': 'type of diffusion is an IR module that making waves right now open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion took 256 180 days and hours to train at market price that 612 and pigments'}, {'transcript': 'type of diffusion is an IR module that making waves right now kitchen open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion took 256 150 hours and hours to train at market price that 612 and pigments'}, {'transcript': 'type of diffusion is an IR module that making waves right now open source machine many more than the Legend images from text free ridiculously well with the engine is the ability to fusion Tuk 256 150 hours and hours to train at market price that 612 and pigments'}], 'final': True}
\ No newline at end of file
--- a/Indexing/text_output/uil9h-hwq08.txt
+++ b/Indexing/text_output/uil9h-hwq08.txt
+{'alternative': [{'transcript': 'two baby chicks headphone very own eggs when they would ask returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined the final babies', 'confidence': 0.87500781}, {'transcript': 'two baby chicks headphone very own eggs when they would ask that returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined to find her babies'}, {'transcript': 'water to baby chicks headphone very own eggs when they would ask that returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined the final babies'}, {'transcript': 'two baby chicks headphone very own eggs when they would ask returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined to find her babies'}, {'transcript': 'water to baby chicks headphone very own eggs when they would ask returned home from gathering food for 24 x 7 Pro Kabaddi final anywhere natural on what you discover the Lion treks around her to for it tracks therefore be determined the final babies'}], 'final': True}
\ No newline at end of file
--- a/Indexing/upload/Eduscope_3_00_01_31-00_10_30.txt
+++ b/Indexing/upload/Eduscope_3_00_01_31-00_10_30.txt
+we will start with supervised learning and unsupervised learning is the machine talking about using the data which is well labelled let's marry right then what is the telling ok low-income and other companies of this well it's a low-income marriage male and living in Android with some number of children no like you know what is this you are giving some attributes and the corresponding outcome with positive and that you're giving it will look at the patterns and identify ok if I get which type of combination will be the day is it right one thank you to another example is so it pays in the email spam filter problem we have the date of receipt of email with over text within every evening we also know which of these moon so when you are having condition with its like that only thing is you are giving just the data and the outcome to find out is it a problem with each and every moving into an excellent so what is the answer 100 emails in other two machine is just looking at it and see what is this what is this is it is very simple process of partition in set of data object in the subject you are breaking how to cancel the items in a manner that you are maximized in Singh similarity between within one and you heard maximize the gift of fighter in separate that and it has Max we have to maximum put it into something that you don't know how many subject you will get to you will get then you can licence ok 
\ No newline at end of file
--- a/Indexing/upload/IAS_Lec1_10min.txt
+++ b/Indexing/upload/IAS_Lec1_10min.txt
+security that Security at your home or anything else you can think who are the people involved who would you who would you attributes security in your home who is the person who is responsible for security usually at home in your pin it can be different from house to house parents ok very good so parents are you should I am in your mother most of the time asking you know that did you lock the door before going to sleep so that I want to be protected from teams very good if you don't think so it's just that we want to learn what is security in information system security in information security at home means to protect our property and personal and we have property things that we need protection from so these new words in other subjects have a very clear idea about what is computer security is already told me some of this was not trying to protect information systems that that we keep hold of three things this is the important what are the 3 things integrity and availability and it's called confidential guys no matter which part of the world you leaving wherever you go security is always always about these three things what are the three things easy to remember confidentiality integrity and availability ok so I will go through this time one by one and explain the meaning of every time ok but for you to remember very easily remember these three letters very familiar acronym of America right Central Intelligence Agency no no that but confidentiality integrity availability you should remember get so you have this we want this we want we want for what one of the student send me a message saying we want our valuable things and you know personal items protect just that information system resources should be protected what are the resources we want the hardware to be protected we want this software to be protected we want to be protected and the data that is stored in the hardware and not only that don't forget you have you know service computers and everything but none 
of these things are useful if we don't connect internet and connectivity is very so we want to protect this one also which is called telecommunication and we want to make sure that this is also protected not only this not only this not only the software inside we also want to make sure whatever the data that is going here and coming in is also protect that is clear so now all we need to learn about these three things what do we mean by confidentiality integrity and that's what we want to learn today is not happen after that your time to talk not my type what does the thief usually do after please talk to me guys otherwise you know I'm already losing interest I am looking at my phone good still valuable things stealing is not there to have a cup of tea electricity line so that 
\ No newline at end of file
--- a/Indexing/upload/IAS_Lec1_30min.txt
+++ b/Indexing/upload/IAS_Lec1_30min.txt
+in other subjects not in the security subject security ok but let's have a very clear idea about what is computer security so you guys already told me some of these words it taught me about protection it was not me Who said this is it was you you send we are trying to protect Information Systems this is what you said by that means we keep hold of three things this is the important what are the 3 things integrity and availability and it's called confidential guys no matter which part of the world you leaving wherever you go security is always always about these three things what are the three things easy to remember confidentiality integrity and availability ok so I will go through this x one by one and explain the meaning of every time ok but for you to remember very easily remember these three letters very familiar acronym of America right Central Intelligence Agency no not that but confidentiality integrity availability you should remember get so you have this year we want we want we want for what one of the student send me a message saying we want our valuable things and you know personal items protect just like that information system resources should be protected what are the resources we want to be protected this software to be protected we want to be protected and the data that is stored in the hardware and not only that don't forget you have you know service computers and everything but none of these things are useful if we don't connect internet and connectivity is very important so we want to protect this one also which is called telecommunication and we want to make sure that this is also protected not only this not only this not only the software we also want to make sure whatever the data that is going here and coming in is also checked that is clear so now all we need to learn about these three things what do we mean by confidential the integrity and that's what we want to learn today is not Dilshan you already gave a letter to your house to house 
and so he is awarded the unauthorised access has happened what does she usually do after that your time to talk not my type what does the thief usually do after that please talk to me gais otherwise you know I'm already losing interest now I'm looking at my very good Steel valuable things stealing is not there to have a cup of tea I don't think so that so we want to make sure that we don't give unauthorised access cause of unauthorised access outside the should not be able to use power system because of unauthorised access outside should not be able to see our data information as very important we call it disclosure disclosure means people releasing or looking at our information without permission problem right is a big problem so we don't want that and we don't want our systems to be distracted we don't have systems to be district imagine you are doing your online exam M mixer and suddenly the midterm exam system Server is not working that ko destruction and then we have another modification this is something that is different in information systems why modification means someone will come and change things Samay Tak will happen to our system and it will change of a data it should it will change our program it will change the hardware behaviour it will change our software behaviour modification or else they will completely destroyed The Attack will completely destroy the date completely destroy our hardware School destruction so we want to make sure none of these things are there so basically we want protection from all these things that we discuss if we somehow managed to achieve this this this and this this and this if you somehow manage to give protection to these things that I told you we will have confidentiality integrity and availability these are the three things that we are trying to achieve so let's talk about this three things sahi se Nanded definition I don't want to go into each and every definition but you can see the same was appearing right same 
was happy that you can see availability is their integrity is confidential this yes there are two Newton's call non-repudiation and authentication which we will talk about it but technically speaking even these two guys actually belong to intake I will tell you later ok to Newtons belong to integrate ok so now what is this usually in cybersecurity we call this a try and some people also called the triangle that's also ke triangle so you can see in security we want to have equal integrity protection equal availability action and equal confidence interval so I think what you really need to know is ok what do we mean by this what do you mean by Intex what do you mean by available so that whatever they discussed now I think I think that confidential information should not be made available it should not be disclosed it should not be there for anybody who does not have permission to see you guys tell me a piece of information that you have that is very confidential for you is a secret in other words we are talking about secrecy what is the most secretive thing for you guys anyone who is using computers and systems and website and very good I got one answer but is a private and will not be able to see so I I prefer if you guys can discuss in the public very good I already have an answer in the public chat red one of the most important thing that you need drivers your confidentiality is your password something that everyone should not be able to see that the only people who can see that so very very important ok can you tell me something else that you you think data confidentiality of privacy is needed some other piece of information in some other system I need another answer not only password password is a good answer but I need to know the exam bank account number very good credit card numbers pin numbers you are now understanding this right so these are private things ok this is all about personal information about sleep systems and website and some of these systems 
have some information and some systems and applications that are very very confidential piece of information PDF file or a word file that you think should be confidential anything let me give you he can you think of a piece of information that needs confidentiality only up to period of time after that it's ok after that time and it's ok but only up to that point it should be confidential should be ok very good I am now getting some good dances exam papers even may be the result right so because the results will have a specific date that you can see should have a lot of confidence according to world famous cyber security organisation is the National Institute of Technology and its not the only organisation that working on cybersecurity and also they only focus is not also on security they have other areas as well but according to them we want to make sure that our information right and our personal information and privacy is protected so we want to assure that the owners have full control who can see my data who can come into the system who can look at these files who can look at this letter code of things so confidentiality in other words it's the same thing as secrecy same thing you can edit and change very important you can edit and change data only if you have permission only if you have permission unchanged it that integrity is a very important requirement that very very important ok can you tell me a system at least that need to make sure integrity is there 100% we need integrity can you give me an example nobody should be able to edit that without permission tell me an example someone saying courseweb that also good because you should as a student you should not be able to login to cause a band change my Aas page are that's very bad that's very good but the sun first name he has given a very good answer which is your research student profile very good now you get the idea you should never be able to go to your profile and I have a change that I can't go into 
your profile and change your data if you should not be so that subject is configured as 50% assignment not assignment 50% mid-term and assignment and 50% final exam system I change this information and there is some sort of a student who is a very good hacker don't do this you will get punished for your life in the student who has very good marks in exam a + b x 2 and maybe you don't even need to x if you can say great that attacker did not change the data he changed the system is changed to Khud Hi change the way that the system work there are two different things in integrated Data integrity and system intended but most of the time in our discussions we will be talking about dating dating so once again you can clearly see that is also talking about people with permission and without permission if you have the permission to do it you should be able to change you should be able to modify but if you don't have the permission and as a lecturer I can go to the page and footer not but I can't go to your HCG page and footer not I am not the lecture in charge for that subject so I can't do it so that's how it integrity is protect you have to understand ok let's move on the final one should be available if you have permission to look at the data if you have permission to access the data the data and the system should be available in the world will have different tell me in your opinion a system that should never go down so it should never go out of service during a certain time period in example you can think of anything from your life in general something in the industry anything I was sleeping availability I explain the availability so it's basically it should be working that should be available I tell you for the next five minutes ABC down are you guys going to suicide I am I going to suicide are you guys going to like a lot of trouble because of that dishoom lecture system using video conferencing tool not allowed to go down almost 24/7 should work as a traffic system 
very good traffic control system if it goes down during the height of a period of the city my goodness big problem there is another one exam result 8 p.m. today it should be a very patient and patient information there are some systems now let me let me put something here confidentiality integrity availability ok there are some system we can say which one is this if you have to banking transaction system which one should I use Si I ok ok who was that 
\ No newline at end of file
--- a/Indexing/upload/text.txt
+++ b/Indexing/upload/text.txt
+supervised learning is a learning English speak paint machine ok talking about I told you machine learning using the data which is well labelled what do you mean by that mean something is old 
\ No newline at end of file
--- a/TopicIndexing/api.py
+++ b/TopicIndexing/api.py
+from flask import Flask, request, render_template, send_file
+from flask_cors import CORS
+import werkzeug
+import cv2
+import note_generator.note as note_gen
+import topics_find.audio_gen as topic_gen
+import note_generator.write_word as writer
+import topics_find.bert as bert
+import os
+
+# Set before the app starts serving.
+# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort
+# raised when two of the imported native libraries each bundle their own
+# OpenMP copy - confirm which dependency actually needs it.
+os.environ['KMP_DUPLICATE_LIB_OK']='True'
+
+# Flask application object; CORS(app) applies flask_cors to it with default settings.
+app = Flask(__name__)
+CORS(app)
+
+# Names of the most recently uploaded files (inside upload/). They are written
+# by upload_action() and read by generate_topics() / generate_short_note().
+# These are module-level values, so every client of this process shares them.
+video_file_name = ''
+pptx_file_name = ''
+
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+
+@app.route('/upload_action', methods=['GET', 'POST'])
+def upload_action():
+    lecture_video = request.files['lecture_video']
+    filename_v = werkzeug.utils.secure_filename(lecture_video.filename)
+    print("\nReceived image File name : " + lecture_video.filename)
+    lecture_video.save('upload/' + filename_v)
+
+    lecture_ppt = request.files['lecture_ppt']
+    filename = werkzeug.utils.secure_filename(lecture_ppt.filename)
+    print("\nReceived image File name : " + lecture_ppt.filename)
+    lecture_ppt.save('upload/' + filename)
+
+    global video_file_name
+    global pptx_file_name
+
+    video_file_name = filename_v
+    pptx_file_name = filename
+
+    return render_template('upload.html', video_file_name=video_file_name, pptx_file_name=pptx_file_name)
+
+
+
+@app.route('/generate_topics', methods=['GET', 'POST'])
+def generate_topics():
+    global video_file_name
+    global pptx_file_name
+
+    text_list_from_video, all_text = topic_gen.split_video_file('upload/' + video_file_name)
+
+    topic_list = []
+    for index in text_list_from_video:
+        topic_list.append([index[0], index[1], bert.get_topics_new(index[1])])
+
+    return render_template('topics.html', topic_list=topic_list)
+
+
+@app.route('/generate_short_note', methods=['GET', 'POST'])
+def generate_short_note():
+    global video_file_name
+    global pptx_file_name
+
+    text_from_pptx = note_gen.generate_note('upload/' + pptx_file_name)
+    text_list_from_video, all_text = topic_gen.split_video_file('upload/' + video_file_name)
+
+    writer.create_doc()
+
+    writer.write_note('short note from lecture video :- ')
+    for i in text_list_from_video:
+        writer.write_note(i[1])
+
+    writer.write_note('short note from lecture slide (pptx) :- ')
+    writer.write_note(text_from_pptx)
+
+    writer.save_note('upload/' + pptx_file_name.split('.')[0] + '.docx')
+
+    doc_filename = pptx_file_name.split('.')[0] + '.docx'
+    print(doc_filename)
+
+    return render_template('short_notes.html', filename=doc_filename)
+
+
+
+@app.route('/short_note/<name>')
+def short_note(name):
+    doc = 'upload/' + name
+    print('request', doc)
+    return send_file(doc, as_attachment=True)
+
+
+# Script entry point: start Flask's built-in server on all interfaces, port 5200.
+# NOTE(review): debug=True combined with host 0.0.0.0 makes the interactive
+# debugger reachable from the network - confirm this is only ever run like
+# this for local development.
+if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=5200, debug=True)
--- a/TopicIndexing/static/css/main.css
+++ b/TopicIndexing/static/css/main.css
+
+.html { 
+	height: 100%; 
+}
+* {box-sizing: border-box;}
+
+.body { 
+	margin: 0;  
+	height: 100%; 
+	font-family: Arial, Helvetica, sans-serif;	
+	background-image:  url( ../images/bg.jpg )
+}
+
+.body_login {
+	height:50%;
+	width: 50%;
+	padding: 10px;
+	margin: 60px auto;
+	font-family: Arial, Helvetica, sans-serif;
+	background-image:  url( ../images/bg.jpg )
+}
+
+.header {
+	overflow: hidden;
+	background-color: #e28743;
+	padding: 5px 10px;
+}
+
+.header a {
+	float: left;
+	color: White;
+	text-align: center;
+	padding: 12px;
+	text-decoration: none;
+	font-size: 18px; 
+	line-height: 25px;
+	border-radius: 4px;
+}
+
+.header a.logo {
+	font-size: 25px;
+	font-weight: bold;
+}
+
+.header a:hover {
+	background-color: #76b5c5;
+	color: black;
+}
+
+.header a.active {
+	background-color: #76b5c5;
+	color: white;
+}
+
+.header-right {
+	float: right;
+}
+
+@media screen and (max-width: 500px) {
+  .header a {
+	float: none;
+	display: block;
+	text-align: left;
+  }
+  
+  .header-right {
+	float: none;
+  }
+}
+.global-container{
+	height:100%;
+	display: flex;
+	align-items: center;
+	justify-content: center;
+	float: left;
+	width: 100%;
+}
+
+.login-form {
+	opacity: 0.9;
+	width: 340px;
+	margin: 50px auto;
+	font-size: 15px;
+} 
+
+.login-form form {
+	margin-bottom: 15px;
+	background: #f7f7f7;
+	box-shadow: 0px 2px 2px rgba(0, 0, 0, 0.3);
+	padding: 30px;
+}
+
+.login-form h2 {
+	margin: 0 0 15px;
+}
+
+.form-control, .btn {
+	min-height: 38px;
+	border-radius: 2px;
+}
+
+.btn {        
+	font-size: 15px;
+	font-weight: bold;
+}
+
+form{
+	padding-top: 10px;
+	font-size: 14px;
+	margin-top: 30px;
+}
+
+.card-title{ font-weight:300; }
+
+.card{opacity: 0.9;}
+
+.effect7{
+	position:relative;
+	-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+	-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+	box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+}
+
+.btn{
+	font-size: 14px;
+	margin-top:20px;
+}
+
+.login-form{ 
+	width:700px;
+	margin:20px;
+}
+
+.sign-up{
+	text-align:center;
+	padding:20px 0 0;
+}
+
+.alert{
+	margin-bottom:-30px;
+	font-size: 13px;
+	margin-top:20px;
+}
+
+.modal-dialog {
+	max-width: 800px;
+	margin: 30px auto;
+}
+
+.modal-body {
+	position:relative;
+	padding:0px;
+}
+
+.close {
+	position:absolute;
+	right:-30px;
+	top:0;
+	z-index:999;
+	font-size:2rem;
+	font-weight: normal;
+	color:#fff;
+	opacity:1;
+}
+
+.custom-file-uploader {
+	position: relative;
+}
+
+/* Written as a descendant selector rather than nested inside the rule above:
+   this is a plain .css file, and browsers without native nesting support
+   drop a nested block. Stretches the transparent file input over the whole
+   uploader box. */
+.custom-file-uploader input[type='file'] {
+	display: block;
+	position: absolute;
+	top: 0;
+	right: 0;
+	bottom: 0;
+	left: 0;
+	z-index: 5;
+	width: 100%;
+	height: 100%;
+	opacity: 0;
+	cursor: default;
+}
\ No newline at end of file
--- a/TopicIndexing/static/css/style.css
+++ b/TopicIndexing/static/css/style.css
+/* Always set the map height explicitly to define the size of the div
+       * element that contains the map. */
+#map {
+  height: 100%;
+}
+
+/* Optional: Makes the sample page fill the window. */
+html,
+body {
+  height: 100%;
+  margin: 0;
+  padding: 0;
+}
\ No newline at end of file
--- a/TopicIndexing/static/images/bg.jpg
+++ b/TopicIndexing/static/images/bg.jpg
--- a/TopicIndexing/static/js/index.js
+++ b/TopicIndexing/static/js/index.js
+// Circle definitions keyed by a label. initMap() draws one circle per entry,
+// centred on `center` with a radius of sqrt(population) * 100.
+const citymap = {
+  chicago: {
+    center: { lat: 6.9061, lng: 79.9696 },
+    population: 100,
+  }
+};
+
+function initMap() {
+  // Create the map.
+  const map = new google.maps.Map(document.getElementById("map"), {
+    zoom: 15,
+    center: { lat: 6.9061, lng: 79.9696 },
+    mapTypeId: "terrain",
+  });
+
+  // Construct the circle for each value in citymap.
+  // Note: We scale the area of the circle based on the population.
+  for (const city in citymap) {
+    // Add the circle for this city to the map.
+    const cityCircle = new google.maps.Circle({
+      strokeColor: "#FF0000",
+      strokeOpacity: 0.8,
+      strokeWeight: 2,
+      fillColor: "#FF0000",
+      fillOpacity: 0.35,
+      map,
+      center: citymap[city].center,
+      radius: Math.sqrt(citymap[city].population) * 100,
+    });
+  }
+}
\ No newline at end of file
--- a/TopicIndexing/static/js/main.js
+++ b/TopicIndexing/static/js/main.js
+document.querySelectorAll(".drop-zone__input").forEach((inputElement) => {
+  const dropZoneElement = inputElement.closest(".drop-zone");
+
+  dropZoneElement.addEventListener("click", (e) => {
+    inputElement.click();
+  });
+
+  inputElement.addEventListener("change", (e) => {
+    if (inputElement.files.length) {
+      updateThumbnail(dropZoneElement, inputElement.files[0]);
+    }
+  });
+
+  dropZoneElement.addEventListener("dragover", (e) => {
+    e.preventDefault();
+    dropZoneElement.classList.add("drop-zone--over");
+  });
+
+  ["dragleave", "dragend"].forEach((type) => {
+    dropZoneElement.addEventListener(type, (e) => {
+      dropZoneElement.classList.remove("drop-zone--over");
+    });
+  });
+
+  dropZoneElement.addEventListener("drop", (e) => {
+    e.preventDefault();
+
+    if (e.dataTransfer.files.length) {
+      inputElement.files = e.dataTransfer.files;
+      updateThumbnail(dropZoneElement, e.dataTransfer.files[0]);
+    }
+
+    dropZoneElement.classList.remove("drop-zone--over");
+  });
+});
+
+/**
+ * Updates the thumbnail on a drop zone element.
+ *
+ * @param {HTMLElement} dropZoneElement
+ * @param {File} file
+ */
+function updateThumbnail(dropZoneElement, file) {
+  let thumbnailElement = dropZoneElement.querySelector(".drop-zone__thumb");
+
+  // First time - remove the prompt
+  if (dropZoneElement.querySelector(".drop-zone__prompt")) {
+    dropZoneElement.querySelector(".drop-zone__prompt").remove();
+  }
+
+  // First time - there is no thumbnail element, so lets create it
+  if (!thumbnailElement) {
+    thumbnailElement = document.createElement("div");
+    thumbnailElement.classList.add("drop-zone__thumb");
+    dropZoneElement.appendChild(thumbnailElement);
+  }
+
+  thumbnailElement.dataset.label = file.name;
+
+  // Show thumbnail for image files
+  if (file.type.startsWith("image/")) {
+    const reader = new FileReader();
+
+    reader.readAsDataURL(file);
+    reader.onload = () => {
+      thumbnailElement.style.backgroundImage = `url('${reader.result}')`;
+    };
+  } else {
+    thumbnailElement.style.backgroundImage = null;
+  }
+}
\ No newline at end of file
--- a/TopicIndexing/templates/index.html
+++ b/TopicIndexing/templates/index.html
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <title>Index</title>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
+    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
+</head>
+<style>
+	html {
+		height: 100%;
+	}
+	* {box-sizing: border-box;}
+
+	body {
+		margin: 0;
+		height: 100%;
+		font-family: Arial, Helvetica, sans-serif;
+		background-image: url({{ url_for('static', filename='images/bg.jpg') }})
+	}
+
+
+	.header {
+		overflow: hidden;
+		background-color: #970103;
+		padding: 5px 10px;
+	}
+
+	.header a {
+		float: left;
+		color: White;
+		text-align: center;
+		padding: 12px;
+		text-decoration: none;
+		font-size: 18px;
+		line-height: 25px;
+		border-radius: 4px;
+	}
+
+	.header a.logo {
+		font-size: 25px;
+		font-weight: bold;
+	}
+
+	.header a:hover {
+		background-color: #8a8a8a;
+		color: black;
+	}
+
+	.header a.active {
+		background-color: #0b0b0b;
+		color: white;
+	}
+
+	.header-right {
+		float: right;
+	}
+
+	@media screen and (max-width: 500px) {
+	  .header a {
+		float: none;
+		display: block;
+		text-align: left;
+	  }
+
+	  .header-right {
+		float: none;
+	  }
+	}
+
+   .global-container{
+		margin-top: 20px;
+		display: flex;
+		align-items: center;
+		justify-content: center;
+		float: left;
+		width: 100%;
+	}
+
+	form{
+		padding-top: 10px;
+		font-size: 14px;
+		margin-top: 30px;
+		margin-left: 50px;
+		margin-right: 50px;
+	}
+
+	.card-title{ font-weight:300; }
+
+	.card{opacity: 0.95;}
+
+	.effect7{
+		position:relative;
+		-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+		-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+        box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+	}
+	.login-form{
+		width:1175px;
+		margin:20px;
+	}
+
+
+.drop-zone {
+  max-width: 300px;
+  height: 300px;
+  padding: 25px;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  text-align: center;
+  font-family: "Quicksand", sans-serif;
+  font-weight: 500;
+  font-size: 20px;
+  cursor: pointer;
+  color: #cccccc;
+  border: 4px dashed #345BDE;
+  border-radius: 10px;
+}
+
+.drop-zone--over {
+  border-style: solid;
+}
+
+.drop-zone__input {
+  display: none;
+}
+
+.drop-zone__thumb {
+  width: 100%;
+  height: 100%;
+  border-radius: 10px;
+  overflow: hidden;
+  background-color: #cccccc;
+  background-size: cover;
+  position: relative;
+}
+
+.drop-zone__thumb::after {
+  content: attr(data-label);
+  position: absolute;
+  bottom: 0;
+  left: 0;
+  width: 100%;
+  padding: 5px 0;
+  color: #ffffff;
+  background: rgba(0, 0, 0, 0.75);
+  font-size: 14px;
+  text-align: center;
+}
+
+
+
+
+
+
+
+
+
+
+</style>
+<body>
+
+
+<div class="global-container">
+    <br>
+    <br>
+    <br>
+    <div class="card login-form effect7">
+        <div class="card-body">
+            <center>
+                <h3>Please Upload Lecture and Slides</h3>
+            </center>
+            <form action="upload_action" method="post" enctype="multipart/form-data">
+                <div class="form-group">
+                    <h5>Please Select Lecture</h5>
+                    <input type="file" name="lecture_video" class="form" accept="video/mp4,video/x-m4v,video/*"
+                           required>
+                </div>
+                <div class="form-group">
+                    <h5>Please Select Slides</h5>
+                    <input type="file" name="lecture_ppt" class="form" accept=".ppt, .pptx" required>
+                </div>
+                <div class="form-group">
+                    <button type="submit" class="btn btn-primary btn-block">upload</button>
+                </div>
+            </form>
+
+        </div>
+    </div>
+</div>
+</body>
+</html>
--- a/TopicIndexing/templates/short_notes.html
+++ b/TopicIndexing/templates/short_notes.html
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <title>Short Note</title>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
+    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
+</head>
+<style>
+	html {
+		height: 100%;
+	}
+	* {box-sizing: border-box;}
+
+	body {
+		margin: 0;
+		height: 100%;
+		font-family: Arial, Helvetica, sans-serif;
+		background-image: url({{ url_for('static', filename='images/bg.jpg') }})
+	}
+
+
+	.header {
+		overflow: hidden;
+		background-color: #970103;
+		padding: 5px 10px;
+	}
+
+	.header a {
+		float: left;
+		color: White;
+		text-align: center;
+		padding: 12px;
+		text-decoration: none;
+		font-size: 18px;
+		line-height: 25px;
+		border-radius: 4px;
+	}
+
+	.header a.logo {
+		font-size: 25px;
+		font-weight: bold;
+	}
+
+	.header a:hover {
+		background-color: #8a8a8a;
+		color: black;
+	}
+
+	.header a.active {
+		background-color: #0b0b0b;
+		color: white;
+	}
+
+	.header-right {
+		float: right;
+	}
+
+	@media screen and (max-width: 500px) {
+	  .header a {
+		float: none;
+		display: block;
+		text-align: left;
+	  }
+
+	  .header-right {
+		float: none;
+	  }
+	}
+
+   .global-container{
+		margin-top: 20px;
+		display: flex;
+		align-items: center;
+		justify-content: center;
+		float: left;
+		width: 100%;
+	}
+
+	form{
+		padding-top: 10px;
+		font-size: 14px;
+		margin-top: 50px;
+		margin-left: 50px;
+		margin-right: 50px;
+	}
+
+	.card-title{ font-weight:300; }
+
+	.card{
+	opacity: 0.95;
+
+	}
+
+	.card-body{
+  margin-top: 100px;
+
+	}
+
+	.effect7{
+		position:relative;
+		-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+		-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+        box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+	}
+	.login-form{
+		width:1175px;
+		margin:20px;
+	}
+
+
+
+
+
+
+
+
+
+
+
+</style>
+<body>
+
+
+<div class="global-container">
+    <br>
+    <br>
+    <br>
+    <div class="card login-form effect7">
+        <div class="card-body">
+            <center>
+                <h2>Short Note Ready</h2>
+                {# url_for builds and escapes the link from the short_note route itself, so it follows the route if its URL changes. #}
+                <a href="{{ url_for('short_note', name=filename) }}" style="color:red;">Download as doc file</a>
+            </center>
+
+
+
+        </div>
+    </div>
+</div>
+</body>
+</html>
--- a/TopicIndexing/templates/topics.html
+++ b/TopicIndexing/templates/topics.html
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <title>Topics</title>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
+    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
+</head>
+<style>
+	html {
+		height: 100%;
+	}
+	* {box-sizing: border-box;}
+
+	body {
+		margin: 0;
+		height: 100%;
+		font-family: Arial, Helvetica, sans-serif;
+		background-image: url({{ url_for('static', filename='images/bg.jpg') }})
+	}
+
+
+	.header {
+		overflow: hidden;
+		background-color: #970103;
+		padding: 5px 10px;
+	}
+
+	.header a {
+		float: left;
+		color: White;
+		text-align: center;
+		padding: 12px;
+		text-decoration: none;
+		font-size: 18px;
+		line-height: 25px;
+		border-radius: 4px;
+	}
+
+	.header a.logo {
+		font-size: 25px;
+		font-weight: bold;
+	}
+
+	.header a:hover {
+		background-color: #8a8a8a;
+		color: black;
+	}
+
+	.header a.active {
+		background-color: #0b0b0b;
+		color: white;
+	}
+
+	.header-right {
+		float: right;
+	}
+
+	@media screen and (max-width: 500px) {
+	  .header a {
+		float: none;
+		display: block;
+		text-align: left;
+	  }
+
+	  .header-right {
+		float: none;
+	  }
+	}
+
+   .global-container{
+		margin-top: 20px;
+		display: flex;
+		align-items: center;
+		justify-content: center;
+		float: left;
+		width: 100%;
+	}
+
+	form{
+		padding-top: 10px;
+		font-size: 14px;
+		margin-top: 50px;
+		margin-left: 50px;
+		margin-right: 50px;
+	}
+
+	.card-title{ font-weight:300; }
+
+	.card{
+	opacity: 0.95;
+
+	}
+
+	.card-body{
+  margin-top: 100px;
+
+	}
+
+	.effect7{
+		position:relative;
+		-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+		-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+        box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+	}
+	.login-form{
+		width:1175px;
+		margin:20px;
+	}
+
+
+
+
+
+
+
+
+
+
+
+
+</style>
+<body>
+
+
+<div class="global-container">
+    <br>
+    <br>
+    <br>
+    <div class="card login-form effect7">
+        <div class="card-body">
+            <center>
+                <h2>Topics Ready</h2>
+            </center>
+            <table class="table" id="table">
+                <thead>
+                <tr>
+                    <th>Index</th>
+                    <th>Key Points ( Topics )</th>
+                </tr>
+                </thead>
+                <tbody>
+                {% for row in topic_list %}
+                <tr>
+                    <td>{{row[0]}}</td>
+                    <td>{{row[2]}}</td>
+
+                </tr>
+                 {% endfor %}
+                </tbody>
+            </table>
+
+
+        </div>
+    </div>
+</div>
+</body>
+</html>
--- a/TopicIndexing/templates/upload.html
+++ b/TopicIndexing/templates/upload.html
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <title>Upload</title>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
+    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
+    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
+</head>
+<style>
+	html {
+		height: 100%;
+	}
+	* {box-sizing: border-box;}
+
+	body {
+		margin: 0;
+		height: 100%;
+		font-family: Arial, Helvetica, sans-serif;
+		background-image: url({{ url_for('static', filename='images/bg.jpg') }})
+	}
+
+
+	.header {
+		overflow: hidden;
+		background-color: #970103;
+		padding: 5px 10px;
+	}
+
+	.header a {
+		float: left;
+		color: White;
+		text-align: center;
+		padding: 12px;
+		text-decoration: none;
+		font-size: 18px;
+		line-height: 25px;
+		border-radius: 4px;
+	}
+
+	.header a.logo {
+		font-size: 25px;
+		font-weight: bold;
+	}
+
+	.header a:hover {
+		background-color: #8a8a8a;
+		color: black;
+	}
+
+	.header a.active {
+		background-color: #0b0b0b;
+		color: white;
+	}
+
+	.header-right {
+		float: right;
+	}
+
+	@media screen and (max-width: 500px) {
+	  .header a {
+		float: none;
+		display: block;
+		text-align: left;
+	  }
+
+	  .header-right {
+		float: none;
+	  }
+	}
+
+   .global-container{
+		margin-top: 20px;
+		display: flex;
+		align-items: center;
+		justify-content: center;
+		float: left;
+		width: 100%;
+	}
+
+	form{
+		padding-top: 10px;
+		font-size: 14px;
+		margin-top: 50px;
+		margin-left: 50px;
+		margin-right: 50px;
+	}
+
+	.card-title{ font-weight:300; }
+
+	.card{
+	opacity: 0.95;
+
+	}
+
+	.card-body{
+  margin-top: 100px;
+
+	}
+
+	.effect7{
+		position:relative;
+		-webkit-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+		-moz-box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+        box-shadow:0 1px 20px rgba(0, 0, 0, 0.3), 0 0 40px rgba(0, 0, 0, 0.1) inset;
+	}
+	.login-form{
+		width:1175px;
+		margin:20px;
+	}
+
+
+
+
+
+
+
+
+
+
+
+</style>
+<body>
+<!-- Upload confirmation page: shows the two uploaded file names and links to the processing steps. -->
+
+
+<div class="global-container">
+    <br>
+    <br>
+    <br>
+    <div class="card login-form effect7">
+        <div class="card-body">
+            <center>
+                <h2>File uploaded successfully</h2>
+                <h4 style="color:red;">Note generation and topic analysing will take some time</h4>
+            </center>
+
+            <!-- This form has no inputs and no action; it only groups the file names and the two navigation links. -->
+            <form>
+                <!-- video_file_name and pptx_file_name are template variables filled in at render time. -->
+                <h4>Lecture Video File : {{video_file_name}}</h4>
+                <h4>Lecture Slide File (pptx) : {{pptx_file_name}}</h4>
+                <div class="form-group">
+                    <a href="/generate_short_note" class="btn btn-primary btn-block">Generate Short Note</a>
+                </div>
+                <div class="form-group">
+                    <a href="/generate_topics" class="btn btn-primary btn-block">Analyse Topics</a>
+                </div>
+            </form>
+
+        </div>
+    </div>
+</div>
+</body>
+</html>
--- a/TopicIndexing/topics_find/audio_gen.py
+++ b/TopicIndexing/topics_find/audio_gen.py
+import moviepy.editor as mp
+from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
+from moviepy.editor import VideoFileClip
+import os
+from topics_find import text_gen
+
+
+def convert_video_to_audio(filename):
+    clip = mp.VideoFileClip(r"" + filename)
+    audio_file_name = str(filename).split('/')[-1].replace('.mp4', '.wav')
+
+    clip.audio.write_audiofile(r"topics_find/audio_input/" + audio_file_name)
+
+    return text_gen.convert_audio_to_text("topics_find/audio_input/" + audio_file_name)
+
+
+def split_video_file(filename):
+    return_list = []
+    all_text = ''
+
+    required_video_file = filename
+
+    files = os.listdir('topics_find/video_input')
+
+    for filename in files:
+        os.remove('topics_find/video_input/' + filename)
+
+    total_length = VideoFileClip(required_video_file).duration
+    print(total_length)
+
+    no_of_slices = int(total_length / 50) + 1
+
+    time_grid = []
+
+    for i in range(0, no_of_slices):
+        time_grid.append(i * 50)
+
+    for i in range(no_of_slices):
+        if i == len(time_grid) - 1:
+            # ffmpeg_extract_subclip(required_video_file, time_grid[i], total_length - time_grid[i],
+            #                        targetname='videos/' + str(i) + ".mp4")
+            pass
+        else:
+            ffmpeg_extract_subclip(required_video_file, time_grid[i], time_grid[i + 1],
+                                   targetname='topics_find/video_input' + str(i) + ".mp4")
+            text = convert_video_to_audio('topics_find/video_input' + str(i) + ".mp4")
+            all_text += text + ' '
+            return_list.append([i, text])
+
+    return return_list, all_text
--- a/TopicIndexing/topics_find/bert.py
+++ b/TopicIndexing/topics_find/bert.py
+import nltk
+import topics_find.question_generator as q_gen
+
+# nltk.download('words')
+
+# from bertopic import BERTopic
+from nltk.corpus import words
+
+# model = BERTopic(verbose=True)
+
+
+def get_topics(file):
+    topics_outputs = []
+
+    docs = []
+    with open(file) as file:
+        for line in file:
+            docs.append(line.rstrip())
+
+    topics, probabilities = model.fit_transform(docs)
+    #
+    print(model.get_topic_freq())
+    #
+    print('done')
+    #
+    # model.get_topic_freq().head(11)
+
+    print(model.get_topics())
+
+    for i in model.get_topic(0):
+        if i[0] in words.words():
+            pass
+        else:
+            print(i[0])
+            topics_outputs.append(i[0])
+
+    return topics_outputs
+
+
+def get_topics_new(text):
+    topics = q_gen.get_keywords(text, q_gen.summarizer(text))
+    print(topics)
+    return topics
--- a/TopicIndexing/topics_find/question_generator.py
+++ b/TopicIndexing/topics_find/question_generator.py
+from textwrap3 import wrap
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+import random
+import numpy as np
+import nltk
+# nltk.download('punkt')
+# nltk.download('brown')
+# nltk.download('wordnet')
+# nltk.download('stopwords')
+from nltk.corpus import wordnet as wn
+from nltk.tokenize import sent_tokenize
+from nltk.corpus import stopwords
+import string
+import pke
+import traceback
+from flashtext import KeywordProcessor
+
+summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
+summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+summary_model = summary_model.to(device)
+
+
+def set_seed(seed: int):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def postprocesstext(content):
+    final = ""
+    for sent in sent_tokenize(content):
+        sent = sent.capitalize()
+        final = final + " " + sent
+    return final
+
+
+def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
+    text = text.strip().replace("\n", " ")
+    text = "summarize: " + text
+    # print (text)
+    max_len = 512
+    encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True,
+                                     return_tensors="pt").to(device)
+
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+
+    outs = model.generate(input_ids=input_ids,
+                          attention_mask=attention_mask,
+                          early_stopping=True,
+                          num_beams=3,
+                          num_return_sequences=1,
+                          no_repeat_ngram_size=2,
+                          min_length=75,
+                          max_length=300)
+
+    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+    summary = dec[0]
+    summary = postprocesstext(summary)
+    summary = summary.strip()
+
+    return summary
+
+
+def get_nouns_multipartite(content):
+    out = []
+    try:
+        extractor = pke.unsupervised.MultipartiteRank()
+        extractor.load_document(input=content)
+        #    not contain punctuation marks or stopwords as candidates.
+        pos = {'PROPN', 'NOUN'}
+        # pos = {'PROPN','NOUN'}
+        stoplist = list(string.punctuation)
+        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+        stoplist += stopwords.words('english')
+        extractor.candidate_selection(pos=pos, stoplist=stoplist)
+        # 4. build the Multipartite graph and rank candidates using random walk,
+        #    alpha controls the weight adjustment mechanism, see TopicRank for
+        #    threshold/method parameters.
+        extractor.candidate_weighting(alpha=1.1,
+                                      threshold=0.75,
+                                      method='average')
+        keyphrases = extractor.get_n_best(n=15)
+
+        for val in keyphrases:
+            out.append(val[0])
+    except:
+        out = []
+        traceback.print_exc()
+
+    return out
+
+
+def get_keywords(originaltext, summarytext):
+    keywords = get_nouns_multipartite(originaltext)
+    print("keywords unsummarized: ", keywords)
+    keyword_processor = KeywordProcessor()
+    for keyword in keywords:
+        keyword_processor.add_keyword(keyword)
+
+    keywords_found = keyword_processor.extract_keywords(summarytext)
+    keywords_found = list(set(keywords_found))
+    print("keywords_found in summarized: ", keywords_found)
+
+    important_keywords = []
+    for keyword in keywords:
+        if keyword in keywords_found:
+            important_keywords.append(keyword)
+
+    return important_keywords[:1]
+
+
+question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
+question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
+question_model = question_model.to(device)
+
+
+def get_question(context, answer, model, tokenizer):
+    text = "context: {} answer: {}".format(context, answer)
+    encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False, truncation=True,
+                                     return_tensors="pt").to(device)
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+
+    outs = model.generate(input_ids=input_ids,
+                          attention_mask=attention_mask,
+                          early_stopping=True,
+                          num_beams=5,
+                          num_return_sequences=1,
+                          no_repeat_ngram_size=2,
+                          max_length=72)
+
+    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+
+    Question = dec[0].replace("question:", "")
+    Question = Question.strip()
+    return Question
+
+
+def generate_questions_and_answers(text):
+    set_seed(42)
+    summarized_text = summarizer(text, summary_model, summary_tokenizer)
+    imp_keywords = get_keywords(text, summarized_text)
+    question_and_answer_list = []
+    for answer in imp_keywords:
+        ques = get_question(summarized_text, answer, question_model, question_tokenizer)
+        question_and_answer_list.append([ques, answer.capitalize()])
+
+    return question_and_answer_list
+
+# xxx = """Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
+# Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
+# system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
+# rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin.  In a recent tweet,
+# Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
+# transaction, and hence was suspending vehicle purchases using the cryptocurrency.  A day later he again tweeted saying, “To be clear, I strongly
+# believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”.  It triggered a downward spiral for Bitcoin value but
+# the cryptocurrency has stabilised since.   A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising
+# that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency."""
+
+
+# print(generate_questions_and_answers(xxx))
+#
+# x = generate_questions_and_answers(xxx)
+#
+# for i in x:
+#     print(i[0])
+#     print(i[1])
--- a/TopicIndexing/topics_find/summary.py
+++ b/TopicIndexing/topics_find/summary.py
+import glob
+from pptx import Presentation
+import math
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize, sent_tokenize
+import textract
+import os.path
+
+
+def create_sumall(abc, ratio):
+    if abc:
+        filename = abc
+        stop_word = ['is', 'a', 'and', 'the']
+
+        # Function to create Text summarization
+        def create_summ(text):
+            stopWords = set(stopwords.words("english"))
+            words = word_tokenize(text)
+            freqTable = dict()
+            for word in words:
+                word = word.lower()
+                if word in stopWords:
+                    continue
+                if word in freqTable:
+                    freqTable[word] += 1
+                else:
+                    freqTable[word] = 1
+
+            sentences = sent_tokenize(text)
+            sentenceValue = dict()
+
+            for sentence in sentences:
+                for word, freq in freqTable.items():
+                    if word in sentence.lower():
+                        if sentence in sentenceValue:
+                            sentenceValue[sentence] += freq
+                        else:
+                            sentenceValue[sentence] = freq
+
+            sumValues = 0
+            for sentence in sentenceValue:
+                sumValues += sentenceValue[sentence]
+
+            lensenvalu = len(sentenceValue)
+            if lensenvalu == 0:
+                lensenvalu = 1
+                average = int(sumValues / lensenvalu)
+            else:
+                average = int(sumValues / lensenvalu)
+
+            summary = ''
+            for sentence in sentences:
+                if (sentence in sentenceValue) and (sentenceValue[sentence] > (
+                        ratio * average)):
+                    summary += " " + sentence
+
+            return summary
+
+        def read_full_pptxe(filename):
+            sentences = []
+            b = []
+            a = 0
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            s = create_summ(shape.text.replace("\n", " "))
+                            s = str(s)
+                            if (len(s)) >= 1:
+                                f = ["Slide " + str(a) + "-" + s]
+                                sentences.append(f)
+            return sentences
+
+        def read_full_docx(filename):
+            sentences = []
+            text = textract.process(filename)
+            temp = text.split(".")
+            for t in temp:
+                sentences.append(t.replace("\n", " "))
+            return sentences
+
+        extension = os.path.splitext(filename)[1]
+
+        if extension == 'docx':
+            read_full_docx(filename)
+        else:
+            read_full_pptxe(filename)
+
+        def Convert(string):
+            li = list(string.split(" "))
+            return li
+
+        def Convert2(string):
+            li = list(string.split("\n"))
+            return li
+
+        def read_slide3(filename):
+            a = 1
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            if a == 4 and shape.shape_id == 3:
+                                s3 = str(shape.text)
+                                return s3
+
+        def read_full_pptx(filename, sss):
+            numberslide = []
+            numberslide.append(sss)
+            a = 0
+            for eachfile in glob.glob(filename):
+                prs = Presentation(eachfile)
+                for slide in prs.slides:
+                    a = a + 1
+                    for shape in slide.shapes:
+                        if hasattr(shape, "text"):
+                            if shape.shape_id != 2:
+                                s = shape.text.replace("\n", " ")
+                                s = str(s)
+                                if (len(s)) >= 20 and a != 3 and a != 1 and a != 2:
+                                    lo_1 = [a for a in new_l1 if a in s.lower()]
+                                    f_lo_l = round((len(lo_1) / len_of_l1) * 100)
+                                    if f_lo_l >= 50:
+                                        f = "Slide " + str(a)
+                                        numberslide.append(f)
+
+            return numberslide
+
+        loooo = Convert2(read_slide3(filename))
+
+        abc = []
+        for i in loooo:
+            l1 = Convert(i.lower())
+            new_l1 = [w for w in l1 if w not in stop_word]
+            len_of_l1 = len(new_l1)
+            read_full_pptx(filename, i)
+            abc.append(read_full_pptx(filename, i))
+
+        return (read_full_pptxe(filename), abc)
+    else:
+        print('error')
--- a/TopicIndexing/topics_find/test_api.py
+++ b/TopicIndexing/topics_find/test_api.py
+from flask import Flask, request, url_for, redirect, render_template
+from flask_cors import CORS
+import werkzeug
+import topics_find.summary as summarizeed
+import json
+import textract
+from pptx import Presentation
+import os
+
+# Flask application object for this module; CORS(app) applies flask_cors to the whole app.
+app = Flask(__name__)
+CORS(app)
+
+
+@app.route('/summerize', methods=['GET', 'POST'])
+def summerize():
+    file = request.files['file']
+    ratio = float(request.form['ratio'])
+    filename = werkzeug.utils.secure_filename(file.filename)
+    print("\nReceived image File name : " + file.filename)
+    file.save('upload/' + filename)
+
+    f, file_extension = os.path.splitext('upload/' + filename)
+    print(file_extension)
+
+    if file_extension == '.docx':
+        text = textract.process('upload/' + filename)
+        arr = str(text).replace("\\n", "")
+        arr = arr.replace("\\t", "")
+        arr = arr.replace("\\", "")
+
+        prs = Presentation()
+
+        lyt = prs.slide_layouts[0]  # choosing a slide layout
+
+        for x in range(0, 3):
+
+            if x == 2:
+                slide = prs.slides.add_slide(lyt)  # adding a slide
+                title = slide.shapes.title  # assigning a title
+                subtitle = slide.placeholders[1]  # placeholder for subtitle
+                subtitle.text = arr
+            else:
+                slide = prs.slides.add_slide(lyt)  # adding a slide
+                title = slide.shapes.title  # assigning a title
+                subtitle = slide.placeholders[1]  # placeholder for subtitle
+                title.text = "ignore"  # title
+                subtitle.text = "ignore"  # subtitle
+        prs.save("upload/slide3.pptx")  # saving file
+        print('file saved')
+        res = summarizeed.create_sumall('upload/slide3.pptx', ratio)
+    else:
+        res = summarizeed.create_sumall('upload/' + filename, ratio)
+
+    rr = []
+    for r in res[0]:
+        rr.append(r[0].replace('"', ''))
+
+    return_str = '{ "result" : ['
+
+    for i in range(len(rr)):
+        if i == len(rr) - 1:
+            return_str += '"' + rr[i] + '"'
+        else:
+            return_str += '"' + rr[i] + '"' + ','
+    return_str += ']}'
+
+    print(return_str)
+
+    return json.loads(return_str)
+
+
+# Start the development server on all interfaces, port 5005, when run as a script.
+# NOTE(review): debug=True enables the interactive debugger while host="0.0.0.0"
+# accepts connections from other machines - confirm this is never used outside
+# local development.
+if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=5005, debug=True)
--- a/TopicIndexing/topics_find/text_gen.py
+++ b/TopicIndexing/topics_find/text_gen.py
+import speech_recognition as sr
+import subprocess
+import os
+import sys
+
+# NOTE(review): this is a plain module variable; assigning it here does not set
+# the PYTHONIOENCODING environment variable of the interpreter.
+PYTHONIOENCODING = "UTF-8"
+
+# Folder names and language code; in the visible part of this module they are
+# referenced only by the commented-out batch script below
+# (convert_audio_to_text passes 'en-IN' directly).
+FOLDER_AUDIO = "audio_input"
+FOLDER_TEXT = "text_output"
+LANGUAGE = "en-US"
+
+
+# print("starting...")
+#
+# if not os.path.isdir(FOLDER_AUDIO):
+#     os.mkdir(FOLDER_AUDIO)
+#
+# if not os.path.isdir(FOLDER_TEXT):
+#     os.mkdir(FOLDER_TEXT)
+#
+# paths = [os.path.join(FOLDER_AUDIO, nome) for nome in os.listdir(FOLDER_AUDIO)]
+# files = [arq for arq in paths if os.path.isfile(arq)]
+# wav_files = [arq for arq in files if arq.lower().endswith(".wav")]
+#
+# for filename in wav_files:
+#     r = sr.Recognizer()
+#     with sr.AudioFile(filename) as source:
+#         audio = r.record(source)
+#
+#     command = r.recognize_google(audio, language='en-IN', show_all=True)
+#     print(command)
+#
+#     print("running file {}".format(filename))
+#
+#     filefinal = filename.split("audio_input/")[1].split(".wav")[0]
+#     filefinal = '{}/{}.txt'.format(FOLDER_TEXT, filefinal)
+#     with open(filefinal, 'w') as arq:
+#         arq.write(str(command))
+#
+#     print("create a new file {}".format(filefinal))
+#
+# print("finish")
+
+
+def convert_audio_to_text(filename):
+    r = sr.Recognizer()
+    with sr.AudioFile(filename) as source:
+        audio = r.record(source)
+
+    try:
+        command = r.recognize_google(audio, language='en-IN', show_all=True)
+        print(command["alternative"][0]["transcript"])
+        return command["alternative"][0]["transcript"]
+
+    except:
+        return 'did not convert'
+
+# convert_audio_to_text('audio_input/3.wav')