Commit 540b8a5d authored by chaveenagit

topic

parent 5854f523
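
# --- Video splitting and transcription helpers ---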
import moviepy.editor as mp
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from moviepy.editor import VideoFileClip
import os
from topics_find import text_gen


def convert_video_to_audio(filename):
    # Extract the audio track from a video slice and hand it to the
    # speech-to-text helper for transcription.
    clip = mp.VideoFileClip(filename)
    audio_file_name = str(filename).split('/')[-1].replace('.mp4', '.wav')
    clip.audio.write_audiofile("topics_find/audio_input/" + audio_file_name)
    return text_gen.convert_audio_to_text("topics_find/audio_input/" + audio_file_name)


def split_video_file(filename):
    return_list = []
    all_text = ''
    required_video_file = filename

    # Remove subclips left over from a previous run.
    files = os.listdir('topics_find/video_input')
    for old_file in files:
        os.remove('topics_find/video_input/' + old_file)

    total_length = VideoFileClip(required_video_file).duration
    print(total_length)

    # Cut the video into 50-second slices and transcribe each one.
    no_of_slices = int(total_length / 50) + 1
    time_grid = [i * 50 for i in range(no_of_slices)]

    for i in range(no_of_slices):
        if i == len(time_grid) - 1:
            # The final partial slice is currently skipped.
            # ffmpeg_extract_subclip(required_video_file, time_grid[i], total_length - time_grid[i],
            #                        targetname='videos/' + str(i) + ".mp4")
            pass
        else:
            ffmpeg_extract_subclip(required_video_file, time_grid[i], time_grid[i + 1],
                                   targetname='topics_find/video_input/' + str(i) + ".mp4")
            text = convert_video_to_audio('topics_find/video_input/' + str(i) + ".mp4")
            all_text += text + ' '
            return_list.append([i, text])

    return return_list, all_text
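
# A usage sketch (not part of the commit): split_video_file expects the
# topics_find/video_input/ and topics_find/audio_input/ directories to exist;
# it returns the per-slice transcripts and the full concatenated text.
# The video path here is hypothetical.
#
# slices, full_text = split_video_file('topics_find/lecture.mp4')

# --- Topic extraction helpers (built on topics_find.question_generator) ---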
import nltk
import topics_find.question_generator as q_gen
# nltk.download('words')
# from bertopic import BERTopic
from nltk.corpus import words
# model = BERTopic(verbose=True)


def get_topics(file):
    # Legacy BERTopic-based extraction; it needs the commented-out BERTopic
    # import and `model` above to be re-enabled before it can run.
    topics_outputs = []
    docs = []
    with open(file) as fh:
        for line in fh:
            docs.append(line.rstrip())
    topics, probabilities = model.fit_transform(docs)
    print(model.get_topic_freq())
    print('done')
    # model.get_topic_freq().head(11)
    print(model.get_topics())
    for i in model.get_topic(0):
        # Keep only tokens that are not ordinary dictionary words.
        if i[0] not in words.words():
            print(i[0])
            topics_outputs.append(i[0])
    return topics_outputs


def get_topics_new(text):
    # Current extraction path: summarise the transcript, then keep the
    # keyphrases that survive summarisation.
    topics = q_gen.get_keywords(text, q_gen.summarizer(text))
    print(topics)
    return topics
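
# --- Summarisation and question generation (imported above as topics_find.question_generator) ---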
from textwrap3 import wrap
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
import numpy as np
import nltk
# nltk.download('punkt')
# nltk.download('brown')
# nltk.download('wordnet')
# nltk.download('stopwords')
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
import pke
import traceback
from flashtext import KeywordProcessor
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = summary_model.to(device)


def set_seed(seed: int):
    # Seed every RNG so question generation is reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def postprocesstext(content):
    # Capitalise the first letter of every sentence in the generated summary.
    final = ""
    for sent in sent_tokenize(content):
        sent = sent.capitalize()
        final = final + " " + sent
    return final


def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
    text = text.strip().replace("\n", " ")
    text = "summarize: " + text
    # print(text)
    max_len = 512
    encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True,
                                     return_tensors="pt").to(device)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          early_stopping=True,
                          num_beams=3,
                          num_return_sequences=1,
                          no_repeat_ngram_size=2,
                          min_length=75,
                          max_length=300)
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    summary = dec[0]
    summary = postprocesstext(summary)
    summary = summary.strip()
    return summary


def get_nouns_multipartite(content):
    # Extract candidate keyphrases (nouns and proper nouns) with pke's
    # MultipartiteRank.
    out = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content)
        # Candidates must not contain punctuation marks or stopwords.
        pos = {'PROPN', 'NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos, stoplist=stoplist)
        # Build the Multipartite graph and rank candidates with a random walk;
        # alpha controls the weight adjustment mechanism (see TopicRank for the
        # threshold/method parameters).
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)
        for val in keyphrases:
            out.append(val[0])
    except Exception:
        out = []
        traceback.print_exc()
    return out


def get_keywords(originaltext, summarytext):
    # Keep only the keyphrases from the original text that also appear in the
    # summary, and return the single best one as the answer candidate.
    keywords = get_nouns_multipartite(originaltext)
    print("keywords unsummarized: ", keywords)
    keyword_processor = KeywordProcessor()
    for keyword in keywords:
        keyword_processor.add_keyword(keyword)
    keywords_found = keyword_processor.extract_keywords(summarytext)
    keywords_found = list(set(keywords_found))
    print("keywords_found in summarized: ", keywords_found)
    important_keywords = []
    for keyword in keywords:
        if keyword in keywords_found:
            important_keywords.append(keyword)
    return important_keywords[:1]


question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = question_model.to(device)


def get_question(context, answer, model, tokenizer):
    # Generate a question whose answer is `answer`, conditioned on `context`.
    text = "context: {} answer: {}".format(context, answer)
    encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False, truncation=True,
                                     return_tensors="pt").to(device)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          early_stopping=True,
                          num_beams=5,
                          num_return_sequences=1,
                          no_repeat_ngram_size=2,
                          max_length=72)
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    question = dec[0].replace("question:", "")
    question = question.strip()
    return question


def generate_questions_and_answers(text):
    # Summarise the text, pick the key answer phrase, and generate a question
    # for it; returns a list of [question, answer] pairs.
    set_seed(42)
    summarized_text = summarizer(text, summary_model, summary_tokenizer)
    imp_keywords = get_keywords(text, summarized_text)
    question_and_answer_list = []
    for answer in imp_keywords:
        ques = get_question(summarized_text, answer, question_model, question_tokenizer)
        question_and_answer_list.append([ques, answer.capitalize()])
    return question_and_answer_list

# xxx = """Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
# Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
# system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
# rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,
# Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
# transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly
# believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but
# the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising
# that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency."""
# print(generate_questions_and_answers(xxx))
#
# x = generate_questions_and_answers(xxx)
#
# for i in x:
#     print(i[0])
#     print(i[1])
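
# --- Slide/document summarisation (imported by the Flask app below as topics_find.summary) ---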
import glob
from pptx import Presentation
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import textract
import os.path


def create_sumall(abc, ratio):
    if abc:
        filename = abc
        stop_word = ['is', 'a', 'and', 'the']

        # Frequency-based extractive text summarisation.
        def create_summ(text):
            stopWords = set(stopwords.words("english"))
            words = word_tokenize(text)
            freqTable = dict()
            for word in words:
                word = word.lower()
                if word in stopWords:
                    continue
                if word in freqTable:
                    freqTable[word] += 1
                else:
                    freqTable[word] = 1
            sentences = sent_tokenize(text)
            sentenceValue = dict()
            for sentence in sentences:
                for word, freq in freqTable.items():
                    if word in sentence.lower():
                        if sentence in sentenceValue:
                            sentenceValue[sentence] += freq
                        else:
                            sentenceValue[sentence] = freq
            sumValues = 0
            for sentence in sentenceValue:
                sumValues += sentenceValue[sentence]
            lensenvalu = len(sentenceValue)
            if lensenvalu == 0:
                lensenvalu = 1
            average = int(sumValues / lensenvalu)
            # Keep sentences scoring above ratio * average.
            summary = ''
            for sentence in sentences:
                if (sentence in sentenceValue) and (sentenceValue[sentence] > (ratio * average)):
                    summary += " " + sentence
            return summary

        def read_full_pptxe(filename):
            # Summarise every text shape on every slide of the .pptx.
            sentences = []
            a = 0
            for eachfile in glob.glob(filename):
                prs = Presentation(eachfile)
                for slide in prs.slides:
                    a = a + 1
                    for shape in slide.shapes:
                        if hasattr(shape, "text"):
                            s = create_summ(shape.text.replace("\n", " "))
                            s = str(s)
                            if len(s) >= 1:
                                f = ["Slide " + str(a) + "-" + s]
                                sentences.append(f)
            return sentences

        def read_full_docx(filename):
            # Split the extracted .docx text into sentences.
            sentences = []
            text = textract.process(filename).decode('utf-8', errors='ignore')
            temp = text.split(".")
            for t in temp:
                sentences.append(t.replace("\n", " "))
            return sentences

        extension = os.path.splitext(filename)[1]
        if extension == '.docx':
            read_full_docx(filename)  # result is not used here
        else:
            read_full_pptxe(filename)  # result is not used here

        def Convert(string):
            return list(string.split(" "))

        def Convert2(string):
            return list(string.split("\n"))

        def read_slide3(filename):
            # Return the text of the shape with id 3 on the third slide
            # (the slide counter `a` starts at 1).
            a = 1
            for eachfile in glob.glob(filename):
                prs = Presentation(eachfile)
                for slide in prs.slides:
                    a = a + 1
                    for shape in slide.shapes:
                        if hasattr(shape, "text"):
                            if a == 4 and shape.shape_id == 3:
                                s3 = str(shape.text)
                                return s3

        def read_full_pptx(filename, sss):
            # Collect the slides whose text overlaps at least 50% with the
            # words in `new_l1` (set in the loop below).
            numberslide = []
            numberslide.append(sss)
            a = 0
            for eachfile in glob.glob(filename):
                prs = Presentation(eachfile)
                for slide in prs.slides:
                    a = a + 1
                    for shape in slide.shapes:
                        if hasattr(shape, "text"):
                            if shape.shape_id != 2:
                                s = shape.text.replace("\n", " ")
                                s = str(s)
                                if len(s) >= 20 and a != 3 and a != 1 and a != 2:
                                    lo_1 = [w for w in new_l1 if w in s.lower()]
                                    f_lo_l = round((len(lo_1) / len_of_l1) * 100)
                                    if f_lo_l >= 50:
                                        f = "Slide " + str(a)
                                        numberslide.append(f)
            return numberslide

        # Split the text returned by read_slide3 into lines; for each line,
        # find the slides whose content overlaps it, then return the per-slide
        # summaries together with that mapping.
        loooo = Convert2(read_slide3(filename))
        abc = []
        for i in loooo:
            l1 = Convert(i.lower())
            new_l1 = [w for w in l1 if w not in stop_word]
            len_of_l1 = len(new_l1)
            abc.append(read_full_pptx(filename, i))
        return (read_full_pptxe(filename), abc)
    else:
        print('error')
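
# --- Flask API exposing the /summerize endpoint ---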
from flask import Flask, request, url_for, redirect, render_template
from flask_cors import CORS
import werkzeug
import topics_find.summary as summarizeed
import json
import textract
from pptx import Presentation
import os
app = Flask(__name__)
CORS(app)


@app.route('/summerize', methods=['GET', 'POST'])
def summerize():
    file = request.files['file']
    ratio = float(request.form['ratio'])
    filename = werkzeug.utils.secure_filename(file.filename)
    print("\nReceived file name : " + file.filename)
    file.save('upload/' + filename)
    f, file_extension = os.path.splitext('upload/' + filename)
    print(file_extension)

    if file_extension == '.docx':
        # Wrap the extracted .docx text in a three-slide .pptx so the
        # pptx-based summariser can handle it (the content goes on slide 3).
        text = textract.process('upload/' + filename)
        arr = str(text).replace("\\n", "")
        arr = arr.replace("\\t", "")
        arr = arr.replace("\\", "")
        prs = Presentation()
        lyt = prs.slide_layouts[0]  # choosing a slide layout
        for x in range(0, 3):
            if x == 2:
                slide = prs.slides.add_slide(lyt)  # adding a slide
                title = slide.shapes.title  # assigning a title
                subtitle = slide.placeholders[1]  # placeholder for subtitle
                subtitle.text = arr
            else:
                slide = prs.slides.add_slide(lyt)  # adding a slide
                title = slide.shapes.title  # assigning a title
                subtitle = slide.placeholders[1]  # placeholder for subtitle
                title.text = "ignore"  # title
                subtitle.text = "ignore"  # subtitle
        prs.save("upload/slide3.pptx")  # saving file
        print('file saved')
        res = summarizeed.create_sumall('upload/slide3.pptx', ratio)
    else:
        res = summarizeed.create_sumall('upload/' + filename, ratio)

    # Flatten the per-slide summaries and build the JSON response by hand:
    # {"result": ["Slide 1-...", "Slide 2-...", ...]}.
    rr = []
    for r in res[0]:
        rr.append(r[0].replace('"', ''))
    return_str = '{ "result" : ['
    for i in range(len(rr)):
        if i == len(rr) - 1:
            return_str += '"' + rr[i] + '"'
        else:
            return_str += '"' + rr[i] + '"' + ','
    return_str += ']}'
    print(return_str)
    return json.loads(return_str)


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5005, debug=True)
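
# A client sketch (not part of the commit) showing how the /summerize endpoint
# above expects to be called: a multipart POST with a 'file' part (.pptx or
# .docx) and a 'ratio' form field. Host/port follow app.run(...) above; the
# file name is hypothetical.
#
# import requests
#
# with open('lecture.pptx', 'rb') as f:
#     resp = requests.post('http://localhost:5005/summerize',
#                          files={'file': f},
#                          data={'ratio': '1.0'})
# print(resp.json())

# --- Speech-to-text helper (imported above as topics_find.text_gen) ---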
import speech_recognition as sr
import subprocess
import os
import sys
PYTHONIOENCODING = "UTF-8"
FOLDER_AUDIO = "audio_input"
FOLDER_TEXT = "text_output"
LANGUAGE = "en-US"
# print("starting...")
#
# if not os.path.isdir(FOLDER_AUDIO):
# os.mkdir(FOLDER_AUDIO)
#
# if not os.path.isdir(FOLDER_TEXT):
# os.mkdir(FOLDER_TEXT)
#
# paths = [os.path.join(FOLDER_AUDIO, nome) for nome in os.listdir(FOLDER_AUDIO)]
# files = [arq for arq in paths if os.path.isfile(arq)]
# wav_files = [arq for arq in files if arq.lower().endswith(".wav")]
#
# for filename in wav_files:
# r = sr.Recognizer()
# with sr.AudioFile(filename) as source:
# audio = r.record(source)
#
# command = r.recognize_google(audio, language='en-IN', show_all=True)
# print(command)
#
# print("running file {}".format(filename))
#
# filefinal = filename.split("audio_input/")[1].split(".wav")[0]
# filefinal = '{}/{}.txt'.format(FOLDER_TEXT, filefinal)
# with open(filefinal, 'w') as arq:
# arq.write(str(command))
#
# print("create a new file {}".format(filefinal))
#
# print("finish")


def convert_audio_to_text(filename):
    # Transcribe a .wav file with the Google Web Speech API (Indian English).
    r = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = r.record(source)
    try:
        command = r.recognize_google(audio, language='en-IN', show_all=True)
        print(command["alternative"][0]["transcript"])
        return command["alternative"][0]["transcript"]
    except Exception:
        return 'did not convert'
# convert_audio_to_text('audio_input/3.wav')