Commit 540b8a5d authored by chaveenagit

topic

parent 5854f523
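
# --- Video splitting and transcription helpers ---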
import moviepy.editor as mp
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from moviepy.editor import VideoFileClip
import os
from topics_find import text_gen


def convert_video_to_audio(filename):
    # Extract the audio track from a video slice and hand it to the
    # speech-to-text helper for transcription.
    clip = mp.VideoFileClip(filename)
    audio_file_name = str(filename).split('/')[-1].replace('.mp4', '.wav')
    clip.audio.write_audiofile("topics_find/audio_input/" + audio_file_name)
    return text_gen.convert_audio_to_text("topics_find/audio_input/" + audio_file_name)


def split_video_file(filename):
    return_list = []
    all_text = ''
    required_video_file = filename

    # Remove subclips left over from a previous run.
    files = os.listdir('topics_find/video_input')
    for old_file in files:
        os.remove('topics_find/video_input/' + old_file)

    total_length = VideoFileClip(required_video_file).duration
    print(total_length)

    # Cut the video into 50-second slices and transcribe each one.
    no_of_slices = int(total_length / 50) + 1
    time_grid = [i * 50 for i in range(no_of_slices)]

    for i in range(no_of_slices):
        if i == len(time_grid) - 1:
            # The final partial slice is currently skipped.
            # ffmpeg_extract_subclip(required_video_file, time_grid[i], total_length - time_grid[i],
            #                        targetname='videos/' + str(i) + ".mp4")
            pass
        else:
            ffmpeg_extract_subclip(required_video_file, time_grid[i], time_grid[i + 1],
                                   targetname='topics_find/video_input/' + str(i) + ".mp4")
            text = convert_video_to_audio('topics_find/video_input/' + str(i) + ".mp4")
            all_text += text + ' '
            return_list.append([i, text])

    return return_list, all_text
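
# A usage sketch (not part of the commit): split_video_file expects the
# topics_find/video_input/ and topics_find/audio_input/ directories to exist;
# it returns the per-slice transcripts and the full concatenated text.
# The video path here is hypothetical.
#
# slices, full_text = split_video_file('topics_find/lecture.mp4')

# --- Topic extraction helpers (built on topics_find.question_generator) ---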
import nltk
import topics_find.question_generator as q_gen
# nltk.download('words')
# from bertopic import BERTopic
from nltk.corpus import words
# model = BERTopic(verbose=True)


def get_topics(file):
    # Legacy BERTopic-based extraction; it needs the commented-out BERTopic
    # import and `model` above to be re-enabled before it can run.
    topics_outputs = []
    docs = []
    with open(file) as fh:
        for line in fh:
            docs.append(line.rstrip())
    topics, probabilities = model.fit_transform(docs)
    print(model.get_topic_freq())
    print('done')
    # model.get_topic_freq().head(11)
    print(model.get_topics())
    for i in model.get_topic(0):
        # Keep only tokens that are not ordinary dictionary words.
        if i[0] not in words.words():
            print(i[0])
            topics_outputs.append(i[0])
    return topics_outputs


def get_topics_new(text):
    # Current extraction path: summarise the transcript, then keep the
    # keyphrases that survive summarisation.
    topics = q_gen.get_keywords(text, q_gen.summarizer(text))
    print(topics)
    return topics
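
# --- Summarisation and question generation (imported above as topics_find.question_generator) ---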
from textwrap3 import wrap
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
import numpy as np
import nltk
# nltk.download('punkt')
# nltk.download('brown')
# nltk.download('wordnet')
# nltk.download('stopwords')
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
import pke
import traceback
from flashtext import KeywordProcessor
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = summary_model.to(device)


def set_seed(seed: int):
    # Seed every RNG so question generation is reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def postprocesstext(content):
    # Capitalise the first letter of every sentence in the generated summary.
    final = ""
    for sent in sent_tokenize(content):
        sent = sent.capitalize()
        final = final + " " + sent
    return final


def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
    text = text.strip().replace("\n", " ")
    text = "summarize: " + text
    # print(text)
    max_len = 512
    encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True,
                                     return_tensors="pt").to(device)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          early_stopping=True,
                          num_beams=3,
                          num_return_sequences=1,
                          no_repeat_ngram_size=2,
                          min_length=75,
                          max_length=300)
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    summary = dec[0]
    summary = postprocesstext(summary)
    summary = summary.strip()
    return summary


def get_nouns_multipartite(content):
    # Extract candidate keyphrases (nouns and proper nouns) with pke's
    # MultipartiteRank.
    out = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content)
        # Candidates must not contain punctuation marks or stopwords.
        pos = {'PROPN', 'NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos, stoplist=stoplist)
        # Build the Multipartite graph and rank candidates with a random walk;
        # alpha controls the weight adjustment mechanism (see TopicRank for the
        # threshold/method parameters).
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)
        for val in keyphrases:
            out.append(val[0])
    except Exception:
        out = []
        traceback.print_exc()
    return out


def get_keywords(originaltext, summarytext):
    # Keep only the keyphrases from the original text that also appear in the
    # summary, and return the single best one as the answer candidate.
    keywords = get_nouns_multipartite(originaltext)
    print("keywords unsummarized: ", keywords)
    keyword_processor = KeywordProcessor()
    for keyword in keywords:
        keyword_processor.add_keyword(keyword)
    keywords_found = keyword_processor.extract_keywords(summarytext)
    keywords_found = list(set(keywords_found))
    print("keywords_found in summarized: ", keywords_found)
    important_keywords = []
    for keyword in keywords:
        if keyword in keywords_found:
            important_keywords.append(keyword)
    return important_keywords[:1]


question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = question_model.to(device)


def get_question(context, answer, model, tokenizer):
    # Generate a question whose answer is `answer`, conditioned on `context`.
    text = "context: {} answer: {}".format(context, answer)
    encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False, truncation=True,
                                     return_tensors="pt").to(device)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          early_stopping=True,
                          num_beams=5,
                          num_return_sequences=1,
                          no_repeat_ngram_size=2,
                          max_length=72)
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    question = dec[0].replace("question:", "")
    question = question.strip()
    return question


def generate_questions_and_answers(text):
    # Summarise the text, pick the key answer phrase, and generate a question
    # for it; returns a list of [question, answer] pairs.
    set_seed(42)
    summarized_text = summarizer(text, summary_model, summary_tokenizer)
    imp_keywords = get_keywords(text, summarized_text)
    question_and_answer_list = []
    for answer in imp_keywords:
        ques = get_question(summarized_text, answer, question_model, question_tokenizer)
        question_and_answer_list.append([ques, answer.capitalize()])
    return question_and_answer_list

# xxx = """Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
# Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
# system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
# rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,
# Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
# transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly
# believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but
# the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising
# that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency."""
# print(generate_questions_and_answers(xxx))
#
# x = generate_questions_and_answers(xxx)
#
# for i in x:
#     print(i[0])
#     print(i[1])
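
# --- Slide/document summarisation (imported by the Flask app below as topics_find.summary) ---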
import glob
from pptx import Presentation
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import textract
import os.path


def create_sumall(abc, ratio):
    if abc:
        filename = abc
        stop_word = ['is', 'a', 'and', 'the']

        # Frequency-based extractive text summarisation.
        def create_summ(text):
            stopWords = set(stopwords.words("english"))
            words = word_tokenize(text)
            freqTable = dict()
            for word in words:
                word = word.lower()
                if word in stopWords:
                    continue
                if word in freqTable:
                    freqTable[word] += 1
                else:
                    freqTable[word] = 1
            sentences = sent_tokenize(text)
            sentenceValue = dict()
            for sentence in sentences:
                for word, freq in freqTable.items():
                    if word in sentence.lower():
                        if sentence in sentenceValue:
                            sentenceValue[sentence] += freq
                        else:
                            sentenceValue[sentence] = freq
            sumValues = 0
            for sentence in sentenceValue:
                sumValues += sentenceValue[sentence]
            lensenvalu = len(sentenceValue)
            if lensenvalu == 0:
                lensenvalu = 1
            average = int(sumValues / lensenvalu)
            # Keep sentences scoring above ratio * average.
            summary = ''
            for sentence in sentences:
                if (sentence in sentenceValue) and (sentenceValue[sentence] > (ratio * average)):
                    summary += " " + sentence
            return summary

        def read_full_pptxe(filename):
            # Summarise every text shape on every slide of the .pptx.
            sentences = []
            a = 0
            for eachfile in glob.glob(filename):
                prs = Presentation(eachfile)
                for slide in prs.slides:
                    a = a + 1
                    for shape in slide.shapes:
                        if hasattr(shape, "text"):
                            s = create_summ(shape.text.replace("\n", " "))
                            s = str(s)
                            if len(s) >= 1:
                                f = ["Slide " + str(a) + "-" + s]
                                sentences.append(f)
            return sentences

        def read_full_docx(filename):
            # Split the extracted .docx text into sentences.
            sentences = []
            text = textract.process(filename).decode('utf-8', errors='ignore')
            temp = text.split(".")
            for t in temp:
                sentences.append(t.replace("\n", " "))
            return sentences

        extension = os.path.splitext(filename)[1]
        if extension == '.docx':
            read_full_docx(filename)  # result is not used here
        else:
            read_full_pptxe(filename)  # result is not used here

        def Convert(string):
            return list(string.split(" "))

        def Convert2(string):
            return list(string.split("\n"))

        def read_slide3(filename):
            # Return the text of the shape with id 3 on the third slide
            # (the slide counter `a` starts at 1).
            a = 1
            for eachfile in glob.glob(filename):
                prs = Presentation(eachfile)
                for slide in prs.slides:
                    a = a + 1
                    for shape in slide.shapes:
                        if hasattr(shape, "text"):
                            if a == 4 and shape.shape_id == 3:
                                s3 = str(shape.text)
                                return s3

        def read_full_pptx(filename, sss):
            # Collect the slides whose text overlaps at least 50% with the
            # words in `new_l1` (set in the loop below).
            numberslide = []
            numberslide.append(sss)
            a = 0
            for eachfile in glob.glob(filename):
                prs = Presentation(eachfile)
                for slide in prs.slides:
                    a = a + 1
                    for shape in slide.shapes:
                        if hasattr(shape, "text"):
                            if shape.shape_id != 2:
                                s = shape.text.replace("\n", " ")
                                s = str(s)
                                if len(s) >= 20 and a != 3 and a != 1 and a != 2:
                                    lo_1 = [w for w in new_l1 if w in s.lower()]
                                    f_lo_l = round((len(lo_1) / len_of_l1) * 100)
                                    if f_lo_l >= 50:
                                        f = "Slide " + str(a)
                                        numberslide.append(f)
            return numberslide

        # Split the text returned by read_slide3 into lines; for each line,
        # find the slides whose content overlaps it, then return the per-slide
        # summaries together with that mapping.
        loooo = Convert2(read_slide3(filename))
        abc = []
        for i in loooo:
            l1 = Convert(i.lower())
            new_l1 = [w for w in l1 if w not in stop_word]
            len_of_l1 = len(new_l1)
            abc.append(read_full_pptx(filename, i))
        return (read_full_pptxe(filename), abc)
    else:
        print('error')
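
# --- Flask API exposing the /summerize endpoint ---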
from flask import Flask, request, url_for, redirect, render_template
from flask_cors import CORS
import werkzeug
import topics_find.summary as summarizeed
import json
import textract
from pptx import Presentation
import os
app = Flask(__name__)
CORS(app)


@app.route('/summerize', methods=['GET', 'POST'])
def summerize():
    file = request.files['file']
    ratio = float(request.form['ratio'])
    filename = werkzeug.utils.secure_filename(file.filename)
    print("\nReceived file name : " + file.filename)
    file.save('upload/' + filename)
    f, file_extension = os.path.splitext('upload/' + filename)
    print(file_extension)

    if file_extension == '.docx':
        # Wrap the extracted .docx text in a three-slide .pptx so the
        # pptx-based summariser can handle it (the content goes on slide 3).
        text = textract.process('upload/' + filename)
        arr = str(text).replace("\\n", "")
        arr = arr.replace("\\t", "")
        arr = arr.replace("\\", "")
        prs = Presentation()
        lyt = prs.slide_layouts[0]  # choosing a slide layout
        for x in range(0, 3):
            if x == 2:
                slide = prs.slides.add_slide(lyt)  # adding a slide
                title = slide.shapes.title  # assigning a title
                subtitle = slide.placeholders[1]  # placeholder for subtitle
                subtitle.text = arr
            else:
                slide = prs.slides.add_slide(lyt)  # adding a slide
                title = slide.shapes.title  # assigning a title
                subtitle = slide.placeholders[1]  # placeholder for subtitle
                title.text = "ignore"  # title
                subtitle.text = "ignore"  # subtitle
        prs.save("upload/slide3.pptx")  # saving file
        print('file saved')
        res = summarizeed.create_sumall('upload/slide3.pptx', ratio)
    else:
        res = summarizeed.create_sumall('upload/' + filename, ratio)

    # Flatten the per-slide summaries and build the JSON response by hand:
    # {"result": ["Slide 1-...", "Slide 2-...", ...]}.
    rr = []
    for r in res[0]:
        rr.append(r[0].replace('"', ''))
    return_str = '{ "result" : ['
    for i in range(len(rr)):
        if i == len(rr) - 1:
            return_str += '"' + rr[i] + '"'
        else:
            return_str += '"' + rr[i] + '"' + ','
    return_str += ']}'
    print(return_str)
    return json.loads(return_str)


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5005, debug=True)
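
# A client sketch (not part of the commit) showing how the /summerize endpoint
# above expects to be called: a multipart POST with a 'file' part (.pptx or
# .docx) and a 'ratio' form field. Host/port follow app.run(...) above; the
# file name is hypothetical.
#
# import requests
#
# with open('lecture.pptx', 'rb') as f:
#     resp = requests.post('http://localhost:5005/summerize',
#                          files={'file': f},
#                          data={'ratio': '1.0'})
# print(resp.json())

# --- Speech-to-text helper (imported above as topics_find.text_gen) ---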
import speech_recognition as sr
import subprocess
import os
import sys
PYTHONIOENCODING = "UTF-8"
FOLDER_AUDIO = "audio_input"
FOLDER_TEXT = "text_output"
LANGUAGE = "en-US"
# print("starting...")
#
# if not os.path.isdir(FOLDER_AUDIO):
# os.mkdir(FOLDER_AUDIO)
#
# if not os.path.isdir(FOLDER_TEXT):
# os.mkdir(FOLDER_TEXT)
#
# paths = [os.path.join(FOLDER_AUDIO, nome) for nome in os.listdir(FOLDER_AUDIO)]
# files = [arq for arq in paths if os.path.isfile(arq)]
# wav_files = [arq for arq in files if arq.lower().endswith(".wav")]
#
# for filename in wav_files:
# r = sr.Recognizer()
# with sr.AudioFile(filename) as source:
# audio = r.record(source)
#
# command = r.recognize_google(audio, language='en-IN', show_all=True)
# print(command)
#
# print("running file {}".format(filename))
#
# filefinal = filename.split("audio_input/")[1].split(".wav")[0]
# filefinal = '{}/{}.txt'.format(FOLDER_TEXT, filefinal)
# with open(filefinal, 'w') as arq:
# arq.write(str(command))
#
# print("create a new file {}".format(filefinal))
#
# print("finish")


def convert_audio_to_text(filename):
    # Transcribe a .wav file with the Google Web Speech API (Indian English).
    r = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = r.record(source)
    try:
        command = r.recognize_google(audio, language='en-IN', show_all=True)
        print(command["alternative"][0]["transcript"])
        return command["alternative"][0]["transcript"]
    except Exception:
        return 'did not convert'
# convert_audio_to_text('audio_input/3.wav')