Commit c66ff3ca authored by De Silva K.C.C.C

remove stop words

parent 7d0d8b06
import glob
from pptx import Presentation
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import textract
import os.path
# Words dropped when a line of the keyword text box is turned into search terms.
_IGNORED_KEYWORDS = ('is', 'a', 'and', 'the')
# Slide number and shape id of the text box whose lines supply the keywords.
_KEYWORD_SLIDE = 3
_KEYWORD_SHAPE_ID = 3
# Slides, and the shape id, that are never reported as keyword matches.
_UNMATCHED_SLIDES = (1, 2, 3)
_UNMATCHED_SHAPE_ID = 2
# A text box needs at least this many characters to be considered for matching.
_MIN_MATCH_TEXT_LENGTH = 20
# Minimum share (in percent) of a line's keywords that a text box must contain.
_MIN_MATCH_PERCENT = 50


def _collect_text_shapes(pattern):
    """Return ``(slide_number, shape_id, text)`` for every text-bearing shape.

    ``pattern`` is expanded with ``glob.glob``.  Slide numbers start at 1 and
    keep counting across files when the pattern matches more than one.
    """
    records = []
    slide_number = 0
    for path in glob.glob(pattern):
        for slide in Presentation(path).slides:
            slide_number += 1
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    records.append((slide_number, shape.shape_id, shape.text))
    return records


def _summarise_text(text, ratio, stop_words):
    """Return the sentences of ``text`` scoring above ``ratio`` x the average.

    Each kept sentence is prefixed with a single space; the result is an
    empty string when no sentence qualifies.
    """
    # Frequency of every token that is not a stop word, case-insensitively.
    word_counts = {}
    for token in word_tokenize(text):
        token = token.lower()
        if token in stop_words:
            continue
        word_counts[token] = word_counts.get(token, 0) + 1

    sentences = sent_tokenize(text)
    # Scores are keyed by sentence text; a sentence only gets an entry once
    # at least one counted word occurs in it.
    scores = {}
    for sentence in sentences:
        lowered = sentence.lower()
        for word, count in word_counts.items():
            # Substring test (not whole-word), kept from the original code.
            if word in lowered:
                scores[sentence] = scores.get(sentence, 0) + count

    # ``or 1`` avoids dividing by zero when no sentence was scored.
    average = int(sum(scores.values()) / (len(scores) or 1))
    threshold = ratio * average
    return "".join(
        " " + sentence
        for sentence in sentences
        if sentence in scores and scores[sentence] > threshold
    )


def _summarise_shapes(records, ratio):
    """Return a ``"Slide N-<summary>"`` string per shape with a non-empty summary."""
    if not records:
        # Nothing to summarise, so the NLTK stop-word corpus is not loaded.
        return []
    english_stop_words = set(stopwords.words("english"))
    summaries = []
    for slide_number, _shape_id, text in records:
        summary = _summarise_text(
            text.replace("\n", " "), ratio, english_stop_words)
        if summary:
            # NOTE(review): the reviewed source built a one-element list here
            # but the statement collecting it was not visible; a flat list of
            # strings is assumed -- confirm against the caller.
            summaries.append("Slide " + str(slide_number) + "-" + summary)
    return summaries


def _read_docx_sentences(filename):
    """Return the text of ``filename`` split on ``"."``, newlines as spaces."""
    raw = textract.process(filename)
    # textract hands back bytes; decode before using str operations on it.
    text = raw.decode("utf-8", errors="replace") if isinstance(raw, bytes) else raw
    return [part.replace("\n", " ") for part in text.split(".")]


def _find_keyword_text(records):
    """Return the text of the keyword shape, or ``None`` when it is absent."""
    found = None
    for slide_number, shape_id, text in records:
        if slide_number == _KEYWORD_SLIDE and shape_id == _KEYWORD_SHAPE_ID:
            found = str(text)
    return found


def _slides_matching(records, keywords):
    """Return a ``"Slide N"`` label for every text box containing the keywords.

    A text box matches when at least ``_MIN_MATCH_PERCENT`` percent of
    ``keywords`` occur in it as substrings.  A slide is listed once per
    matching text box.
    """
    if not keywords:
        # No search terms: nothing can match, and this avoids dividing by 0.
        return []
    matched = []
    for slide_number, shape_id, text in records:
        if shape_id == _UNMATCHED_SHAPE_ID or slide_number in _UNMATCHED_SLIDES:
            continue
        flat = text.replace("\n", " ")
        if len(flat) < _MIN_MATCH_TEXT_LENGTH:
            continue
        haystack = flat.lower()
        hits = sum(1 for keyword in keywords if keyword in haystack)
        if round((hits / len(keywords)) * 100) >= _MIN_MATCH_PERCENT:
            matched.append("Slide " + str(slide_number))
    return matched


def create_sumall(abc, ratio):
    """Summarise a presentation and map keyword lines to the slides using them.

    Args:
        abc: Path (or glob pattern) of the file to process.
        ratio: Multiplier applied to the average sentence score; a sentence
            is kept in a summary only when its score exceeds
            ``ratio * average``.

    Returns:
        ``None`` when ``abc`` is falsy.  Otherwise a 2-tuple
        ``(summaries, matches)``:

        * ``summaries`` -- ``"Slide N-<summary>"`` strings, one per text box
          whose summary is not empty.
        * ``matches`` -- one list per line of the text box with shape id 3 on
          slide 3; each list holds the ``"Slide N"`` labels of the text boxes
          (slide 4 onwards) that contain at least half of that line's
          keywords.  Empty when that text box does not exist.
    """
    if not abc:
        return None
    filename = abc

    # os.path.splitext keeps the leading dot, so compare against ".docx".
    # NOTE(review): the body of the original 'docx' branch was not visible in
    # the reviewed source; returning the document's sentences with no slide
    # matches is an assumption -- confirm against the caller.
    if os.path.splitext(filename)[1].lower() == ".docx":
        return (_read_docx_sentences(filename), [])

    # Parse the deck once and reuse the extracted text for every step.
    records = _collect_text_shapes(filename)
    summaries = _summarise_shapes(records, ratio)

    matches = []
    keyword_text = _find_keyword_text(records)
    if keyword_text is not None:
        for line in keyword_text.split("\n"):
            # Empty tokens (blank lines, double spaces) are dropped: as
            # substrings they would match every text box.
            keywords = [
                word for word in line.lower().split(" ")
                if word and word not in _IGNORED_KEYWORDS
            ]
            matches.append(_slides_matching(records, keywords))
    return (summaries, matches)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment