Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
2021-235
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
2021-235
2021-235
Commits
6d173179
Commit
6d173179
authored
Nov 26, 2021
by
Warnasooriya W.M.C.D.B
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added summary doc py
parent
515000e5
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
103 additions
and
0 deletions
+103
-0
summarize_document.py
summarize_document.py
+103
-0
No files found.
summarize_document.py
0 → 100644
View file @
6d173179
import
spacy
from
spacy.lang.en.stop_words
import
STOP_WORDS
from
string
import
punctuation
from
heapq
import
nlargest
import
os
import
docx
# path declaration
# Base directory of this script; document I/O paths are resolved relative to
# it so the script works regardless of the current working directory.
# (os.path.join with a single argument, as in the original, was a no-op.)
current_path = os.path.abspath(os.path.dirname(__file__))
# os.path.join keeps separators portable — the original hard-coded "\\",
# which only produced valid paths on Windows.
document_path = os.path.join(current_path, "documents")
summarized_doc_path = os.path.join(current_path, "summarized_doc")
# NOTE(review): summarized_doc is never used in this file — confirm no other
# module imports it before removing.
summarized_doc = docx.Document()

# get list of stop words
stopwords = list(STOP_WORDS)
# get list of punctuations and add new line(\n) to the list, so newlines are
# filtered out of the word-frequency counts along with punctuation
punctuation = punctuation + '\n'
def summerize(fileName):
    """Summarize a .docx document via frequency-based extractive summarization.

    Reads ``fileName`` from ``document_path``, scores each sentence by the sum
    of the normalized frequencies of its non-stop-word tokens, keeps the top
    40% of sentences, and writes them (one per line) to a ``.txt`` file with
    the same base name inside ``summarized_doc_path``.

    Args:
        fileName: Name of a .docx file located inside ``document_path``.

    Raises:
        ValueError: If the document contains no countable words.
    """
    base_name = fileName.split(".")[0]

    # Read content of the file. Paragraph texts are joined with a space so the
    # last word of one paragraph does not fuse with the first word of the next
    # (the original concatenated stripped lines directly, corrupting tokens).
    source_doc = docx.Document(os.path.join(document_path, fileName))
    text = " ".join(para.text.strip() for para in source_doc.paragraphs)

    # load spacy (spacy - data analysis tool in NLP)
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    # apply tokenization
    tokens = [token.text for token in doc]
    print("Tokens:\n" + str(tokens))

    # text cleaning - count words that are neither stop words nor punctuation
    word_frequency = _word_frequencies(doc)
    print("\nWords Frequencies:\n" + str(word_frequency))
    if not word_frequency:
        # max() below would raise a bare ValueError; fail with a clear message
        raise ValueError("document contains no countable words: " + fileName)

    # normalize: divide every count by the maximum count
    max_frequency = max(word_frequency.values())
    print("\nMax Frequency: " + str(max_frequency))
    for word in word_frequency:
        word_frequency[word] = word_frequency[word] / max_frequency
    print("\nWord Frequencies after divided by max frequency:\n" + str(word_frequency))

    # get sentence tokens
    sentence_tokens = list(doc.sents)
    print("\nSentences tokens:\n" + str(sentence_tokens))

    # give scores to each sentence by adding its words' normalized frequencies
    sentence_score = _sentence_scores(sentence_tokens, word_frequency)
    print("\nSentences Scores:\n" + str(sentence_score))

    # keep the 40% of sentences with maximum score
    select_length = int(len(sentence_tokens) * 0.4)
    print("\nSelect Length:\n" + str(select_length))
    summary = nlargest(select_length, sentence_score, key=sentence_score.get)
    print("\nSummary:\n" + str(summary))

    # get the final summary as plain strings
    final_summary = [sent.text for sent in summary]
    print("\nfinal summary:\n" + " ".join(final_summary))

    # write the selected sentences into a txt file, one per line; explicit
    # encoding so non-ASCII content does not crash under Windows codecs
    out_path = os.path.join(summarized_doc_path, base_name + ".txt")
    with open(out_path, 'w', encoding='utf-8') as f:
        for line in final_summary:
            f.write(line)
            f.write('\n')


def _word_frequencies(doc):
    """Count occurrences of tokens that are neither stop words nor punctuation."""
    frequencies = {}
    for word in doc:
        lower = word.text.lower()
        if lower not in stopwords and lower not in punctuation:
            frequencies[word.text] = frequencies.get(word.text, 0) + 1
    return frequencies


def _sentence_scores(sentences, word_frequency):
    """Score each sentence as the sum of its words' normalized frequencies."""
    scores = {}
    for sent in sentences:
        for word in sent:
            freq = word_frequency.get(word.text.lower())
            if freq is not None:
                scores[sent] = scores.get(sent, 0) + freq
    return scores
# Script entry point: summarize the sample document when run directly.
if __name__ == '__main__':
    summerize("originalText.docx")
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment