Commit eaef6cc0 authored by HashaniJayasinghe

add question generation component

parent 346dd83e
The island was divided into numerous kingdoms over the following centuries,
intermittently (between CE 993–1077) united under Chola rule. Sri Lanka was ruled
by 181 monarchs from the Anuradhapura to Kandy periods.[6] From the 16th century,
some coastal areas of the country were also controlled by the Portuguese, Dutch and British.
Between 1597 and 1658, a substantial part of the island was under Portuguese rule. The Portuguese
lost their possessions in Ceylon due to Dutch intervention in the Eighty Years' War. Following the
Kandyan Wars, the island was united under British rule in 1815. Armed uprisings against the British
took place in the 1818 Uva Rebellion and the 1848 Matale Rebellion. Independence was finally granted in 1948 but the country remained a Dominion of the British Empire until 1972.
Q-01: What was Sri Lanka ruled by 181 monarchs from the Anuradhapura to Kandy periods?
Q-02: Who was ruled by 181 monarchs from the Anuradhapura to Kandy periods?
Q-03: How much was Sri Lanka ruled by monarchs from the Anuradhapura to Kandy periods?
Q-04: Who was under portuguese rule?
Q-05: When did the island was under british rule?
Q-06: What did the island was under british rule in 1815?
Q-07: Who was rule under british rule in 1815?
Q-08: How much a D was Dominion of the british Empire until 1972?
Q-09: What did Independence was in 1948 but the country remained a dominion of the british Empire until 1972?
Q-10: Who was finally granted in 1948 but the country remained a D Dominion of the british Empire until 1972?
Q-11: Who remained a D Dominion of the british Empire until 1972?
Q-12: Who a D finally Dominion of the british Empire until 1972?
Q-13: How much did Independence was finally granted in but the country remained a D Dominion of the british Empire until 1972?
import spacy

import clause
import nonClause
import identification
import questionValidation
from nlpNER import nerTagger


class AutomaticQuestionGenerator():

    # AQG: parse the input text and generate candidate questions.
    def aqgParse(self, sentence):
        nlp = spacy.load('en_core_web_md')
        singleSentences = sentence.split(".")
        questionsList = []

        # Generators tried on a segment that contains a clause of its own.
        clauseGenerators = (clause.whom_1, clause.whom_2, clause.whom_3,
                            clause.whose, clause.what_to_do, clause.who)
        # Generators tried on a segment that has no clause.
        nonClauseGenerators = (nonClause.what_whom1, nonClause.what_whom2,
                               nonClause.whose, nonClause.howmany,
                               nonClause.howmuch_1)

        for singleSentence in singleSentences:
            segmentSets = singleSentence.split(",")
            ner = nerTagger(nlp, singleSentence)
            for j in range(len(segmentSets)):
                try:
                    questionsList += clause.howmuch_2(segmentSets, j, ner)
                except Exception:
                    pass

                if identification.clause_identify(segmentSets[j]) == 1:
                    # The segment carries its own subject and verb phrase.
                    for generate in clauseGenerators + (clause.howmuch_1,
                                                        clause.howmuch_3):
                        try:
                            questionsList += generate(segmentSets, j, ner)
                        except Exception:
                            pass
                else:
                    # No clause of its own: try to borrow a subject phrase
                    # from an earlier segment of the same sentence.
                    s = ""
                    try:
                        s = identification.subjectphrase_search(segmentSets, j)
                    except Exception:
                        pass
                    if len(s) != 0:
                        segmentSets[j] = s + segmentSets[j]
                        for generate in clauseGenerators:
                            try:
                                questionsList += generate(segmentSets, j, ner)
                            except Exception:
                                pass
                    else:
                        for generate in nonClauseGenerators:
                            try:
                                questionsList += generate(segmentSets, j, ner)
                            except Exception:
                                pass

        questionsList.append('\n')
        return questionsList
    # AQG: print the generated questions and write them to a file.
    def display(self, questions):
        count = 0
        out = ""
        for question in questions:
            if len(question) < 3:
                continue
            if questionValidation.hNvalidation(question) != 1:
                continue
            # Keep only strings that start like questions (Wh-, Ho-, Ha-).
            if question[:2] not in ("Wh", "Ho", "Ha"):
                continue
            WH = question.split(',')
            if len(WH) == 1:
                # Drop the trailing " ? " left by postprocess, re-attach "?".
                question = question[:-3] + "?"
                count = count + 1
                if count < 10:
                    line = "Q-0%d: %s" % (count, question)
                else:
                    line = "Q-%d: %s" % (count, question)
                print(line)
                out += line + "\n"

        output = r"Questions_generator\Outputs\questions.txt"
        with open(output, 'w+', encoding="utf8") as w:
            w.write(out)
        return 0
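# Minimal usage sketch (an illustrative assumption: the sibling modules plus
# the required NLTK data and spaCy model are installed, and the output folder
# used by display() already exists) -- drive the generator from a raw string:
if __name__ == "__main__":
    aqg = AutomaticQuestionGenerator()
    questions = aqg.aqgParse("Sri Lanka was united under British rule in 1815.")
    aqg.display(questions)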
This diff is collapsed.
import nltk


def chunk_search(segment, chunked):
    # Collect the indices of parse-tree items that form usable chunks.
    m = len(chunked)
    list1 = []
    for j in range(m):
        if len(chunked[j]) > 2 or len(chunked[j]) == 1:
            list1.append(j)
        if len(chunked[j]) == 2:
            try:
                str1 = chunked[j][0][0] + " " + chunked[j][1][0]
            except Exception:
                pass
            else:
                if str1 in segment:
                    list1.append(j)
    return list1


def segment_identify(sen):
    segment_set = sen.split(",")
    return segment_set


def clause_identify(segment):
    # Return 1 if the segment contains a clause (a noun/pronoun group
    # followed by a verb group), 0 otherwise.
    tok = nltk.word_tokenize(segment)
    tag = nltk.pos_tag(tok)
    gram = r"""chunk:{<EX>?<DT>?<JJ.?>*<NN.?|PRP|PRP\$|POS|IN|DT|CC|VBG|VBN>+<RB.?|VB.?|MD|RP>+}"""
    chunkparser = nltk.RegexpParser(gram)
    chunked = chunkparser.parse(tag)
    flag = 0
    for j in range(len(chunked)):
        if len(chunked[j]) > 2:
            flag = 1
        if len(chunked[j]) == 2:
            try:
                str1 = chunked[j][0][0] + " " + chunked[j][1][0]
            except Exception:
                pass
            else:
                if str1 in segment:
                    flag = 1
        if flag == 1:
            break
    return flag
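# Illustrative behaviour (hedged; depends on the NLTK tokenizer and
# POS-tagger data): clause_identify("the island was united") should return 1,
# while clause_identify("in 1815") should return 0, since the latter has no
# verb group following a noun group.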
def verbphrase_identify(clause):
    # Split a clause into its subject phrase and verb phrase, and rebuild it
    # in question order (auxiliary first), e.g. "X was united" -> "was X united".
    tok = nltk.word_tokenize(clause)
    tag = nltk.pos_tag(tok)
    gram = r"""chunk:{<EX>?<DT>?<JJ.?>*<NN.?|PRP|PRP\$|POS|IN|DT|CC|VBG|VBN>+<RB.?>*<VB.?|MD|RP>+}"""
    chunkparser = nltk.RegexpParser(gram)
    chunked = chunkparser.parse(tag)
    str1 = ""
    str2 = ""
    str3 = ""
    list1 = chunk_search(clause, chunked)
    if len(list1) != 0:
        m = list1[len(list1) - 1]
        for j in range(len(chunked[m])):
            str1 += chunked[m][j][0]
            str1 += " "

    # Noun part of the matched chunk.
    tok1 = nltk.word_tokenize(str1)
    tag1 = nltk.pos_tag(tok1)
    gram1 = r"""chunk:{<EX>?<DT>?<JJ.?>*<NN.?|PRP|PRP\$|POS|IN|DT|CC|VBG|VBN>+<RB.?>*}"""
    chunkparser1 = nltk.RegexpParser(gram1)
    chunked1 = chunkparser1.parse(tag1)
    list2 = chunk_search(str1, chunked1)
    if len(list2) != 0:
        m = list2[0]
        for j in range(len(chunked1[m])):
            str2 += (chunked1[m][j][0] + " ")

    # Verb part of the matched chunk.
    tok1 = nltk.word_tokenize(str1)
    tag1 = nltk.pos_tag(tok1)
    gram1 = r"""chunk:{<VB.?|MD|RP>+}"""
    chunkparser1 = nltk.RegexpParser(gram1)
    chunked2 = chunkparser1.parse(tag1)
    list3 = chunk_search(str1, chunked2)
    if len(list3) != 0:
        m = list3[0]
        for j in range(len(chunked2[m])):
            str3 += (chunked2[m][j][0] + " ")

    X = ""
    str4 = ""
    st = nltk.word_tokenize(str3)
    if len(st) > 1:
        # Multi-word verb group: move the first verb (the auxiliary) forward.
        X = st[0]
        s = ""
        for k in range(1, len(st)):
            s += st[k]
            s += " "
        str3 = s
        str4 = X + " " + str2 + str3
    if len(st) == 1:
        # Single verb: insert the matching form of "do" as the auxiliary.
        tag1 = nltk.pos_tag(st)
        if tag1[0][0] not in ('are', 'were', 'is', 'am'):
            if tag1[0][1] == 'VB' or tag1[0][1] == 'VBP':
                X = 'do'
            if tag1[0][1] == 'VBD' or tag1[0][1] == 'VBN':
                X = 'did'
            if tag1[0][1] == 'VBZ':
                X = 'does'
            str4 = X + " " + str2 + str3
        else:
            str4 = tag1[0][0] + " " + str2
    return str4
def subjectphrase_search(segment_set, num):
    # Search the preceding segments (right to left) for a subject phrase
    # that can be prepended to segment_set[num].
    str2 = ""
    for j in range(num - 1, 0, -1):
        str1 = ""
        flag = 0
        tok = nltk.word_tokenize(segment_set[j])
        tag = nltk.pos_tag(tok)
        gram = r"""chunk:{<EX>?<DT>?<JJ.?>*<NN.?|PRP|PRP\$|POS|IN|DT|CC|VBG|VBN>+<RB.?>*<VB.?|MD|RP>+}"""
        chunkparser = nltk.RegexpParser(gram)
        chunked = chunkparser.parse(tag)
        list1 = chunk_search(segment_set[j], chunked)
        if len(list1) != 0:
            m = list1[len(list1) - 1]
            for k in range(len(chunked[m])):
                str1 += chunked[m][k][0]
                str1 += " "
            tok1 = nltk.word_tokenize(str1)
            tag1 = nltk.pos_tag(tok1)
            gram1 = r"""chunk:{<EX>?<DT>?<JJ.?>*<NN.?|PRP|PRP\$|POS|IN|DT|CC|VBG|VBN>+}"""
            chunkparser1 = nltk.RegexpParser(gram1)
            chunked1 = chunkparser1.parse(tag1)
            list2 = chunk_search(str1, chunked1)
            if len(list2) != 0:
                m = list2[len(list2) - 1]
                for k in range(len(chunked1[m])):
                    str2 += (chunked1[m][k][0] + " ")
                flag = 1
        if flag == 0:
            tok1 = nltk.word_tokenize(segment_set[j])
            tag1 = nltk.pos_tag(tok1)
            gram1 = r"""chunk:{<EX>?<DT>?<JJ.?>*<NN.?|PRP|PRP\$|POS|IN|DT|CC|VBG|VBN>+}"""
            chunkparser1 = nltk.RegexpParser(gram1)
            chunked1 = chunkparser1.parse(tag1)
            list2 = chunk_search(str1, chunked1)
            st = nltk.word_tokenize(segment_set[j])
            # A segment that is one single noun phrase is itself the subject.
            if len(chunked1[list2[0]]) == len(st):
                str2 = segment_set[j]
                flag = 1
        if flag == 1:
            break
    return str2
def postprocess(string):
    # Capitalise the sentence, keep proper nouns capitalised, and flip
    # first/second-person pronouns so the question addresses the reader.
    tok = nltk.word_tokenize(string)
    tag = nltk.pos_tag(tok)
    str1 = tok[0].capitalize()
    str1 += " "
    if len(tok) != 0:
        for i in range(1, len(tok)):
            if tag[i][1] == "NNP":
                str1 += tok[i].capitalize()
            else:
                str1 += tok[i].lower()
            str1 += " "
    tok = nltk.word_tokenize(str1)
    str1 = ""
    for i in range(len(tok)):
        if tok[i] == "i" or tok[i] == "we":
            str1 += "you "
        elif tok[i] == "my" or tok[i] == "our":
            str1 += "your "
        elif tok[i] == "your":
            str1 += "my "
        elif tok[i] == "you":
            # "you" after a preposition becomes "me", otherwise "i".
            if i - 1 >= 0:
                to = nltk.word_tokenize(tok[i - 1])
                ta = nltk.pos_tag(to)
                if ta[0][1] == 'IN':
                    str1 += "me "
                else:
                    str1 += "i "
            else:
                str1 += "i "
        elif tok[i] == "am":
            str1 += "are "
        else:
            str1 += tok[i] + " "
    return str1
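# Illustrative usage (hedged; requires the NLTK 'punkt' tokenizer and the
# POS-tagger data): the pronoun flip means the call below should return
# roughly "Where did you put your keys " (note the token-joined spacing).
if __name__ == "__main__":
    print(postprocess("where did i put my keys"))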
import aqgFunction
import os

current_path = os.path.abspath(os.path.join(os.path.dirname(__file__)))


# Main function
def main():
    # Create the AQG object
    aqg = aqgFunction.AutomaticQuestionGenerator()

    inputTextPath = r"Questions_generator\Inputs\text.txt"
    readFile = open(inputTextPath, 'r+', encoding="utf8")
    inputText = readFile.read()
    readFile.close()

    questionList = aqg.aqgParse(inputText)
    aqg.display(questionList)
    return 0


# Call the main function
if __name__ == "__main__":
    main()
import spacy


def nerTagger(nlp, tokenize):
    # Pair every token with its named-entity label, or 'O' when the token
    # does not match any recognised entity.
    doc = nlp(tokenize)
    finalList = []
    for word in doc:
        matched = 0
        for ner in doc.ents:
            if ner.text == word.text:
                finalList.append((word.text, ner.label_))
                matched = 1
        if matched == 0:
            finalList.append((word.text, 'O'))
    return finalList
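# Usage sketch (hedged; assumes the spaCy 'en_core_web_md' model is
# installed): prints one (token, label) pair per token, e.g. ('1815', 'DATE'),
# with 'O' for tokens outside any entity.
if __name__ == "__main__":
    nlp = spacy.load('en_core_web_md')
    print(nerTagger(nlp, "Sri Lanka was united under British rule in 1815."))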
import nltk

import identification


def get_chunk(chunked):
    # Join the tokens of a chunk subtree back into a string.
    str1 = ""
    for j in range(len(chunked)):
        str1 += (chunked[j][0] + " ")
    return str1
def what_whom1(segment_set, num, ner):
    # Build "to whom/where/when/to what" questions from a "to <NP>" chunk.
    tok = nltk.word_tokenize(segment_set[num])
    tag = nltk.pos_tag(tok)
    gram = r"""chunk:{<TO>+<DT>?<RB.?>*<JJ.?>*<NN.?|PRP|PRP\$|VBG|DT|POS|CD|VBN>+}"""
    chunkparser = nltk.RegexpParser(gram)
    chunked = chunkparser.parse(tag)
    list1 = identification.chunk_search(segment_set[num], chunked)
    s = []
    if len(list1) != 0:
        for j in range(len(chunked)):
            str1 = ""
            str3 = ""
            if j in list1:
                # Text before and after the chunk being questioned.
                for k in range(j):
                    if k in list1:
                        str1 += get_chunk(chunked[k])
                    else:
                        str1 += (chunked[k][0] + " ")
                for k in range(j + 1, len(chunked)):
                    if k in list1:
                        str3 += get_chunk(chunked[k])
                    else:
                        str3 += (chunked[k][0] + " ")
                if chunked[j][1][1] == 'PRP':
                    str2 = "to whom "
                else:
                    # Pick the question word from the NER label of the chunk's
                    # head noun; default to "to what".
                    str2 = "to what "
                    for x in range(len(chunked[j])):
                        if (chunked[j][x][1] == "NNP" or chunked[j][x][1] == "NNPS" or
                                chunked[j][x][1] == "NNS" or chunked[j][x][1] == "NN"):
                            break
                    for x1 in range(len(ner)):
                        if ner[x1][0] == chunked[j][x][0]:
                            if ner[x1][1] == "PERSON":
                                str2 = " to whom "
                            elif ner[x1][1] == "LOC" or ner[x1][1] == "ORG" or ner[x1][1] == "GPE":
                                str2 = " where "
                            elif ner[x1][1] == "TIME" or ner[x1][1] == "DATE":
                                str2 = " when "
                str4 = str1 + str2 + str3
                for k in range(len(segment_set)):
                    if k != num:
                        str4 += ("," + segment_set[k])
                str4 += '?'
                str4 = identification.postprocess(str4)
                s.append(str4)
    return s
def what_whom2(segment_set, num, ner):
    # Build "<preposition> whom/where/when/what" questions from an
    # "<IN> <NP>" chunk.
    tok = nltk.word_tokenize(segment_set[num])
    tag = nltk.pos_tag(tok)
    gram = r"""chunk:{<IN>+<DT>?<RB.?>*<JJ.?>*<NN.?|PRP|PRP\$|POS|VBG|DT|CD|VBN>+}"""
    chunkparser = nltk.RegexpParser(gram)
    chunked = chunkparser.parse(tag)
    list1 = identification.chunk_search(segment_set[num], chunked)
    s = []
    if len(list1) != 0:
        for j in range(len(chunked)):
            str1 = ""
            str3 = ""
            if j in list1:
                for k in range(j):
                    if k in list1:
                        str1 += get_chunk(chunked[k])
                    else:
                        str1 += (chunked[k][0] + " ")
                for k in range(j + 1, len(chunked)):
                    if k in list1:
                        str3 += get_chunk(chunked[k])
                    else:
                        str3 += (chunked[k][0] + " ")
                if chunked[j][1][1] == 'PRP':
                    str2 = " " + chunked[j][0][0] + " whom "
                else:
                    # Question word from the NER label of the chunk's head
                    # noun; default to "<preposition> what".
                    str2 = " " + chunked[j][0][0] + " what "
                    for x in range(len(chunked[j])):
                        if (chunked[j][x][1] == "NNP" or chunked[j][x][1] == "NNPS" or
                                chunked[j][x][1] == "NNS" or chunked[j][x][1] == "NN"):
                            break
                    for x1 in range(len(ner)):
                        if ner[x1][0] == chunked[j][x][0]:
                            if ner[x1][1] == "PERSON":
                                str2 = " " + chunked[j][0][0] + " whom "
                            elif ner[x1][1] == "LOC" or ner[x1][1] == "ORG" or ner[x1][1] == "GPE":
                                str2 = " where "
                            elif ner[x1][1] == "TIME" or ner[x1][1] == "DATE":
                                str2 = " when "
                str4 = str1 + str2 + str3
                for k in range(len(segment_set)):
                    if k != num:
                        str4 += ("," + segment_set[k])
                str4 += '?'
                str4 = identification.postprocess(str4)
                s.append(str4)
    return s
def whose(segment_set, num, ner):
    # Replace a possessive ("X's", "their") with "whose".
    tok = nltk.word_tokenize(segment_set[num])
    tag = nltk.pos_tag(tok)
    gram = r"""chunk:{<NN.?>*<PRP\$|POS>+<RB.?>*<JJ.?>*<NN.?|VBG|VBN>+}"""
    chunkparser = nltk.RegexpParser(gram)
    chunked = chunkparser.parse(tag)
    list1 = identification.chunk_search(segment_set[num], chunked)
    s = []
    if len(list1) != 0:
        for j in range(len(chunked)):
            str1 = ""
            str3 = ""
            str2 = " whose "
            if j in list1:
                for k in range(j):
                    if k in list1:
                        str1 += get_chunk(chunked[k])
                    else:
                        str1 += (chunked[k][0] + " ")
                for k in range(j + 1, len(chunked)):
                    if k in list1:
                        str3 += get_chunk(chunked[k])
                    else:
                        str3 += (chunked[k][0] + " ")
                # Keep the possessed noun phrase, dropping the possessor.
                if chunked[j][1][1] == 'POS':
                    for k in range(2, len(chunked[j])):
                        str2 += (chunked[j][k][0] + " ")
                else:
                    for k in range(1, len(chunked[j])):
                        str2 += (chunked[j][k][0] + " ")
                str4 = str1 + str2 + str3
                for k in range(len(segment_set)):
                    if k != num:
                        str4 += ("," + segment_set[k])
                str4 += '?'
                str4 = identification.postprocess(str4)
                s.append(str4)
    return s
def howmany(segment_set, num, ner):
    # Replace a numeral before a noun phrase with "how many".
    tok = nltk.word_tokenize(segment_set[num])
    tag = nltk.pos_tag(tok)
    gram = r"""chunk:{<DT>?<CD>+<RB>?<JJ|JJR|JJS>?<NN|NNS|NNP|NNPS|VBG>+}"""
    chunkparser = nltk.RegexpParser(gram)
    chunked = chunkparser.parse(tag)
    list1 = identification.chunk_search(segment_set[num], chunked)
    s = []
    if len(list1) != 0:
        for j in range(len(chunked)):
            str1 = ""
            str3 = ""
            str2 = " how many "
            if j in list1:
                for k in range(j):
                    if k in list1:
                        str1 += get_chunk(chunked[k])
                    else:
                        str1 += (chunked[k][0] + " ")
                for k in range(j + 1, len(chunked)):
                    if k in list1:
                        str3 += get_chunk(chunked[k])
                    else:
                        str3 += (chunked[k][0] + " ")
                # Keep only the counted noun phrase, dropping the numeral.
                st = get_chunk(chunked[j])
                tok1 = nltk.word_tokenize(st)
                tag1 = nltk.pos_tag(tok1)
                gram1 = r"""chunk:{<RB>?<JJ|JJR|JJS>?<NN|NNS|NNP|NNPS|VBG>+}"""
                chunkparser1 = nltk.RegexpParser(gram1)
                chunked1 = chunkparser1.parse(tag1)
                list2 = identification.chunk_search(st, chunked1)
                z = ""
                for k in range(len(chunked1)):
                    if k in list2:
                        z += get_chunk(chunked1[k])
                str4 = str1 + str2 + z + str3
                for k in range(len(segment_set)):
                    if k != num:
                        str4 += ("," + segment_set[k])
                str4 += '?'
                str4 = identification.postprocess(str4)
                s.append(str4)
    return s
def howmuch_1(segment_set, num, ner):
    # Replace "<preposition> <amount>" (e.g. "for $ 100") with
    # "<preposition> how much".
    tok = nltk.word_tokenize(segment_set[num])
    tag = nltk.pos_tag(tok)
    gram = r"""chunk:{<IN>+<\$>?<CD>+}"""
    chunkparser = nltk.RegexpParser(gram)
    chunked = chunkparser.parse(tag)
    list1 = identification.chunk_search(segment_set[num], chunked)
    s = []
    if len(list1) != 0:
        for j in range(len(chunked)):
            str1 = ""
            str3 = ""
            str2 = " how much "
            if j in list1:
                for k in range(j):
                    if k in list1:
                        str1 += get_chunk(chunked[k])
                    else:
                        str1 += (chunked[k][0] + " ")
                for k in range(j + 1, len(chunked)):
                    if k in list1:
                        str3 += get_chunk(chunked[k])
                    else:
                        str3 += (chunked[k][0] + " ")
                # Keep the preposition in front of "how much".
                str2 = chunked[j][0][0] + str2
                str4 = str1 + str2 + str3
                for k in range(len(segment_set)):
                    if k != num:
                        str4 += ("," + segment_set[k])
                str4 += '?'
                str4 = identification.postprocess(str4)
                s.append(str4)
    return s
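# Usage sketch (hedged; needs the NLTK data and the spaCy model): each
# generator takes the comma-split segment list, the index of the target
# segment, and the (token, label) pairs from nlpNER.nerTagger. For the
# segment below, howmany() should yield roughly
# "Sri Lanka was ruled by how many monarchs ?".
if __name__ == "__main__":
    import spacy
    from nlpNER import nerTagger
    nlp = spacy.load('en_core_web_md')
    segment = "Sri Lanka was ruled by 181 monarchs"
    print(howmany([segment], 0, nerTagger(nlp, segment)))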
# Question validation
def hNvalidation(sentence):
    # Reject (return 0) any question that contains the stray token sequence
    # " h N "; otherwise accept it (return 1).
    flag = 1
    length = len(sentence)
    if length > 4:
        for i in range(length):
            if i + 4 < length:
                if sentence[i:i + 5] == " h N ":
                    flag = 0
    return flag
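# Quick illustrative check of the validator:
if __name__ == "__main__":
    print(hNvalidation("Who ruled the h N island?"))  # -> 0 (rejected)
    print(hNvalidation("Who ruled the island?"))      # -> 1 (accepted)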