Commit c2c44ca2 authored by HiranyaDilukshi's avatar HiranyaDilukshi

Question Generation

parent eda06ea3
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\GAMER\\AppData\\Roaming\\Python\\Python38\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Warming up PyWSD (takes ~10 secs)... took 4.643778324127197 secs.\n"
]
}
],
"source": [
"#importing necessary libraries\n",
"import pke\n",
"import nltk\n",
"import torch\n",
"import random\n",
"import string\n",
"import warnings\n",
"import requests\n",
"import traceback\n",
"import numpy as np\n",
"from textwrap3 import wrap\n",
"from nltk.corpus import stopwords\n",
"from pywsd.lesk import simple_lesk\n",
"from pywsd.lesk import cosine_lesk\n",
"from pywsd.lesk import adapted_lesk\n",
"from nltk.corpus import wordnet as wn\n",
"from flashtext import KeywordProcessor\n",
"from nltk.tokenize import sent_tokenize\n",
"from pywsd.similarity import max_similarity\n",
"from transformers import T5ForConditionalGeneration,T5Tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\GAMER\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package brown to\n",
"[nltk_data] C:\\Users\\GAMER\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package brown is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] C:\\Users\\GAMER\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\GAMER\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"#Sets random number generator seeds for Python's random, NumPy's np.random, and PyTorch's CPU and GPU\n",
"def set_seed(seed: int):\n",
" random.seed(seed)\n",
" np.random.seed(seed)\n",
" torch.manual_seed(seed)\n",
" torch.cuda.manual_seed_all(seed)\n",
"\n",
"# Fetch the NLTK resources, silence warnings, then fix the random seed.\n",
"for nltk_resource in ('punkt', 'brown', 'wordnet', 'stopwords'):\n",
"    nltk.download(nltk_resource)\n",
"\n",
"warnings.filterwarnings('ignore')\n",
"set_seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" A network is a large system consisting of many similar parts that are connected together to allow movement or communication along the parts, or\n",
"between the parts and a control centre. There are different types of networks available. Telecommunication networks , Television or radio network\n",
",Transport networks , Social networks. a digital telecommunications network, which allows nodes to share resources. In computer networks, computing\n",
"devices exchange data with each other using connections between nodes (data links). A network is a large system consisting of many similar parts that\n",
"are connected together to allow movement or communication along the parts, or between the parts and a control centre. There are different types of\n",
"networks available. Telecommunication networks , Television or radio network ,Transport networks , Social networks. a digital telecommunications\n",
"network, which allows nodes to share resources. In computer networks, computing devices exchange data with each other using connections between\n",
"nodes (data links). The Internet is the global system of interconnected computer networks that use the Internet protocol suite to link devices\n",
"worldwide. ▪It is a network of networks ▪Consists of private, public, academic, business, and government networks of local to global scope. ▪Linked\n",
"by a broad array of electronic, wireless, and optical networking technologies. system of rules that allow two or more entities of a communications\n",
"system to transmit information (wiki) ▪ the formal system of rules for correct behavior on official occasions (Cambridge Extensible markup language\n",
"is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) , Often used for distributing data over networks ,Used by\n",
"may other tools like protocols. The main and the only component of XML is called an element . An element has 3 components 1. Start tag 2. Body 3.\n",
"End tag. An element has a name . Element names are case-sensitive .Element names must start with a letter or underscore . This is the XML\n",
"declaration ◦ Provides the instructions for the processor to understand the details of the XML file ◦ Encoding attribute indicates the character set\n",
"◦ UTF-8 = Unicode Transformation Format (with 8-bit blocks to represent a character) .An element may have attribute(s) ◦ Attributes describe the\n",
"element .Attribute value is always quoted (either single or double quote). Computer based systems can be mainly divided into 2 types, according to the\n",
"distribution of the components. Standalone Computer System - All the components are executed within a single device, Do not need a network, Usually\n",
"one or tightly coupled set of technologies are used to develop (JAVA, .NET). Distributed system- The components are distributed and executed in\n",
"multiple devices, Need a network, Multiple and loosely coupled set of technologies are used to develop (HTML+CSS+JS + PHP). Client-server architecture\n",
"(3-tier)- 3-tier architecture is used, when there is a need for data persistence and also to separate the application logic from the data . This can\n",
"be seen as an extension of 2-tier architecture. Client-server architecture (n-tier)- When there is a need for further separation and distribution of\n",
"the components, more tiers can be added and extend the 2-tier or 3-tier architecture into an n-tier architecture. A network is a large system\n",
"consisting of many similar parts that are connected together to allow movement or communication along the parts, or between the parts and a control\n",
"centre. There are different types of networks available. Telecommunication networks , Television or radio network ,Transport networks , Social\n",
"networks. a digital telecommunications network, which allows nodes to share resources. In computer networks, computing devices exchange data with\n",
"each other using connections between nodes (data links). The Internet is the global system of interconnected computer networks that use the Internet\n",
"protocol suite to link devices worldwide. ▪It is a network of networks ▪Consists of private, public, academic, business, and government networks of\n",
"local to global scope. ▪Linked by a broad array of electronic, wireless, and optical networking technologies. system of rules that allow two or more\n",
"entities of a communications system to transmit information (wiki) ▪ the formal system of rules for correct behavior on official occasions\n",
"(Cambridge Extensible markup language is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) ,Often used for\n",
"distributing data over networks ,Used by may other tools like protocols. The main and the only component of XML is called an element . An element has\n",
"3 components 1. Start tag 2. Body 3. End tag. An element has a name . Element names are case-sensitive .Element names must start with a letter or\n",
"underscore . This is the XML declaration ◦ Provides the instructions for the processor to understand the details of the XML file ◦ Encoding\n",
"attribute indicates the character set ◦ UTF-8 = Unicode Transformation Format (with 8-bit blocks to represent a character) .An element may have\n",
"attribute(s) ◦ Attributes describe the element .Attribute value is always quoted (either single or double quote). Computer based systems can be\n",
"mainly divided into 2 types, according to the distribution of the components. Standalone Computer System - All the components are executed within a\n",
"single device, Do not need a network, Usually one or tightly coupled set of technologies are used to develop (JAVA, .NET). Distributed system- The\n",
"components are distributed and executed in multiple devices, Need a network, Multiple and loosely coupled set of technologies are used to develop\n",
"(HTML+CSS+JS + PHP). Client-server architecture (3-tier)- 3-tier architecture is used, when there is a need for data persistence and also to separate\n",
"the application logic from the data . This can be seen as an extension of 2-tier architecture. Client-server architecture (n-tier)- When there is a\n",
"need for further separation and distribution of the components, more tiers can be added and extend the 2-tier or 3-tier architecture into an n-tier\n",
"architecture. DNS is a network, which consists of Domain Name Servers . DNS helps to map the domain name to the IP address. Unified Resource\n",
"Identifier (URI) 18 . URI is a string of characters designed for unambiguous identification of resources. URI is extensible via the URI scheme.\n",
"Unified Resource Name(URN) is a persistent, location-independent identifier. Website can be seen as a collection of web pages with static content\n",
".Early websites were entirely developed only using HTML – Nowadays, some server-side application components and databases are used to dynamically\n",
"generate the content – However, still the content is not user tailored. Web application is a single page or a collection of web pages, with\n",
"interactive components to dynamically generate the content E-commerce is a large domain, which covers many related concepts like – Internet\n",
"marketing – Electronic fund transfer – Online transaction processing. E-commerce systems provide online buying and selling over the internet. There\n",
"is a large variety of types of ecommerce systems – Online goods/soft items(software, e-books, videos) – Retail services (travel, food, cloths) –\n",
"Marketing services (advertising, auctions) – Customer services (help centers, online banking). Advantages of e-commerce . To businesses – After the\n",
"capital cost, maintenance cost is low – Global customers – Increased market share. Disadvantages of e-commerce. To businesses – For physical items,\n",
"storing and distributing is needed – Need to update the system frequently – Depends on the power and the internet. DNS is a network, which consists\n",
"of Domain Name Servers . DNS helps to map the domain name to the IP address. Unified Resource Identifier (URI) 18 . URI is a string of characters\n",
"designed for unambiguous identification of resources. URI is extensible via the URI scheme. Unified Resource Name(URN) is a persistent, location-\n",
"independent identifier. Website can be seen as a collection of web pages with static content .Early websites were entirely developed only using HTML\n",
"– Nowadays, some server-side application components and databases are used to dynamically generate the content – However, still the content is not\n",
"user tailored. Web application is a single page or a collection of web pages, with interactive components to dynamically generate the content\n",
"E-commerce is a large domain, which covers many related concepts like – Internet marketing – Electronic fund transfer – Online transaction processing.\n",
"E-commerce systems provide online buying and selling over the internet. There is a large variety of types of ecommerce systems – Online goods/soft\n",
"items(software, e-books, videos) – Retail services (travel, food, cloths) – Marketing services (advertising, auctions) – Customer services (help\n",
"centers, online banking). Advantages of e-commerce . To businesses – After the capital cost, maintenance cost is low – Global customers – Increased\n",
"market share. Disadvantages of e-commerce. To businesses – For physical items, storing and distributing is needed – Need to update the system\n",
"frequently – Depends on the power and the internet The Internet is the global system of interconnected computer networks that use the Internet\n",
"protocol suite to link devices worldwide. ▪It is a network of networks ▪Consists of private, public, academic, business, and government networks of\n",
"local to global scope. ▪Linked by a broad array of electronic, wireless, and optical networking technologies. system of rules that allow two or more\n",
"entities of a communications system to transmit information (wiki) ▪ the formal system of rules for correct behavior on official occasions\n",
"(Cambridge Extensible markup language is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) ,Often used for\n",
"distributing data over networks ,Used by may other tools like protocols. The main and the only component of XML is called an element . An element has\n",
"3 components 1. Start tag 2. Body 3. End tag. An element has a name . Element names are case-sensitive .Element names must start with a letter or\n",
"underscore . This is the XML declaration ◦ Provides the instructions for the processor to understand the details of the XML file ◦ Encoding\n",
"attribute indicates the character set ◦ UTF-8 = Unicode Transformation Format (with 8-bit blocks to represent a character) .An element may have\n",
"attribute(s) ◦ Attributes describe the element .Attribute value is always quoted (either single or double quote). Computer based systems can be\n",
"mainly divided into 2 types, according to the distribution of the components. Standalone Computer System - All the components are executed within a\n",
"single device, Do not need a network, Usually one or tightly coupled set of technologies are used to develop (JAVA, .NET). Distributed system- The\n",
"components are distributed and executed in multiple devices, Need a network, Multiple and loosely coupled set of technologies are used to develop\n",
"(HTML+CSS+JS + PHP). Client-server architecture (3-tier)- 3-tier architecture is used, when there is a need for data persistence and also to separate\n",
"the application logic from the data . This can be seen as an extension of 2-tier architecture. Client-server architecture (n-tier)- When there is a\n",
"need for further separation and distribution of the components, more tiers can be added and extend the 2-tier or 3-tier architecture into an n-tier\n",
"architecture. DNS is a network, which consists of Domain Name Servers . DNS helps to map the domain name to the IP address. Unified Resource\n",
"Identifier (URI) 18 . URI is a string of characters designed for unambiguous identification of resources. URI is extensible via the URI scheme.\n",
"Unified Resource Name(URN) is a persistent, location-independent identifier. Website can be seen as a collection of web pages with static content\n",
".Early websites were entirely developed only using HTML – Nowadays, some server-side application components and databases are used to dynamically\n",
"generate the content – However, still the content is not user tailored. Web application is a single page or a collection of web pages, with\n",
"interactive components to dynamically generate the content E-commerce is a large domain, which covers many related concepts like – Internet\n",
"marketing – Electronic fund transfer – Online transaction processing. E-commerce systems provide online buying and selling over the internet. There\n",
"is a large variety of types of ecommerce systems – Online goods/soft items(software, e-books, videos) – Retail services (travel, food, cloths) –\n",
"Marketing services (advertising, auctions) – Customer services (help centers, online banking). Advantages of e-commerce . To businesses – After the\n",
"capital cost, maintenance cost is low – Global customers – Increased market share. Disadvantages of e-commerce. To businesses – For physical items,\n",
"storing and distributing is needed – Need to update the system frequently – Depends on the power and the internet\n",
"\n",
"\n"
]
}
],
"source": [
"text = \"\"\"\n",
"A network is a large system consisting of many similar parts that are connected together to allow \n",
"movement or communication along the parts, or between the parts and a control centre. There are \n",
"different types of networks available. Telecommunication networks , Television or radio network \n",
",Transport networks , Social networks. a digital telecommunications network, which allows nodes to \n",
"share resources. In computer networks, computing devices exchange data with each other using \n",
"connections between nodes (data links). A network is a large system consisting of many similar parts that are connected together to allow \n",
"movement or communication along the parts, or between the parts and a control centre. There are \n",
"different types of networks available. Telecommunication networks , Television or radio network \n",
",Transport networks , Social networks. a digital telecommunications network, which allows nodes to \n",
"share resources. In computer networks, computing devices exchange data with each other using \n",
"connections between nodes (data links). \n",
"The Internet is the global system of interconnected computer networks that use the Internet protocol \n",
"suite to link devices worldwide. ▪It is a network of networks ▪Consists of private, public, academic, \n",
"business, and government networks of local to global scope. ▪Linked by a broad array of electronic, \n",
"wireless, and optical networking technologies. system of rules that allow two or more entities of a \n",
"communications system to transmit information (wiki) ▪ the formal system of rules for correct behavior \n",
"on official occasions (Cambridge\n",
"Extensible markup language is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) ,\n",
"Often used for distributing data over networks ,Used by may other tools like \n",
"protocols. The main and the only component of XML is called an element . An element has 3 \n",
"components 1. Start tag 2. Body 3. End tag. An element has a name . Element names are case-sensitive \n",
".Element names must start with a letter or underscore . \n",
"This is the XML declaration ◦ Provides the instructions for the processor to understand the details of the \n",
"XML file ◦ Encoding attribute indicates the character set ◦ UTF-8 = Unicode Transformation Format (with \n",
"8-bit blocks to represent a character) .An element may have attribute(s) ◦ Attributes describe the \n",
"element .Attribute value is always quoted (either single or double quote).\n",
"Computer based systems can be mainly divided into 2 types, according to the distribution of the \n",
"components. Standalone Computer System - All the components are executed within a single device, Do \n",
"not need a network, Usually one or tightly coupled set of technologies are used to develop (JAVA, .NET).\n",
"Distributed system- The components are distributed and executed in multiple devices, Need a network,\n",
"Multiple and loosely coupled set of technologies are used to develop (HTML+CSS+JS + PHP).\n",
"Client-server architecture (3-tier)- 3-tier architecture is used, when there is a need for data persistence \n",
"and also to separate the application logic from the data . This can be seen as an extension of 2-tier \n",
"architecture. Client-server architecture (n-tier)- When there is a need for further separation and \n",
"distribution of the components, more tiers can be added and extend the 2-tier or 3-tier architecture into \n",
"an n-tier architecture.\n",
"A network is a large system consisting of many similar parts that are connected together to allow \n",
"movement or communication along the parts, or between the parts and a control centre. There are \n",
"different types of networks available. Telecommunication networks , Television or radio network \n",
",Transport networks , Social networks. a digital telecommunications network, which allows nodes to \n",
"share resources. In computer networks, computing devices exchange data with each other using \n",
"connections between nodes (data links). \n",
"The Internet is the global system of interconnected computer networks that use the Internet protocol \n",
"suite to link devices worldwide. ▪It is a network of networks ▪Consists of private, public, academic, \n",
"business, and government networks of local to global scope. ▪Linked by a broad array of electronic, \n",
"wireless, and optical networking technologies. system of rules that allow two or more entities of a \n",
"communications system to transmit information (wiki) ▪ the formal system of rules for correct behavior \n",
"on official occasions (Cambridge\n",
"Extensible markup language is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) ,Often used for distributing data over networks ,Used by may other tools like \n",
"protocols. The main and the only component of XML is called an element . An element has 3 \n",
"components 1. Start tag 2. Body 3. End tag. An element has a name . Element names are case-sensitive \n",
".Element names must start with a letter or underscore . \n",
"This is the XML declaration ◦ Provides the instructions for the processor to understand the details of the \n",
"XML file ◦ Encoding attribute indicates the character set ◦ UTF-8 = Unicode Transformation Format (with \n",
"8-bit blocks to represent a character) .An element may have attribute(s) ◦ Attributes describe the \n",
"element .Attribute value is always quoted (either single or double quote).\n",
"Computer based systems can be mainly divided into 2 types, according to the distribution of the \n",
"components. Standalone Computer System - All the components are executed within a single device, Do \n",
"not need a network, Usually one or tightly coupled set of technologies are used to develop (JAVA, .NET).\n",
"Distributed system- The components are distributed and executed in multiple devices, Need a network,\n",
"Multiple and loosely coupled set of technologies are used to develop (HTML+CSS+JS + PHP).\n",
"Client-server architecture (3-tier)- 3-tier architecture is used, when there is a need for data persistence \n",
"and also to separate the application logic from the data . This can be seen as an extension of 2-tier \n",
"architecture. Client-server architecture (n-tier)- When there is a need for further separation and \n",
"distribution of the components, more tiers can be added and extend the 2-tier or 3-tier architecture into \n",
"an n-tier architecture.\n",
"DNS is a network, which consists of Domain Name Servers . DNS helps to map the domain name to the \n",
"IP address. Unified Resource Identifier (URI) 18 . URI is a string of characters designed for unambiguous \n",
"identification of resources. URI is extensible via the URI scheme. Unified Resource Name(URN) is a \n",
"persistent, location-independent identifier. \n",
"Website can be seen as a collection of web pages with static content .Early websites were entirely \n",
"developed only using HTML – Nowadays, some server-side application components and databases are \n",
"used to dynamically generate the content – However, still the content is not user tailored. Web \n",
"application is a single page or a collection of web pages, with interactive components to dynamically \n",
"generate the content \n",
"E-commerce is a large domain, which covers many related concepts like – Internet marketing –\n",
"Electronic fund transfer – Online transaction processing. E-commerce systems provide online buying and \n",
"selling over the internet. There is a large variety of types of ecommerce systems – Online goods/soft \n",
"items(software, e-books, videos) – Retail services (travel, food, cloths) – Marketing services (advertising, \n",
"auctions) – Customer services (help centers, online banking).\n",
"Advantages of e-commerce . To businesses – After the capital cost, maintenance cost is low – Global \n",
"customers – Increased market share. Disadvantages of e-commerce. To businesses – For physical items, \n",
"storing and distributing is needed – Need to update the system frequently – Depends on the power and \n",
"the internet. DNS is a network, which consists of Domain Name Servers . DNS helps to map the domain name to the \n",
"IP address. Unified Resource Identifier (URI) 18 . URI is a string of characters designed for unambiguous \n",
"identification of resources. URI is extensible via the URI scheme. Unified Resource Name(URN) is a \n",
"persistent, location-independent identifier. \n",
"Website can be seen as a collection of web pages with static content .Early websites were entirely \n",
"developed only using HTML – Nowadays, some server-side application components and databases are \n",
"used to dynamically generate the content – However, still the content is not user tailored. Web \n",
"application is a single page or a collection of web pages, with interactive components to dynamically \n",
"generate the content \n",
"E-commerce is a large domain, which covers many related concepts like – Internet marketing –\n",
"Electronic fund transfer – Online transaction processing. E-commerce systems provide online buying and \n",
"selling over the internet. There is a large variety of types of ecommerce systems – Online goods/soft \n",
"items(software, e-books, videos) – Retail services (travel, food, cloths) – Marketing services (advertising, \n",
"auctions) – Customer services (help centers, online banking).\n",
"Advantages of e-commerce . To businesses – After the capital cost, maintenance cost is low – Global \n",
"customers – Increased market share. Disadvantages of e-commerce. To businesses – For physical items, \n",
"storing and distributing is needed – Need to update the system frequently – Depends on the power and \n",
"the internet\n",
"The Internet is the global system of interconnected computer networks that use the Internet protocol \n",
"suite to link devices worldwide. ▪It is a network of networks ▪Consists of private, public, academic, \n",
"business, and government networks of local to global scope. ▪Linked by a broad array of electronic, \n",
"wireless, and optical networking technologies. system of rules that allow two or more entities of a \n",
"communications system to transmit information (wiki) ▪ the formal system of rules for correct behavior \n",
"on official occasions (Cambridge\n",
"Extensible markup language is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) ,Often used for distributing data over networks ,Used by may other tools like \n",
"protocols. The main and the only component of XML is called an element . An element has 3 \n",
"components 1. Start tag 2. Body 3. End tag. An element has a name . Element names are case-sensitive \n",
".Element names must start with a letter or underscore . \n",
"This is the XML declaration ◦ Provides the instructions for the processor to understand the details of the \n",
"XML file ◦ Encoding attribute indicates the character set ◦ UTF-8 = Unicode Transformation Format (with \n",
"8-bit blocks to represent a character) .An element may have attribute(s) ◦ Attributes describe the \n",
"element .Attribute value is always quoted (either single or double quote).\n",
"Computer based systems can be mainly divided into 2 types, according to the distribution of the \n",
"components. Standalone Computer System - All the components are executed within a single device, Do \n",
"not need a network, Usually one or tightly coupled set of technologies are used to develop (JAVA, .NET).\n",
"Distributed system- The components are distributed and executed in multiple devices, Need a network,\n",
"Multiple and loosely coupled set of technologies are used to develop (HTML+CSS+JS + PHP).\n",
"Client-server architecture (3-tier)- 3-tier architecture is used, when there is a need for data persistence \n",
"and also to separate the application logic from the data . This can be seen as an extension of 2-tier \n",
"architecture. Client-server architecture (n-tier)- When there is a need for further separation and \n",
"distribution of the components, more tiers can be added and extend the 2-tier or 3-tier architecture into \n",
"an n-tier architecture.\n",
"DNS is a network, which consists of Domain Name Servers . DNS helps to map the domain name to the \n",
"IP address. Unified Resource Identifier (URI) 18 . URI is a string of characters designed for unambiguous \n",
"identification of resources. URI is extensible via the URI scheme. Unified Resource Name(URN) is a \n",
"persistent, location-independent identifier. \n",
"Website can be seen as a collection of web pages with static content .Early websites were entirely \n",
"developed only using HTML – Nowadays, some server-side application components and databases are \n",
"used to dynamically generate the content – However, still the content is not user tailored. Web \n",
"application is a single page or a collection of web pages, with interactive components to dynamically \n",
"generate the content \n",
"E-commerce is a large domain, which covers many related concepts like – Internet marketing –\n",
"Electronic fund transfer – Online transaction processing. E-commerce systems provide online buying and \n",
"selling over the internet. There is a large variety of types of ecommerce systems – Online goods/soft \n",
"items(software, e-books, videos) – Retail services (travel, food, cloths) – Marketing services (advertising, \n",
"auctions) – Customer services (help centers, online banking).\n",
"Advantages of e-commerce . To businesses – After the capital cost, maintenance cost is low – Global \n",
"customers – Increased market share. Disadvantages of e-commerce. To businesses – For physical items, \n",
"storing and distributing is needed – Need to update the system frequently – Depends on the power and \n",
"the internet\n",
" \"\"\"\n",
"\n",
"# Echo the passage re-wrapped to 150 columns, then print a trailing newline gap.\n",
"wrapped_lines = wrap(text, 150)\n",
"for wrp in wrapped_lines:\n",
"    print(wrp)\n",
"print(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"device is cuda\n"
]
}
],
"source": [
"# Pick the compute device up front: a CUDA GPU when one is available, otherwise the CPU.\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"# Checkpoint identifiers: 't5-base' backs summary_*, the 't5_squad_v1' checkpoint backs question_*.\n",
"SUMMARY_CHECKPOINT = 't5-base'\n",
"QUESTION_CHECKPOINT = 'ramsrigouthamg/t5_squad_v1'\n",
"\n",
"# Load each model/tokenizer pair and place the model on the selected device.\n",
"summary_model = T5ForConditionalGeneration.from_pretrained(SUMMARY_CHECKPOINT).to(device)\n",
"summary_tokenizer = T5Tokenizer.from_pretrained(SUMMARY_CHECKPOINT)\n",
"\n",
"question_model = T5ForConditionalGeneration.from_pretrained(QUESTION_CHECKPOINT).to(device)\n",
"question_tokenizer = T5Tokenizer.from_pretrained(QUESTION_CHECKPOINT)\n",
"\n",
"print(\"device is\", device)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"original Text >>\n",
" A network is a large system consisting of many similar parts that are connected together to allow movement or communication along the parts, or\n",
"between the parts and a control centre. There are different types of networks available. Telecommunication networks , Television or radio network\n",
",Transport networks , Social networks. a digital telecommunications network, which allows nodes to share resources. In computer networks, computing\n",
"devices exchange data with each other using connections between nodes (data links). A network is a large system consisting of many similar parts that\n",
"are connected together to allow movement or communication along the parts, or between the parts and a control centre. There are different types of\n",
"networks available. Telecommunication networks , Television or radio network ,Transport networks , Social networks. a digital telecommunications\n",
"network, which allows nodes to share resources. In computer networks, computing devices exchange data with each other using connections between\n",
"nodes (data links). The Internet is the global system of interconnected computer networks that use the Internet protocol suite to link devices\n",
"worldwide. ▪It is a network of networks ▪Consists of private, public, academic, business, and government networks of local to global scope. ▪Linked\n",
"by a broad array of electronic, wireless, and optical networking technologies. system of rules that allow two or more entities of a communications\n",
"system to transmit information (wiki) ▪ the formal system of rules for correct behavior on official occasions (Cambridge Extensible markup language\n",
"is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) , Often used for distributing data over networks ,Used by\n",
"may other tools like protocols. The main and the only component of XML is called an element . An element has 3 components 1. Start tag 2. Body 3.\n",
"End tag. An element has a name . Element names are case-sensitive .Element names must start with a letter or underscore . This is the XML\n",
"declaration ◦ Provides the instructions for the processor to understand the details of the XML file ◦ Encoding attribute indicates the character set\n",
"◦ UTF-8 = Unicode Transformation Format (with 8-bit blocks to represent a character) .An element may have attribute(s) ◦ Attributes describe the\n",
"element .Attribute value is always quoted (either single or double quote). Computer based systems can be mainly divided into 2 types, according to the\n",
"distribution of the components. Standalone Computer System - All the components are executed within a single device, Do not need a network, Usually\n",
"one or tightly coupled set of technologies are used to develop (JAVA, .NET). Distributed system- The components are distributed and executed in\n",
"multiple devices, Need a network, Multiple and loosely coupled set of technologies are used to develop (HTML+CSS+JS + PHP). Client-server architecture\n",
"(3-tier)- 3-tier architecture is used, when there is a need for data persistence and also to separate the application logic from the data . This can\n",
"be seen as an extension of 2-tier architecture. Client-server architecture (n-tier)- When there is a need for further separation and distribution of\n",
"the components, more tiers can be added and extend the 2-tier or 3-tier architecture into an n-tier architecture. A network is a large system\n",
"consisting of many similar parts that are connected together to allow movement or communication along the parts, or between the parts and a control\n",
"centre. There are different types of networks available. Telecommunication networks , Television or radio network ,Transport networks , Social\n",
"networks. a digital telecommunications network, which allows nodes to share resources. In computer networks, computing devices exchange data with\n",
"each other using connections between nodes (data links). The Internet is the global system of interconnected computer networks that use the Internet\n",
"protocol suite to link devices worldwide. ▪It is a network of networks ▪Consists of private, public, academic, business, and government networks of\n",
"local to global scope. ▪Linked by a broad array of electronic, wireless, and optical networking technologies. system of rules that allow two or more\n",
"entities of a communications system to transmit information (wiki) ▪ the formal system of rules for correct behavior on official occasions\n",
"(Cambridge Extensible markup language is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) ,Often used for\n",
"distributing data over networks ,Used by may other tools like protocols. The main and the only component of XML is called an element . An element has\n",
"3 components 1. Start tag 2. Body 3. End tag. An element has a name . Element names are case-sensitive .Element names must start with a letter or\n",
"underscore . This is the XML declaration ◦ Provides the instructions for the processor to understand the details of the XML file ◦ Encoding\n",
"attribute indicates the character set ◦ UTF-8 = Unicode Transformation Format (with 8-bit blocks to represent a character) .An element may have\n",
"attribute(s) ◦ Attributes describe the element .Attribute value is always quoted (either single or double quote). Computer based systems can be\n",
"mainly divided into 2 types, according to the distribution of the components. Standalone Computer System - All the components are executed within a\n",
"single device, Do not need a network, Usually one or tightly coupled set of technologies are used to develop (JAVA, .NET). Distributed system- The\n",
"components are distributed and executed in multiple devices, Need a network, Multiple and loosely coupled set of technologies are used to develop\n",
"(HTML+CSS+JS + PHP). Client-server architecture (3-tier)- 3-tier architecture is used, when there is a need for data persistence and also to separate\n",
"the application logic from the data . This can be seen as an extension of 2-tier architecture. Client-server architecture (n-tier)- When there is a\n",
"need for further separation and distribution of the components, more tiers can be added and extend the 2-tier or 3-tier architecture into an n-tier\n",
"architecture. DNS is a network, which consists of Domain Name Servers . DNS helps to map the domain name to the IP address. Unified Resource\n",
"Identifier (URI) 18 . URI is a string of characters designed for unambiguous identification of resources. URI is extensible via the URI scheme.\n",
"Unified Resource Name(URN) is a persistent, location-independent identifier. Website can be seen as a collection of web pages with static content\n",
".Early websites were entirely developed only using HTML – Nowadays, some server-side application components and databases are used to dynamically\n",
"generate the content – However, still the content is not user tailored. Web application is a single page or a collection of web pages, with\n",
"interactive components to dynamically generate the content E-commerce is a large domain, which covers many related concepts like – Internet\n",
"marketing – Electronic fund transfer – Online transaction processing. E-commerce systems provide online buying and selling over the internet. There\n",
"is a large variety of types of ecommerce systems – Online goods/soft items(software, e-books, videos) – Retail services (travel, food, cloths) –\n",
"Marketing services (advertising, auctions) – Customer services (help centers, online banking). Advantages of e-commerce . To businesses – After the\n",
"capital cost, maintenance cost is low – Global customers – Increased market share. Disadvantages of e-commerce. To businesses – For physical items,\n",
"storing and distributing is needed – Need to update the system frequently – Depends on the power and the internet. DNS is a network, which consists\n",
"of Domain Name Servers . DNS helps to map the domain name to the IP address. Unified Resource Identifier (URI) 18 . URI is a string of characters\n",
"designed for unambiguous identification of resources. URI is extensible via the URI scheme. Unified Resource Name(URN) is a persistent, location-\n",
"independent identifier. Website can be seen as a collection of web pages with static content .Early websites were entirely developed only using HTML\n",
"– Nowadays, some server-side application components and databases are used to dynamically generate the content – However, still the content is not\n",
"user tailored. Web application is a single page or a collection of web pages, with interactive components to dynamically generate the content\n",
"E-commerce is a large domain, which covers many related concepts like – Internet marketing – Electronic fund transfer – Online transaction processing.\n",
"E-commerce systems provide online buying and selling over the internet. There is a large variety of types of ecommerce systems – Online goods/soft\n",
"items(software, e-books, videos) – Retail services (travel, food, cloths) – Marketing services (advertising, auctions) – Customer services (help\n",
"centers, online banking). Advantages of e-commerce . To businesses – After the capital cost, maintenance cost is low – Global customers – Increased\n",
"market share. Disadvantages of e-commerce. To businesses – For physical items, storing and distributing is needed – Need to update the system\n",
"frequently – Depends on the power and the internet The Internet is the global system of interconnected computer networks that use the Internet\n",
"protocol suite to link devices worldwide. ▪It is a network of networks ▪Consists of private, public, academic, business, and government networks of\n",
"local to global scope. ▪Linked by a broad array of electronic, wireless, and optical networking technologies. system of rules that allow two or more\n",
"entities of a communications system to transmit information (wiki) ▪ the formal system of rules for correct behavior on official occasions\n",
"(Cambridge Extensible markup language is Designed to store and transport data ,Both human- and machine\u0002readable (self descriptive) ,Often used for\n",
"distributing data over networks ,Used by may other tools like protocols. The main and the only component of XML is called an element . An element has\n",
"3 components 1. Start tag 2. Body 3. End tag. An element has a name . Element names are case-sensitive .Element names must start with a letter or\n",
"underscore . This is the XML declaration ◦ Provides the instructions for the processor to understand the details of the XML file ◦ Encoding\n",
"attribute indicates the character set ◦ UTF-8 = Unicode Transformation Format (with 8-bit blocks to represent a character) .An element may have\n",
"attribute(s) ◦ Attributes describe the element .Attribute value is always quoted (either single or double quote). Computer based systems can be\n",
"mainly divided into 2 types, according to the distribution of the components. Standalone Computer System - All the components are executed within a\n",
"single device, Do not need a network, Usually one or tightly coupled set of technologies are used to develop (JAVA, .NET). Distributed system- The\n",
"components are distributed and executed in multiple devices, Need a network, Multiple and loosely coupled set of technologies are used to develop\n",
"(HTML+CSS+JS + PHP). Client-server architecture (3-tier)- 3-tier architecture is used, when there is a need for data persistence and also to separate\n",
"the application logic from the data . This can be seen as an extension of 2-tier architecture. Client-server architecture (n-tier)- When there is a\n",
"need for further separation and distribution of the components, more tiers can be added and extend the 2-tier or 3-tier architecture into an n-tier\n",
"architecture. DNS is a network, which consists of Domain Name Servers . DNS helps to map the domain name to the IP address. Unified Resource\n",
"Identifier (URI) 18 . URI is a string of characters designed for unambiguous identification of resources. URI is extensible via the URI scheme.\n",
"Unified Resource Name(URN) is a persistent, location-independent identifier. Website can be seen as a collection of web pages with static content\n",
".Early websites were entirely developed only using HTML – Nowadays, some server-side application components and databases are used to dynamically\n",
"generate the content – However, still the content is not user tailored. Web application is a single page or a collection of web pages, with\n",
"interactive components to dynamically generate the content E-commerce is a large domain, which covers many related concepts like – Internet\n",
"marketing – Electronic fund transfer – Online transaction processing. E-commerce systems provide online buying and selling over the internet. There\n",
"is a large variety of types of ecommerce systems – Online goods/soft items(software, e-books, videos) – Retail services (travel, food, cloths) –\n",
"Marketing services (advertising, auctions) – Customer services (help centers, online banking). Advantages of e-commerce . To businesses – After the\n",
"capital cost, maintenance cost is low – Global customers – Increased market share. Disadvantages of e-commerce. To businesses – For physical items,\n",
"storing and distributing is needed – Need to update the system frequently – Depends on the power and the internet\n",
"\n",
"\n",
"Summarized Text >>\n",
"Network is a large system consisting of many similar parts that are connected together to allow movement or communication along the parts. In computer networks, computing devices exchange data with each other using connections between nodes (data links) the main and the only component of xml is\n",
"called an element - it has three components: start tag 2. Body 3. End tag.\n",
"\n",
"\n"
]
}
],
"source": [
"def postprocesstext(content):\n",
"    \"\"\"Upper-case the first character of every sentence in ``content``.\n",
"\n",
"    The text is split with ``sent_tokenize`` and the sentences are re-joined with a\n",
"    single space in front of each one, so a non-empty result starts with a space\n",
"    (callers that need a clean string strip it).\n",
"\n",
"    Args:\n",
"        content: Text to process.\n",
"\n",
"    Returns:\n",
"        The re-joined text; an empty string when no sentence is found.\n",
"    \"\"\"\n",
"    final = \"\"\n",
"    for sent in sent_tokenize(content):\n",
"        # Only the first character is touched: str.capitalize() would also lower-case\n",
"        # the rest of the sentence and turn acronyms such as \"XML\" into \"xml\".\n",
"        final = final + \" \" + sent[:1].upper() + sent[1:]\n",
"    return final\n",
"\n",
"def summarizer(\n",
"        text,\n",
"        model,\n",
"        tokenizer,\n",
"        max_len = 512\n",
"    ):\n",
"    \"\"\"Summarise ``text`` with a T5-style sequence-to-sequence model.\n",
"\n",
"    Args:\n",
"        text: Text to summarise; newlines are replaced by spaces before encoding.\n",
"        model: Model exposing ``generate()`` and a ``device`` attribute.\n",
"        tokenizer: Tokenizer matching ``model``.\n",
"        max_len: Maximum number of input tokens; longer input is truncated.\n",
"\n",
"    Returns:\n",
"        The generated summary after ``postprocesstext`` and ``strip``.\n",
"    \"\"\"\n",
"    # Collapse the text onto one line and add the task prefix used for summarisation.\n",
"    text = text.strip().replace(\"\\n\",\" \")\n",
"    text = \"summarize: \"+text\n",
"\n",
"    # `padding=False` is the current spelling of the deprecated `pad_to_max_length=False`.\n",
"    # The tensors follow the model's own device, so the function does not depend on a\n",
"    # notebook-level `device` variable.\n",
"    encoding = tokenizer(\n",
"        text,\n",
"        max_length=max_len,\n",
"        padding=False,\n",
"        truncation=True,\n",
"        return_tensors=\"pt\"\n",
"    ).to(model.device)\n",
"\n",
"    # Beam search with 3 beams and no repeated bigrams; min_length/max_length bound the\n",
"    # generated sequence to 75-300 tokens.\n",
"    outs = model.generate(\n",
"        input_ids=encoding[\"input_ids\"],\n",
"        attention_mask=encoding[\"attention_mask\"],\n",
"        early_stopping=True,\n",
"        num_beams=3,\n",
"        num_return_sequences=1,\n",
"        no_repeat_ngram_size=2,\n",
"        min_length = 75,\n",
"        max_length=300\n",
"    )\n",
"\n",
"    # Only one sequence is requested, so the first row is the summary.\n",
"    summary = tokenizer.decode(outs[0], skip_special_tokens=True)\n",
"    return postprocesstext(summary).strip()\n",
"\n",
"# Summarise the lesson text with the summarisation model and tokenizer loaded earlier.\n",
"summarized_text = summarizer(text, summary_model, summary_tokenizer)\n",
"\n",
"# Show the input and its summary, wrapped to 150 and 300 columns respectively;\n",
"# each section is followed by the same blank-line separator.\n",
"for heading, body, width in (\n",
"    (\"\\noriginal Text >>\", text, 150),\n",
"    (\"Summarized Text >>\", summarized_text, 300),\n",
"):\n",
"    print(heading)\n",
"    for line in wrap(body, width):\n",
"        print(line)\n",
"    print(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_nouns_multipartite(content):\n",
"    \"\"\"Extract up to 25 noun keyphrases from ``content`` with pke's MultipartiteRank.\n",
"\n",
"    Args:\n",
"        content: English text to analyse.\n",
"\n",
"    Returns:\n",
"        A list with the first element of every entry returned by ``get_n_best``\n",
"        (the keyphrase text), in the order pke ranked them. When extraction raises,\n",
"        the traceback is printed and an empty list is returned.\n",
"    \"\"\"\n",
"    try:\n",
"        extractor = pke.unsupervised.MultipartiteRank()\n",
"        extractor.load_document(input=content,language='en')\n",
"\n",
"        # Candidates are restricted to proper nouns (PROPN) and common nouns (NOUN).\n",
"        # NOTE(review): no stoplist is handed to pke here; whether pke applies a default\n",
"        # one depends on the installed version - confirm.\n",
"        extractor.candidate_selection(pos={'PROPN','NOUN'})\n",
"\n",
"        # Candidate keyphrases are weighted using the MultipartiteRank method.\n",
"        extractor.candidate_weighting(alpha=1.1,\n",
"                                      threshold=0.75,\n",
"                                      method='average')\n",
"\n",
"        return [val[0] for val in extractor.get_n_best(n=25)]\n",
"    except Exception:\n",
"        # Best effort: report the failure but keep the pipeline running. Catching\n",
"        # Exception (not everything) lets KeyboardInterrupt stop the notebook.\n",
"        traceback.print_exc()\n",
"        return []\n",
"\n",
"#Keep the keyphrases of the original text that also occur in the summary text\n",
"def get_keywords(\n",
"        originaltext,\n",
"        summarytext,\n",
"        n_questions = 15\n",
"    ):\n",
"    \"\"\"Return the ranked keyphrases of ``originaltext`` that appear in ``summarytext``.\n",
"\n",
"    Args:\n",
"        originaltext: Full text from which keyphrases are extracted.\n",
"        summarytext: Summary in which those keyphrases are searched.\n",
"        n_questions: Maximum number of keyphrases to return.\n",
"\n",
"    Returns:\n",
"        At most ``n_questions`` keyphrases, in the order ``get_nouns_multipartite``\n",
"        produced them.\n",
"    \"\"\"\n",
"    candidates = get_nouns_multipartite(originaltext)\n",
"\n",
"    # Register every candidate, then let flashtext report which ones the summary contains.\n",
"    matcher = KeywordProcessor()\n",
"    for phrase in candidates:\n",
"        matcher.add_keyword(phrase)\n",
"    in_summary = set(matcher.extract_keywords(summarytext))\n",
"\n",
"    # Walk the candidates (not the matches) so the extractor's ranking order is kept.\n",
"    ranked = [phrase for phrase in candidates if phrase in in_summary]\n",
"\n",
"    # A slice already copes with lists shorter than n_questions.\n",
"    return ranked[:n_questions]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"#Build a \"context: ... answer: ...\" prompt, generate a question with the model and decode it.\n",
"def get_question(context,answer,model,tokenizer):\n",
"    \"\"\"Generate one question whose answer is ``answer`` given ``context``.\n",
"\n",
"    Args:\n",
"        context: Text the question should be about.\n",
"        answer: Expected answer to the generated question.\n",
"        model: Model exposing ``generate()`` and a ``device`` attribute.\n",
"        tokenizer: Tokenizer matching ``model``.\n",
"\n",
"    Returns:\n",
"        The decoded question with any \"question:\" marker removed and surrounding\n",
"        whitespace stripped.\n",
"    \"\"\"\n",
"    text = \"context: {} answer: {}\".format(context,answer)\n",
"\n",
"    # `padding=False` is the current spelling of the deprecated `pad_to_max_length=False`.\n",
"    # Tensors are moved to the model's own device instead of a notebook-level `device`.\n",
"    encoding = tokenizer(\n",
"        text,\n",
"        max_length=384,\n",
"        padding=False,\n",
"        truncation=True,\n",
"        return_tensors=\"pt\"\n",
"    ).to(model.device)\n",
"\n",
"    # input_ids are the token ids; attention_mask marks which positions the model attends to.\n",
"    outs = model.generate(\n",
"        input_ids=encoding[\"input_ids\"],\n",
"        attention_mask=encoding[\"attention_mask\"],\n",
"        early_stopping=True,\n",
"        num_beams=5,\n",
"        num_return_sequences=1,\n",
"        no_repeat_ngram_size=2,\n",
"        max_length=72\n",
"    )\n",
"\n",
"    # Only one sequence is requested, so the first row is the question.\n",
"    question = tokenizer.decode(outs[0], skip_special_tokens=True)\n",
"    return question.replace(\"question:\",\"\").strip()\n",
"\n",
"#Generates distractors for a word from WordNet: the other hyponyms of the first hypernym of the synset `syn`\n",
"def get_distractors_wordnet(syn,word):\n",
"    \"\"\"Return sibling concepts of ``syn`` to be used as wrong answer options.\n",
"\n",
"    Args:\n",
"        syn: Synset chosen for ``word``; must provide ``hypernyms()``.\n",
"        word: The correct answer, which is excluded from the result.\n",
"\n",
"    Returns:\n",
"        A list of unique distractor names with underscores replaced by spaces and\n",
"        every word capitalised; empty when ``syn`` has no hypernym.\n",
"    \"\"\"\n",
"    distractors = []\n",
"    # Lemma names use underscores between words, so the answer is normalised to the\n",
"    # same form (lower case, underscores) before comparing. Comparing the\n",
"    # space-separated form would never filter out a multi-word answer.\n",
"    target = word.lower().replace(\" \",\"_\")\n",
"\n",
"    hypernym = syn.hypernyms() #hypernyms- a more general concept\n",
"    if len(hypernym) == 0:\n",
"        return distractors\n",
"\n",
"    # Only the first hypernym is used; its hyponyms are the siblings of `syn`.\n",
"    for item in hypernym[0].hyponyms():\n",
"        name = item.lemmas()[0].name()\n",
"        if name.lower() == target:\n",
"            continue\n",
"        name = \" \".join(w.capitalize() for w in name.replace(\"_\",\" \").split())\n",
"        if name not in distractors:\n",
"            distractors.append(name)\n",
"    return distractors\n",
"\n",
"#Choose the WordNet noun synset of a word in a sentence from Wu-Palmer max-similarity and adapted Lesk\n",
"def get_wordsense(sent,word):\n",
"    \"\"\"Pick the WordNet noun synset of ``word`` as used in ``sent``.\n",
"\n",
"    Two disambiguators are run (``max_similarity`` with 'wup' and ``adapted_lesk``);\n",
"    of their answers, the one that comes first in ``wn.synsets(word, 'n')`` is returned.\n",
"\n",
"    Args:\n",
"        sent: Sentence providing the context.\n",
"        word: Word or phrase to disambiguate; spaces are turned into underscores.\n",
"\n",
"    Returns:\n",
"        The selected synset, or ``None`` when WordNet has no noun synset for ``word``.\n",
"    \"\"\"\n",
"    word = word.lower().replace(\" \",\"_\")\n",
"\n",
"    synsets = wn.synsets(word,'n')\n",
"    if not synsets:\n",
"        return None\n",
"\n",
"    wup = max_similarity(sent, word, 'wup', pos='n')\n",
"    adapted_lesk_output = adapted_lesk(sent, word, pos='n')\n",
"\n",
"    # list.index() raises ValueError for None or for a synset that is not in `synsets`,\n",
"    # so only answers that are actually in the list are ranked.\n",
"    indices = [synsets.index(sense) for sense in (wup, adapted_lesk_output) if sense in synsets]\n",
"    if not indices:\n",
"        # Neither disambiguator produced a usable sense: fall back to the first synset.\n",
"        return synsets[0]\n",
"    return synsets[min(indices)]\n",
"\n",
"#Generates distractors using the ConceptNet API.\n",
"def get_distractors_conceptnet(word, timeout=10):\n",
"    \"\"\"Collect distractors for ``word`` from ConceptNet ``PartOf`` relations.\n",
"\n",
"    For every term that ``word`` is a part of, the labels of the other parts of that\n",
"    term are collected.\n",
"\n",
"    Args:\n",
"        word: The correct answer; labels containing it are skipped.\n",
"        timeout: Seconds to wait for each HTTP request before giving up.\n",
"\n",
"    Returns:\n",
"        A list of unique labels, in the order they were encountered.\n",
"\n",
"    Raises:\n",
"        requests.exceptions.RequestException: On network errors, timeouts or\n",
"            non-success HTTP status codes.\n",
"    \"\"\"\n",
"    from urllib.parse import quote\n",
"\n",
"    original_word = word.lower()\n",
"    # ConceptNet terms use underscores between words; quoting keeps characters such\n",
"    # as '&' or '#' in the word from altering the query string.\n",
"    node = quote(original_word.replace(\" \",\"_\"), safe=\"\")\n",
"\n",
"    distractor_list = []\n",
"    #Query the PartOf edges that start at the word.\n",
"    url = \"http://api.conceptnet.io/query?node=/c/en/%s/n&rel=/r/PartOf&start=/c/en/%s&limit=20\"%(node,node)\n",
"    # Without a timeout a stalled connection would block the notebook indefinitely.\n",
"    response = requests.get(url, timeout=timeout)\n",
"    response.raise_for_status()\n",
"\n",
"    for edge in response.json().get('edges', []):\n",
"        link = edge['end']['term']\n",
"\n",
"        #Query the PartOf edges that end at that term, i.e. its other parts.\n",
"        url2 = \"http://api.conceptnet.io/query?node=%s&rel=/r/PartOf&end=%s&limit=20\"%(link,link)\n",
"        response2 = requests.get(url2, timeout=timeout)\n",
"        response2.raise_for_status()\n",
"\n",
"        for part_edge in response2.json().get('edges', []):\n",
"            word2 = part_edge['start']['label']\n",
"\n",
"            if word2 not in distractor_list and original_word not in word2.lower():\n",
"                distractor_list.append(word2)\n",
"\n",
"    return distractor_list\n",
"\n",
"#Combines WordNet and ConceptNet to generate distractors for the answer `a` of the question `q`\n",
"def get_distractors_ensemble(q, a):\n",
"    \"\"\"Return distractors for answer ``a``, trying WordNet first and ConceptNet second.\n",
"\n",
"    Args:\n",
"        q: Question text, used as context to pick the WordNet sense of ``a``.\n",
"        a: The correct answer.\n",
"\n",
"    Returns:\n",
"        Distractors passed through ``str.capitalize()``; candidates whose lower-cased\n",
"        text is contained in the lower-cased answer are dropped. When any step raises,\n",
"        the traceback is printed and an empty list is returned.\n",
"    \"\"\"\n",
"    try:\n",
"        distractors = []\n",
"        wordsense = get_wordsense(q,a) #find WordNet synset\n",
"        if wordsense:\n",
"            distractors = get_distractors_wordnet(wordsense,a)\n",
"        # ConceptNet is the fallback both when there is no synset and when WordNet\n",
"        # yields nothing.\n",
"        if len(distractors) == 0:\n",
"            distractors = get_distractors_conceptnet(a)\n",
"\n",
"        answer_lower = a.lower()\n",
"        return [dis.capitalize() for dis in distractors if dis.lower() not in answer_lower]\n",
"    except Exception:\n",
"        # Best effort: a question without distractors is still usable, but the cause is\n",
"        # reported instead of being swallowed. KeyboardInterrupt is not caught.\n",
"        traceback.print_exc()\n",
"        return []\n",
" \n",
"def qna_generation_pipeline(text, n_questions):\n",
"    \"\"\"Generate multiple-choice question items from ``text``.\n",
"\n",
"    The text is summarised, keyphrases shared by text and summary become the answers,\n",
"    and for each answer a question and up to three distractors are produced. Uses the\n",
"    notebook-level summary/question models and tokenizers.\n",
"\n",
"    Args:\n",
"        text: Source text.\n",
"        n_questions: Maximum number of items to generate.\n",
"\n",
"    Returns:\n",
"        A list of dicts with the keys \"question\", \"answer\" and \"choices\".\n",
"    \"\"\"\n",
"    summarized_text = summarizer(text,summary_model,summary_tokenizer)\n",
"    imp_keywords = get_keywords(text,summarized_text, n_questions = n_questions)\n",
"\n",
"    data = []\n",
"    for keyword in imp_keywords:\n",
"        ques = get_question(summarized_text,keyword,question_model,question_tokenizer)\n",
"        #Distractors are looked up with the keyword as extracted, before it is capitalised.\n",
"        distractors = get_distractors_ensemble(ques, keyword)\n",
"        answer = keyword.capitalize()\n",
"\n",
"        #The answer plus at most three distractors, in random order.\n",
"        choices = [answer] + distractors[:3]\n",
"        random.shuffle(choices)\n",
"\n",
"        data.append({\"question\": ques, \"answer\": answer, \"choices\": choices})\n",
"\n",
"    return data\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'question': 'What is a large system consisting of many similar parts?',\n",
" 'answer': 'Network',\n",
" 'choices': ['Dragnet', 'Body', 'Economy', 'Network']},\n",
" {'question': 'What is a network?',\n",
" 'answer': 'System',\n",
" 'choices': ['Association', 'Arrangement', 'System', 'Actinoid']},\n",
" {'question': 'What is the only component of an xml called?',\n",
" 'answer': 'Component',\n",
" 'choices': ['Affinity', 'Business relation', 'Association', 'Component']},\n",
" {'question': 'What do computers exchange using connections between nodes?',\n",
" 'answer': 'Data',\n",
" 'choices': ['Agglomeration', 'Ana', 'Data', 'Armamentarium']},\n",
" {'question': 'Where do computing devices exchange data with each other?',\n",
" 'answer': 'Computer networks',\n",
" 'choices': ['Early warning system', 'Superhighway', 'Computer networks']},\n",
" {'question': 'What exchange data with each other using connections between nodes?',\n",
" 'answer': 'Devices',\n",
" 'choices': ['Container', 'Devices', 'Connection', 'Ceramic']},\n",
" {'question': 'What is the main component of an xml called?',\n",
" 'answer': 'Element',\n",
" 'choices': ['Element', 'Bottleneck', 'Bit', 'Appendage']},\n",
" {'question': 'What is a network composed of?',\n",
" 'answer': 'Parts',\n",
" 'choices': ['Parts', 'Association', 'Business relation', 'Affinity']},\n",
" {'question': 'What are data links?',\n",
" 'answer': 'Nodes',\n",
" 'choices': ['Articulation', 'Nodes']}]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Generate question/answer items from `text`; the second argument is the maximum number of questions to generate.\n",
"data = qna_generation_pipeline(text, 20)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "tf210",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\GAMER\\AppData\\Roaming\\Python\\Python38\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import warnings\n",
"import numpy as np\n",
"import pandas as pd\n",
"import PyPDF2, re, os\n",
"from datasets import Dataset\n",
"from transformers import DataCollatorForSeq2Seq\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, \\\n",
" Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"model_card = \"t5-base\" #name of the pre-trained T5 checkpoint\n",
"tokenizer = AutoTokenizer.from_pretrained(model_card) #tokenizer matching the checkpoint\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_card) #pre-trained model for sequence-to-sequence tasks\n",
"model.to('cuda') #moves the model to the GPU (assumes a CUDA device is present)\n",
"\n",
"# The collator expects the model object, not the checkpoint name: a plain string is\n",
"# silently ignored, whereas the model lets it prepare decoder inputs from the labels.\n",
"data_collator = DataCollatorForSeq2Seq(\n",
"    tokenizer=tokenizer,\n",
"    model=model\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#Load lesson/summary pairs from an Excel file and split them into training and test sets\n",
"def load_data(lesson_dir = 'data/PPT1Dtaa.xlsx', test_size = 0.1, seed = 42):\n",
"    \"\"\"Read the lesson/summary spreadsheet and build train and test datasets.\n",
"\n",
"    Args:\n",
"        lesson_dir: Path of the Excel file; it must contain the columns\n",
"            'Normal_Tesxt' and 'Summarized_text'.\n",
"        test_size: Share of the rows placed in the test split.\n",
"        seed: Seed for the shuffle performed by the split, so that re-running the\n",
"            notebook reproduces the same train/test partition.\n",
"\n",
"    Returns:\n",
"        ``(train_dataset, test_dataset, lessons, summaries)`` where the last two are\n",
"        the full, unsplit lists of texts.\n",
"    \"\"\"\n",
"    df = pd.read_excel(lesson_dir)\n",
"    # Column names are looked up exactly as spelled here; rows missing either text are dropped.\n",
"    df = df[['Normal_Tesxt', 'Summarized_text']]\n",
"    df = df.dropna(subset=['Normal_Tesxt', 'Summarized_text'])\n",
"    lessons = df['Normal_Tesxt'].tolist()\n",
"    summaries = df['Summarized_text'].tolist()\n",
"\n",
"    # Build the dataset from the two parallel lists.\n",
"    dataset = Dataset.from_dict({'lessons': lessons, 'summaries': summaries})\n",
"\n",
"    # Hold out `test_size` of the rows (10% by default) for evaluation.\n",
"    split = dataset.train_test_split(test_size=test_size, seed=seed)\n",
"    return split['train'], split['test'], lessons, summaries"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"train_dataset, test_dataset, lessons, summaries = load_data()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Lesson token length: \n",
"count 72.000000\n",
"mean 94.319444\n",
"std 21.845054\n",
"min 35.000000\n",
"25% 79.500000\n",
"50% 93.000000\n",
"75% 109.000000\n",
"max 162.000000\n",
"dtype: float64\n",
"\n",
"Summary token length: \n",
"count 72.000000\n",
"mean 78.125000\n",
"std 24.467246\n",
"min 34.000000\n",
"25% 60.000000\n",
"50% 76.000000\n",
"75% 90.250000\n",
"max 136.000000\n",
"dtype: float64\n"
]
}
],
"source": [
"# Token-count statistics: length of tokenizer.encode() for every lesson and every summary\n",
"lesson_token_lengths = pd.Series(list(map(len, map(tokenizer.encode, lessons))))\n",
"summary_token_lengths = pd.Series(list(map(len, map(tokenizer.encode, summaries))))\n",
"\n",
"lesson_stats = lesson_token_lengths.describe()\n",
"summary_stats = summary_token_lengths.describe()\n",
"\n",
"print(f\"Lesson token length: \\n{lesson_stats}\")\n",
"print(f\"\\nSummary token length: \\n{summary_stats}\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Map: 0%| | 0/64 [00:00<?, ? examples/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Map: 100%|██████████| 64/64 [00:00<00:00, 3378.29 examples/s]\n",
"Map: 100%|██████████| 8/8 [00:00<00:00, 1146.02 examples/s]\n"
]
}
],
"source": [
"# Tokenize the lesson/summary pairs for sequence-to-sequence training\n",
"prefix = \"summarize: \"\n",
"\n",
"def preprocess_function(examples):\n",
"    \"\"\"Tokenize one batch of examples.\n",
"\n",
"    Args:\n",
"        examples: Batch with the columns \"lessons\" and \"summaries\".\n",
"\n",
"    Returns:\n",
"        The tokenized, prefixed lessons with the summary token ids stored under\n",
"        \"labels\".\n",
"    \"\"\"\n",
"    prefixed_lessons = [prefix + doc for doc in examples[\"lessons\"]]\n",
"    # Limits are counted in tokens (not words); longer texts are truncated.\n",
"    model_inputs = tokenizer(prefixed_lessons, max_length=160, truncation=True)\n",
"    target_encoding = tokenizer(text_target=examples[\"summaries\"], max_length=135, truncation=True)\n",
"    model_inputs[\"labels\"] = target_encoding[\"input_ids\"]\n",
"    return model_inputs\n",
"\n",
"# Apply the tokenization to both splits, batch-wise.\n",
"train_dataset, test_dataset = (\n",
"    split.map(preprocess_function, batched=True)\n",
"    for split in (train_dataset, test_dataset)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\Edu Me\\PP2\\files/lesson-summarization-qna is already a clone of https://huggingface.co/HiranyaDilukshi/lesson-summarization-qna. Make sure you pull the latest changes with `repo.git_pull()`.\n",
" 0%| | 0/640 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
" 4%|▍ | 27/640 [00:59<20:21, 1.99s/it]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 4%|▍ | 28/640 [01:01<20:55, 2.05s/it]"
]
}
],
"source": [
"# Configure and start fine-tuning of the seq2seq model for text summarization\n",
"training_args = Seq2SeqTrainingArguments(\n",
"    # Output location and checkpointing; push_to_hub also uploads to the Hugging Face Hub.\n",
"    output_dir=\"files/lesson-summarization-qna\",\n",
"    push_to_hub=True,\n",
"    save_steps=400,\n",
"    save_total_limit=3,\n",
"    # Optimisation: one example per step, mixed precision.\n",
"    num_train_epochs=10,\n",
"    per_device_train_batch_size=1,\n",
"    learning_rate=2e-5,\n",
"    weight_decay=0.01,\n",
"    fp16=True,\n",
"    # Step-based evaluation with generated predictions, and logging cadence.\n",
"    evaluation_strategy=\"steps\",\n",
"    per_device_eval_batch_size=1,\n",
"    predict_with_generate=True,\n",
"    logging_steps=200\n",
"    )\n",
"\n",
"trainer = Seq2SeqTrainer(\n",
"    args=training_args,\n",
"    model=model,\n",
"    tokenizer=tokenizer,\n",
"    data_collator=data_collator,\n",
"    train_dataset=train_dataset,\n",
"    eval_dataset=test_dataset\n",
"    )\n",
"\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'trainer' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32md:\\Edu Me\\PP2\\summarization-qna.ipynb Cell 8\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/d%3A/Edu%20Me/PP2/summarization-qna.ipynb#X10sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m trainer\u001b[39m.\u001b[39msave_model(\u001b[39m'\u001b[39m\u001b[39m./weights/lesson-summarization-qna\u001b[39m\u001b[39m'\u001b[39m)\n",
"\u001b[1;31mNameError\u001b[0m: name 'trainer' is not defined"
]
}
],
"source": [
"trainer.save_model('./weights/lesson-summarization-qna')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'pipeline' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32md:\\Edu Me\\PP2\\summarization-qna.ipynb Cell 9\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/d%3A/Edu%20Me/PP2/summarization-qna.ipynb#X11sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m pipeline_lesson \u001b[39m=\u001b[39m pipeline(\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Edu%20Me/PP2/summarization-qna.ipynb#X11sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m task\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39msummarization\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Edu%20Me/PP2/summarization-qna.ipynb#X11sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m model\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m./weights/lesson-summarization-qna\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Edu%20Me/PP2/summarization-qna.ipynb#X11sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m device\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Edu%20Me/PP2/summarization-qna.ipynb#X11sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m )\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Edu%20Me/PP2/summarization-qna.ipynb#X11sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mTrained Model Loaded !!!\u001b[39m\u001b[39m\"\u001b[39m)\n",
"\u001b[1;31mNameError\u001b[0m: name 'pipeline' is not defined"
]
}
],
"source": [
"# Load the fine-tuned summarization model from the local weights directory.\n",
"# The imports are local so this cell also runs on a fresh kernel, without\n",
"# re-running the training cells (the recorded run failed with\n",
"# NameError: name 'pipeline' is not defined).\n",
"import torch\n",
"from transformers import pipeline\n",
"\n",
"pipeline_lesson = pipeline(\n",
"    task=\"summarization\",\n",
"    model=\"./weights/lesson-summarization-qna\",  # directory the trained model was saved to\n",
"    device=0 if torch.cuda.is_available() else -1  # first GPU when present, otherwise CPU\n",
"    )\n",
"\n",
"print(\"Trained Model Loaded !!!\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch113",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment