Commit 30d3d546 authored by Gangoda G.G.W.N

Upload Name Entity Recognition

parent a50c5cf3
{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": [],
   "authorship_tag": "ABX9TyOA3HAnMZc5/QOztx0zB/cy"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Named Entity Recognition\n",
    "\n",
    "Tokenize a short biography with NLTK, remove stop words, compare stemming with lemmatizing, and then extract named entities with spaCy from both the preprocessed text and the raw text.\n",
    "\n",
    "The notebook is meant to be run top to bottom on a fresh kernel (Restart & Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "BzBuNs92xkf7"
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "import spacy\n",
    "\n",
    "from IPython.display import HTML\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from spacy import displacy\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "w3SrGk_UyEmL"
   },
   "outputs": [],
   "source": [
    "# Newer NLTK releases load the tokenizer from \"punkt_tab\" while older ones use\n",
    "# \"punkt\", so both are requested to keep this cell working across versions.\n",
    "NLTK_RESOURCES = (\"punkt\", \"punkt_tab\", \"stopwords\", \"wordnet\")\n",
    "\n",
    "for resource in NLTK_RESOURCES:\n",
    "    nltk.download(resource)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "S8Bp0pkIyMZW"
   },
   "source": [
    "## Input text\n",
    "\n",
    "The sample is kept exactly as supplied. Several sentences have no space after the full stop or comma (for example `Willowdale.His`), so the word tokenizer keeps such pairs as a single token."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1JI2BAbRyOyY"
   },
   "outputs": [],
   "source": [
    "text=\"John Smith, a seasoned engineer, hails from the picturesque town of Willowdale.His address, 123 Elm Street, is nestled amid the rolling hills,offering an idyllic backdrop for his work. At 40 years old,John brings a wealth of experience to the table, having spent two decades mastering his craft.His qualifications include a Bachelor's degree in Mechanical Engineering from Willowdale University,and a Master's in Sustainable Energy from Green Valley Institute. With his diverse knowledge and expertise,John has successfully spearheaded several eco-friendly projects, making him a renowned figure in sustainable engineering,contributing to both his community and the world at large.\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EtU6z5ehyHqx"
   },
   "source": [
    "## Word tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "hcDqGBtuyHH0"
   },
   "outputs": [],
   "source": [
    "tokens = word_tokenize(text)\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KQOw8lRSy90M"
   },
   "source": [
    "## Remove stop words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jQkRIIary-9z"
   },
   "outputs": [],
   "source": [
    "# Load the English stop-word list (fetched by the download cell above).\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "\n",
    "# casefold() so that capitalised stop words such as 'At' and 'With' are removed too.\n",
    "non_stop_words = [token for token in tokens if token.casefold() not in stop_words]\n",
    "print(non_stop_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1nomMpIW0QPD"
   },
   "source": [
    "## Stemming\n",
    "\n",
    "Shown for comparison only: the stems are lowercased and truncated, so they are not passed on to the later steps."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "HgZlHbI90QD9"
   },
   "outputs": [],
   "source": [
    "# SnowballStemmer(\"english\") can be swapped in here as an alternative.\n",
    "stemmer = PorterStemmer()\n",
    "\n",
    "stemmed_words = [stemmer.stem(token) for token in non_stop_words]\n",
    "print(stemmed_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "w1HW7yMt0j41"
   },
   "source": [
    "## Lemmatizing\n",
    "\n",
    "The lemmatizer is applied to the stop-word-filtered tokens, not to the stems. Stems such as `engin` are not dictionary words, so the lemmatizer leaves them unchanged, and a stem like `divers` is wrongly reduced to `diver`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "inJoDXRn0lJS"
   },
   "outputs": [],
   "source": [
    "lemmatizer = WordNetLemmatizer()\n",
    "lemmatized_words = [lemmatizer.lemmatize(token) for token in non_stop_words]\n",
    "print(lemmatized_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "AAHB9m0p0y5S"
   },
   "outputs": [],
   "source": [
    "text_to_analyzed = \" \".join(lemmatized_words)\n",
    "print(text_to_analyzed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kTS99ALW08t8"
   },
   "source": [
    "## Named entity recognition with spaCy\n",
    "\n",
    "The model and the labels of interest are configured once, and a single helper runs the same analysis on any input text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SPACY_MODEL = \"en_core_web_sm\"  # alternatives: \"en_core_web_md\", \"en_core_web_lg\"\n",
    "\n",
    "# Result name -> spaCy entity label.\n",
    "# There is no \"AGE\" label in the en_core_web_* models; ages such as\n",
    "# '40 years old' come back as DATE.\n",
    "ENTITY_GROUPS = {\n",
    "    \"person\": \"PERSON\",\n",
    "    \"place\": \"GPE\",\n",
    "    \"date\": \"DATE\",\n",
    "    \"organization\": \"ORG\",\n",
    "    \"products\": \"PRODUCT\",\n",
    "    \"locations\": \"LOC\",\n",
    "}\n",
    "\n",
    "# Labels highlighted in the displaCy rendering.\n",
    "HIGHLIGHT_LABELS = [\"PERSON\", \"GPE\", \"DATE\"]\n",
    "\n",
    "nlp = spacy.load(SPACY_MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_entities(doc):\n",
    "    \"\"\"Return {group name: [entity text, ...]} for the labels in ENTITY_GROUPS.\"\"\"\n",
    "    group_for_label = {label: group for group, label in ENTITY_GROUPS.items()}\n",
    "    grouped = {group: [] for group in ENTITY_GROUPS}\n",
    "    for ent in doc.ents:\n",
    "        group = group_for_label.get(ent.label_)\n",
    "        if group is not None:\n",
    "            grouped[group].append(ent.text)\n",
    "    return grouped\n",
    "\n",
    "\n",
    "def analyze_entities(source_text, html_path):\n",
    "    \"\"\"Run NER on source_text and save the highlighted HTML to html_path.\n",
    "\n",
    "    Returns a tuple (grouped entities, HTML markup).\n",
    "    \"\"\"\n",
    "    doc = nlp(source_text)\n",
    "    # jupyter=False makes displaCy return the markup as a string; in Jupyter\n",
    "    # mode it renders inline and returns None, which cannot be written to a file.\n",
    "    html = displacy.render(\n",
    "        doc, style=\"ent\", options={\"ents\": HIGHLIGHT_LABELS}, jupyter=False\n",
    "    )\n",
    "    with open(html_path, \"w\", encoding=\"utf-8\") as file:\n",
    "        file.write(html)\n",
    "    return extract_entities(doc), html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Entities in the preprocessed text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "DSKRtwfk1IWO"
   },
   "outputs": [],
   "source": [
    "entities_preprocessed, html_preprocessed = analyze_entities(\n",
    "    text_to_analyzed, \"highlighted_text_preprocessed.html\"\n",
    ")\n",
    "for group, texts in entities_preprocessed.items():\n",
    "    print(f\"{group}: {texts}\")\n",
    "\n",
    "HTML(html_preprocessed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Entities in the raw text\n",
    "\n",
    "The same analysis on the untouched input, for comparison with the preprocessed result above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "f8RdV6h0Cu50"
   },
   "outputs": [],
   "source": [
    "entities_raw, html_raw = analyze_entities(text, \"highlighted_text.html\")\n",
    "for group, texts in entities_raw.items():\n",
    "    print(f\"{group}: {texts}\")\n",
    "\n",
    "HTML(html_raw)"
   ]
  }
 ]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment