Commit 08de5812 authored by S.T. Galappaththi

Merge branch 'master' of http://gitlab.sliit.lk/2023-118/2023-118 into IT20167264

# Conflicts:
#	OuterScopeTesting.ipynb
parents af11c45e ef835ac5
@@ -66,3 +66,5 @@ env/
# Vector files
*.vec
README.md
\ No newline at end of file
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Testing from outside"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"from sinhala_data_processor.singlish.hybrid_transliterator import HybridTransliterator, RuleBasedTransliterator"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"\u001B[1;32m~\\AppData\\Local\\Temp/ipykernel_21020/1015352440.py\u001B[0m in \u001B[0;36m<module>\u001B[1;34m\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[0mtext\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"mama oyaata aadareyi\"\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 4\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 5\u001B[1;33m \u001B[0mo\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mtransliterator\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mtext\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mtext\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m",
"\u001B[1;32m~\\PycharmProjects\\SinhalaDataProcessor\\sinhala_data_processor\\singlish\\hybrid_transliterator.py\u001B[0m in \u001B[0;36mtransliterator\u001B[1;34m(self, text)\u001B[0m\n\u001B[0;32m 117\u001B[0m \u001B[1;32mcontinue\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 118\u001B[0m \u001B[1;31m# Get GPT response for level 0\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 119\u001B[1;33m \u001B[0mgpt_response_1\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m__get_gpt_response\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mtext\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0msentence\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mlevel\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;36m0\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 120\u001B[0m \u001B[0mdictionary_1\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mjson\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mloads\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mgpt_response_1\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 121\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32m~\\PycharmProjects\\SinhalaDataProcessor\\sinhala_data_processor\\singlish\\hybrid_transliterator.py\u001B[0m in \u001B[0;36m__get_gpt_response\u001B[1;34m(self, text, level, word)\u001B[0m\n\u001B[0;32m 78\u001B[0m \u001B[0mresult\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mcompletion\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mchoices\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;36m0\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mmessage\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mcontent\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 79\u001B[0m \u001B[0msleep_time\u001B[0m \u001B[1;33m=\u001B[0m \u001B[1;36m2\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m---> 80\u001B[1;33m \u001B[0mtime\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0msleep\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0msleep_time\u001B[0m\u001B[1;33m)\u001B[0m \u001B[1;31m# To avoid rate limit\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 81\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mresult\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 82\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mException\u001B[0m \u001B[1;32mas\u001B[0m \u001B[0me\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;31mKeyboardInterrupt\u001B[0m: "
]
}
],
"source": [
"o = HybridTransliterator()\n",
"\n",
"text=\"mama oyaata aadareyi\"\n",
"\n",
"o.transliterator(text=text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"i = RuleBasedTransliterator()\n",
"\n",
"text=\"mama oyaata aadareyi\"\n",
"\n",
"i.transliterator(text=text)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sinhala_data_processor.singlish.machine_transliterator import MachineTransliterator\n",
"\n",
"transliterator = MachineTransliterator(model=\"resources/models/cc.si.300.vec\")\n",
"input_word = \"oyaata kohomada\"\n",
"\n",
"out = transliterator.transliterator(input_word)\n",
"print(out)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sinhala_data_processor.singlish.hybrid_transliterator import HybridTransliterator\n",
"\n",
"a = HybridTransliterator()\n",
"\n",
"a.view_prompt(0)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"ename": "AttributeError",
"evalue": "'GrammarMain' object has no attribute 'mapper'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mAttributeError\u001B[0m Traceback (most recent call last)",
"\u001B[1;32m~\\AppData\\Local\\Temp\\ipykernel_27136\\1852559308.py\u001B[0m in \u001B[0;36m<module>\u001B[1;34m\u001B[0m\n\u001B[0;32m 4\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 5\u001B[0m \u001B[0msentence\u001B[0m \u001B[1;33m=\u001B[0m \u001B[1;34m\"මම මේ ටික හොඳට බලන\"\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 6\u001B[1;33m \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mobj\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mmapper\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0msentence\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0msentence\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 7\u001B[0m \u001B[0mprint\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mout\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;31mAttributeError\u001B[0m: 'GrammarMain' object has no attribute 'mapper'"
]
}
],
"source": [
"from sinhala_data_processor.grammar_rule.grammar_main import GrammarMain\n",
"\n",
"obj = GrammarMain()\n",
"\n",
"sentence = \"දරුවා වෙහෙස මහන්සියෙන් ඉගෙන ගන්නවා\"\n",
"out = obj.mapper(sentence=sentence)\n",
"print(out)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-08-22T22:31:14.201083400Z",
"start_time": "2023-08-22T22:31:14.170322400Z"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching File\n",
"Converting audio transcripts into text ...\n",
"කුඹුර ගොවියාට වී ලබාගැනීමට උපකාරී වීම වශයෙන් පිහිටවන්න කි\n"
]
}
],
"source": [
"from sinhala_data_processor.sinhala_audio.audio_to_text import conversion\n",
"path=\"resources/datasets/IT20167264/test_audio/pn_sin_01_00001.wav\"\n",
"\n",
"text = conversion(path=path)\n",
"print(text)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-08-22T11:28:38.382431Z",
"start_time": "2023-08-22T11:28:37.738017500Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# 2023-118
# Sinhala Data Cleaning and Preprocessing Library
Welcome to the Sinhala Data Cleaning and Preprocessing Library! This comprehensive Python library is designed to streamline the process of cleaning and preprocessing Sinhala text data for various natural language processing (NLP) tasks. Our library addresses the unique challenges posed by the Sinhala language, providing efficient solutions for data cleaning, text summarization, and more.
## Objective
Our primary objective is to create an all-encompassing library that enhances the quality and accessibility of Sinhala text data. Through machine learning techniques and innovative approaches, we aim to empower users across industries to efficiently process, understand, and utilize Sinhala language content.
## Key Research Questions
### Sameera W.G.G.A.S - IT20146924
- How can Singlish text be accurately and efficiently converted to Sinhala language using machine learning techniques?
- What approaches can be taken to support different variations of Singlish text and real-time conversion?
- How can integration with other systems be seamlessly achieved to enhance user experience?
### Galappaththi S.T - IT20167264
- What are the key challenges in accurately converting Sinhala speech to text while maintaining grammar rules?
- How can verbal Sinhala text be effectively converted to written text while adhering to the language's specific grammar?
- What machine learning and deep learning models can be utilized for optimal performance in Sinhala language processing?
### Yasodya P.B.B - IT20227586
- What are the limitations of existing tools for Sinhala text cleaning and preprocessing, and how can they be addressed?
- How can hybrid approaches combining rule-based and machine learning methods improve the accuracy and scalability of text processing?
- What strategies can be implemented to support various Sinhala text variations and promote integration with other systems?
### Wijesinghe W.R.A.S.S - IT20181406
- What are the current challenges in summarizing Sinhala news articles, and how can they be overcome?
- How can the library effectively combine extractive and abstractive summarization techniques to generate high-quality summaries?
- What methods can be employed to provide users with customizable summarization lengths, formats, and translation options?
## Individual Objectives
### Sameera W.G.G.A.S - IT20146924
Develop a Python library for converting Singlish text to Sinhala language. Our library leverages deep learning models for accurate and real-time conversion, supporting various Singlish variations and integration options.
### Galappaththi S.T - IT20167264
Create a comprehensive Sinhala Data Cleaning and Preprocessing Library. Focus areas include accurate Sinhala speech-to-text and verbal text-to-written text conversion, machine learning integration, and real-time performance optimization.
### Yasodya P.B.B - IT20227586
Develop a robust Python library for cleaning and preprocessing Sinhala text data. Employ hybrid approaches combining rule-based and machine learning methods for accuracy and scalability. Prioritize support for various Sinhala text variations and integration with other systems.
### Wijesinghe W.R.A.S.S - IT20181406
Design an advanced Sinhala text summarization library with extractive and abstractive techniques. Offer customizable summarization lengths and formats, and integrate translation features for wider accessibility of Sinhala news content.
## Contact
For inquiries or feedback, please contact us at sssbprojects@gmail.com.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import openai\n",
"import json\n",
"import time\n",
"\n",
"def read_json_config(file_path):\n",
" try:\n",
" with open(file_path, 'r') as json_file:\n",
" json_data = json.load(json_file)\n",
" return json_data\n",
" except Exception as e:\n",
" print(f\"Error while reading JSON configuration file '{file_path}': {str(e)}\")\n",
" return {}\n",
"\n",
"def get_gpt_response(text: str, json_data: dict, level: int, word: str = \"\"):\n",
" completion = None\n",
" try:\n",
" openai.api_key = json_data['api_key']\n",
" openai.organization = json_data['org_key']\n",
" user_prompt = json_data['Prompts'][level]['content'].replace(\"{{masked-sentence}}\", text).replace(\"{{misspelled-word}}\", word)\n",
" if not text.strip():\n",
" raise ValueError(\"Text is empty. Please provide a valid text string.\")\n",
"\n",
" success = False\n",
" while not success:\n",
" try:\n",
" completion = openai.ChatCompletion.create(\n",
" model=json_data['model'],\n",
" messages=[\n",
" {\n",
" \"role\": json_data['Prompts'][level]['role'],\n",
" \"content\": user_prompt\n",
" }\n",
" ],\n",
" n=1,\n",
" temperature=json_data['temperature'],\n",
" max_tokens=json_data['max_tokens'],\n",
" top_p=json_data['Top_P'],\n",
" frequency_penalty=json_data['Frequency_penalty'],\n",
" presence_penalty=json_data['Presence_penalty']\n",
" )\n",
" success = True\n",
" except Exception as e:\n",
" sleep_time = 2\n",
" time.sleep(sleep_time)\n",
" print(\"Error:\", e)\n",
" print(\"Retrying...\")\n",
"\n",
" result = completion.choices[0].message.content\n",
" sleep_time = 2\n",
" time.sleep(sleep_time) # to avoid rate limit\n",
" return result\n",
" except Exception as e:\n",
" print(f\"Error in GPT response for text '{text}': {str(e)}\")\n",
" return \"\"\n",
"\n",
"def remove_duplicates(input_list):\n",
" unique_words = set()\n",
" result = []\n",
"\n",
" for word in input_list:\n",
" if word not in unique_words:\n",
" unique_words.add(word)\n",
" result.append(word)\n",
"\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"def main(config_file: str, text: str):\n",
" json_data = read_json_config(config_file)\n",
"\n",
" gpt_response_1 = get_gpt_response(text=text, json_data=json_data, level=0)\n",
" dictionary_1 = json.loads(gpt_response_1)\n",
" print(f\"Sentence: {text}\")\n",
"\n",
" primary_list_item = 0 # Initialize the index for the while loop\n",
" dictionary_list = dictionary_1[\"word_list\"]\n",
" dictionary_list = remove_duplicates(dictionary_list)\n",
" print(f\"Misspelled Word List: {dictionary_list}\")\n",
" while primary_list_item < len(dictionary_1[\"word_list\"]):\n",
" for list_item in dictionary_list:\n",
" print(f\"Word to Mask: {list_item}\")\n",
" if text.count(list_item) > 1:\n",
" text = text.replace(list_item, \"<mask>\", 1)\n",
" print(f\"Masked Sentence: {text}\")\n",
" dictionary_1[\"word_list\"].append(list_item)\n",
" dictionary_list.remove(list_item)\n",
" dictionary_list.append(list_item)\n",
" gpt_response_2 = get_gpt_response(text=text, json_data=json_data, level=1, word=list_item)\n",
" print(f\"Predicted Word for Mask: {gpt_response_2}\")\n",
" dictionary_2 = json.loads(gpt_response_2)\n",
" if list_item in dictionary_2:\n",
" text = text.replace(\"<mask>\", dictionary_2[list_item])\n",
" print(f\"Mask Replaced Sentence: {text}\")\n",
" break\n",
" elif text.count(list_item) == 1:\n",
" text = text.replace(list_item, \"<mask>\", 1)\n",
" print(f\"Masked Sentence: {text}\")\n",
" gpt_response_2 = get_gpt_response(text=text, json_data=json_data, level=1, word=list_item)\n",
" print(f\"Predicted Word for Mask: {gpt_response_2}\")\n",
" dictionary_2 = json.loads(gpt_response_2)\n",
" if list_item in dictionary_2:\n",
" text = text.replace(\"<mask>\", dictionary_2[list_item])\n",
" print(f\"Mask Replaced Sentence: {text}\")\n",
" break\n",
"\n",
" primary_list_item += 1\n",
" return text"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"# conf_file_path = \"../../resources/configuration/IT20146924/config.json\"\n",
"# masked_sentence = 'මම ඔයාට <mask>'\n",
"# masked_word = 'ආඩරෙයි'\n",
"# main(conf_file_path, masked_sentence, masked_word)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"# from sinhala_data_processor.singlish.rulebased_transliterator import RuleBasedTransliterator\n",
"#\n",
"# transliterator = RuleBasedTransliterator()\n",
"# input_word = \"kohomada oyaata. mamanam hondhin\"\n",
"#\n",
"# out = transliterator.transliterator(input_word)\n",
"# print(out)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: කොහොමඩ ඔයාට. කොහොමඩ\n",
"Misspelled Word List: ['කොහොමඩ']\n",
"Word to Mask: කොහොමඩ\n",
"Masked Sentence: <mask> ඔයාට. කොහොමඩ\n",
"Predicted Word for Mask: {\n",
" \"කොහොමඩ\": \"කොහෟඩ්\",\n",
" \"category\": \"complement\"\n",
"}\n",
"Mask Replaced Sentence: කොහෟඩ් ඔයාට. කොහොමඩ\n",
"Word to Mask: කොහොමඩ\n",
"Masked Sentence: කොහෟඩ් ඔයාට. <mask>\n",
"Predicted Word for Mask: {\n",
" \"කොහොමඩ\": \"කොහොමද්දී\",\n",
" \"category\": \"object\"\n",
"}\n",
"Mask Replaced Sentence: කොහෟඩ් ඔයාට. කොහොමද්දී\n",
"Word to Mask: කොහොමඩ\n",
"කොහෟඩ් ඔයාට. කොහොමද්දී\n"
]
}
],
"source": [
"conf_file_path = \"../../resources/configuration/IT20146924/config.json\"\n",
"masked_sentence = \"කොහොමඩ ඔයාට. කොහොමඩ\"\n",
"sen = main(conf_file_path, masked_sentence)\n",
"print(sen)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2023-05-20T15:34:17.370768300Z",
"start_time": "2023-05-20T15:34:17.186405800Z"
}
},
"outputs": [],
"source": [
"# import library\n",
"import speech_recognition as sr"
]
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"# Initialize recognizer class (for recognizing the speech)\n",
"r = sr.Recognizer()\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-20T15:35:10.086838500Z",
"start_time": "2023-05-20T15:35:10.067440800Z"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# Reading Audio file as source\n",
"# listening the audio file and store in audio_text variable\n",
"def startConvertion(path='recorded_1.wav', lang='si-LK'):\n",
" with sr.AudioFile(path) as source:\n",
" print('Fetching File')\n",
" audio_text = r.listen(source)\n",
" # recoginize_() method will throw a request error if the API is unreachable, hence using exception handling\n",
" try:\n",
"\n",
" # using google speech recognition\n",
" print('Converting audio transcripts into text ...')\n",
" text = r.recognize_google(audio_text, language=lang)\n",
" print(text)\n",
"\n",
" except:\n",
" print('Sorry.. run again...')\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-20T15:37:39.259626200Z",
"start_time": "2023-05-20T15:37:39.231404100Z"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Here your text : \n",
"Fetching File\n",
"Converting audio transcripts into text ...\n",
"ගොවියා ගේ ප්‍රයෝජනය පිණිස අල්ප ව්‍යායාම් එකතු කුඹුර නොකරන්නේය\n"
]
}
],
"source": [
"print('Here your text : ')\n",
"# calling startConvertion method to start process\n",
"startConvertion(r\"../../resources/datasets/IT20167264/test_audio/pn_sin_01_00003.wav\", 'si-LK')"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-20T15:42:33.788572600Z",
"start_time": "2023-05-20T15:42:29.682423300Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2023-05-20T15:45:27.459402100Z",
"start_time": "2023-05-20T15:45:27.348747700Z"
}
},
"outputs": [],
"source": [
"import speech_recognition as sr"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"def startConversion():\n",
" # Initialize recognizer class (for recognizing the speech)\n",
" recognizer = sr.Recognizer()\n",
"\n",
" # Read the audio file path from user input\n",
" audio_file = input(\"Enter the path to the audio file: \")\n",
"\n",
" # Perform speech recognition\n",
" with sr.AudioFile(audio_file) as source:\n",
" print('Fetching File')\n",
" audio_text = recognizer.listen(source)\n",
" try:\n",
" print('Converting audio transcripts into text ...')\n",
" text = recognizer.recognize_google(audio_text, language='si-LK')\n",
" print(text)\n",
" except sr.UnknownValueError:\n",
" print('Speech recognition could not understand audio')\n",
" except sr.RequestError as e:\n",
" print(f'Error occurred during speech recognition: {e}')\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-20T15:45:58.749059400Z",
"start_time": "2023-05-20T15:45:58.737592100Z"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Speech-to-Text Conversion\n",
"Fetching File\n",
"Converting audio transcripts into text ...\n",
"ගොවියා ගේ ප්‍රයෝජනය පිණිස අල්ප ව්‍යායාම් එකතු කුඹුර නොකරන්නේය\n"
]
}
],
"source": [
"# Prompt the user to start the conversion\n",
"print('Speech-to-Text Conversion')\n",
"startConversion()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-20T15:48:39.991168500Z",
"start_time": "2023-05-20T15:48:30.395209400Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2023-05-20T17:17:27.353830200Z",
"start_time": "2023-05-20T17:17:27.329849800Z"
}
},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"def conjugate_sentence(sentence):\n",
" # Define the pattern to match Sinhala verbs\n",
" verb_pattern = r\"([\\u0D80-\\u0DFF]+)\"\n",
"\n",
" # Check if the sentence starts with \"මම\"\n",
" if sentence.startswith(\"මම\"):\n",
" # Find all verbs in the sentence\n",
" verbs = re.findall(verb_pattern, sentence)\n",
"\n",
" # Check if there is at least one verb in the sentence\n",
" if verbs:\n",
" # Get the last verb in the sentence\n",
" last_verb = verbs[-1]\n",
"\n",
" # Extract the verb stem\n",
" verb_stem = last_verb[:2]\n",
"\n",
" # Conjugate the verb stem with \"මි\"\n",
" conjugated_verb = verb_stem + \"මි\"\n",
"\n",
" # Remove the last word (verb) from the sentence\n",
" words = sentence.split()\n",
" words.pop()\n",
"\n",
" # Append the conjugated verb to the sentence\n",
" words.append(conjugated_verb)\n",
"\n",
" # Reconstruct the sentence\n",
" conjugated_sentence = \" \".join(words)\n",
"\n",
" return conjugated_sentence\n",
"\n",
" return sentence"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-20T17:24:31.851227400Z",
"start_time": "2023-05-20T17:24:31.835226600Z"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"මම ඉතා හොදින් පීමි\n"
]
}
],
"source": [
"# Example usage\n",
"sentence = \"මම ඉතා හොදින් පීනනවා\"\n",
"conjugated_sentence = conjugate_sentence(sentence)\n",
"print(conjugated_sentence) # Output: මම සෙල්ලම් කරමි"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-20T17:24:34.100069200Z",
"start_time": "2023-05-20T17:24:34.074817400Z"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Sinhala letters: 30\n"
]
}
],
"source": [
"import regex\n",
"\n",
"def count_sinhala_letters(text):\n",
" # Define the pattern for Sinhala letters using Unicode properties\n",
" pattern = regex.compile(r'\\p{Script=Sinhala}')\n",
"\n",
" # Find all matches of Sinhala letters in the text\n",
" matches = pattern.findall(text)\n",
"\n",
" # Return the count of Sinhala letters\n",
" return len(matches)\n",
"\n",
"\n",
"# Example usage\n",
"text = \"මම සිංහල බස්නාහිර භාෂාවෙන් කතා කරයි\"\n",
"letter_count = count_sinhala_letters(text)\n",
"print(\"Number of Sinhala letters:\", letter_count)\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-20T17:37:09.749693300Z",
"start_time": "2023-05-20T17:37:09.695584900Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2023-08-08T23:13:59.733165Z",
"start_time": "2023-08-08T23:13:54.502417700Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction: [0]\n"
]
}
],
"source": [
"import pickle\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"# Load the trained model\n",
"with open('../../resources/pickels/svm_singular_plural.pkl', 'rb') as file:\n",
" saved_data = pickle.load(file)\n",
"\n",
"svm_model = saved_data['model']\n",
"vectorizer = saved_data['vectorizer']\n",
"\n",
"# Prepare input data\n",
"input_data = [\"සමනලයා\"] # Example input data as a list\n",
"\n",
"# Transform the input data features using the same vectorizer\n",
"input_data_features = vectorizer.transform(input_data)\n",
"\n",
"# Pass input data to the model for prediction\n",
"prediction = svm_model.predict(input_data_features)\n",
"\n",
"# Use the model's output\n",
"print(f\"Prediction: {prediction}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-08-08T23:13:59.754162300Z",
"start_time": "2023-08-08T23:13:59.733165Z"
}
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-08-08T23:13:59.828562600Z",
"start_time": "2023-08-08T23:13:59.758670400Z"
}
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-08-08T23:13:59.829579200Z",
"start_time": "2023-08-08T23:13:59.773617700Z"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "sinlingua_test"
version = "0.0.4"
authors = [
{ name="Supun Gurusinghe", email="supunsameeran@gmail.com" },
{ name="Sandaruwini Galappaththi", email="sandaruwinigalappaththi@gmail.com" },
{ name="Supun Sarada Wijesinghe", email="saradawijesinghe@gmail.com" },
{ name="Binura Yasodya", email="binurayasodya24@gmail.com" },
]
description = "Package for Sinhala data processing"
readme = "README.md"
requires-python = ">=3.7"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
[project.urls]
"Homepage" = "https://github.com/SupunGurusinghe/SinlinguaDocumentation/blob/main/README.md"
[tool.poetry.dependencies]
python = "^3.7"
chardet = "^3.0.4"
click = "^8.1.7"
colorama = "0.4.6"
gensim = "4.3.1"
joblib = "1.2.0"
nltk = "3.8.1"
numpy = "^1.22.4"
regex = "^2022.10.31"
scipy = "^1.9.3"
tqdm = "^4.64.1"
urllib3 = "^1.26.12"
pip = "^21.2.4"
wheel = "^0.37.1"
cryptography = "^38.0.3"
py = "^1.10.0"
lxml = "^4.9.2"
future = "^0.18.2"
matplotlib = "^3.5.3"
pytest = "^6.2.4"
sklearn = "^0.0"
scikit-learn = "^1.1.3"
requests = "^2.27.1"
pyparsing = "^2.4.7"
keras = "^2.11.0"
ipywidgets = "^7.6.3"
ipython = "^7.26.0"
notebook = "^6.5.2"
hypothesis = "^6.56.4"
setuptools = "^56.0.0"
pytz = "^2022.5"
cffi = "^1.14.6"
mpmath = "^1.3.0"
sympy = "^1.12"
fasttext = "^0.9.2"
psutil = "^5.9.3"
boto3 = "^1.26.43"
botocore = "^1.29.43"
pandas = "^1.3.5"
pygtrie = "^2.5.0"
fuzzywuzzy = "^0.18.0"
asyncio = "^3.4.3"
Levenshtein = "^0.21.0"
idna = "^2.10"
multidict = "^6.0.4"
attrs = "^21.2.0"
openai = "^0.27.2"
aiohttp = "^3.8.3"
plotly = "^5.5.0"
tenacity = "^8.1.0"
yarl = "^1.8.2"
aiosignal = "^1.3.1"
frozenlist = "^1.3.3"
python-multipart = "^0.0.6"
certifi = "^2021.5.30"
simplejson = "^3.18.1"
sinling = "^0.3.6"
hpack = "^3.0.0"
hyperframe = "^5.2.0"
h2 = "^3.2.0"
h11 = "^0.9.0"
hstspreload = "^2023.1.1"
httpcore = "^0.9.1"
rfc3986 = "^1.5.0"
sniffio = "^1.3.0"
httpx = "^0.13.3"
googletrans = "^3.0.0"
torch = "^2.0.1"
transformers = "^4.29.1"
\ No newline at end of file
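With this metadata, the distribution can be built with a PEP 517 frontend such as `python -m build` and installed with `pip install dist/sinlingua_test-0.0.4-py3-none-any.whl` (the wheel file name is illustrative). Note that the `[tool.poetry.dependencies]` table is read only by Poetry; the declared setuptools backend ignores it, so those version pins take effect only under a Poetry workflow.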
කෑවා
නෑවා
හෑවා
ආවා
ගියා
බිව්වා
කැඩුවා
නැටුවා
සේදුවා
හේදුවා
මැරුවා
පැගුවා
හැකිලුවා
දිව්වා
කැපුවා
කලා
කෙරුවා
කිව්වා
කැවුනා
දැනුනා
බැලුවා
නිවුනා
පියෑඹුවා
පිලිස්සුවා
හිනැහුනා
ගත්තා
උනා
වුනා
පෑව්වා
හැපුවා
පීරුවා
නැරඹුවා
ගැයුවා
වැයුවා
දැමුවා
කෙරුවා
වැන්දා
නැමුනා
පිම්බා
නැග්ගා
බැස්සා
නැග්ගුවා
ගෑවා
රිංගුවා
පීනුවා
හෑරුවා
හිතුවා
සිතුවා
ඇන්දා
පැලදුවා
හැදුවා
මැසුවා
ගෙතුවා
ඉව්වා
ඇරියා
වැහුවා
පැන්නා
මැහුවා
\ No newline at end of file
from resources.removables.core.tokenizer import *
__all__ = [
'Stemmer'
]
class Stemmer:
def __init__(self):
pass
def stem(self, text):
raise NotImplementedError
\ No newline at end of file
from typing import Text, List
__all__ = [
'StopRemover'
]
# noinspection SpellCheckingInspection
class StopRemover:
def stop_word_remove(self, sentence: Text) -> List[Text]:
raise NotImplementedError()
\ No newline at end of file
from typing import Text, List
__all__ = [
'Tokenizer'
]
# noinspection SpellCheckingInspection
class Tokenizer:
def tokenize(self, sentence: Text) -> List[Text]:
raise NotImplementedError()
\ No newline at end of file
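For orientation, a minimal concrete implementation of this interface could look like the sketch below; WhitespaceTokenizer is a hypothetical name and plain whitespace splitting is an assumption, not necessarily the package's actual algorithm:
class WhitespaceTokenizer(Tokenizer):
    def tokenize(self, sentence: Text) -> List[Text]:
        # Naive whitespace split; real Sinhala tokenization may also need
        # script-aware handling of punctuation and zero-width joiners
        return sentence.split()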
from os.path import dirname, abspath, join
PROJECT_PATH = join(dirname(abspath(__file__)), '..')
RESOURCE_PATH = join(PROJECT_PATH, 'sinhala_data_processor', 'resources')
\ No newline at end of file
from sinhala_data_processor.preprocessor.tokenizer import *
\ No newline at end of file
ගුරුවරු
ළමයි
ශිෂ්‍යයෝ
මව්වරු
පියවරු
දෙමව්පියෝ
සත්තු
දොස්තරවරු
ගායකයො
ප්‍රේක්ශකයො
නලුවො
සන්ගීතකාරයො
මනුස්ස්‍යො
මනුශ්‍යයෝ
වදුරෝ
ගොවියෝ
රජවරු
නායකයෝ
මිනිස්සු
සාමාජිකයෝ
සමූහයෝ
දරුවෝ
පාත්තයෝ
කුරුල්ලෝ
බල්ලෝ
පටව්
බබාලා
මීයෝ
පූසෝ
මාළු
දෙවියෝ
වැද්දෝ
කපුටෝ
දිම්යෝ
කඩියෝ
අලි
යාලුවො
යාළුවො
කැල
සමූහය
රැල
රන්චුව
පෙල
රචකයො
ලේඛකයෝ
ඇදුරෝ
කථිකාචාර්යවරු
නිලියෝ
නිළයෝ
ඉන්ජිනේරුවරු
කොන්දොස්තරවරු
නිලධාරියෝ
සර්පයෝ
කතුවරු
මනාලියෝ
මනාලයෝ
කුමාරියෝ
කුමාරයෝ
කැරපොත්තො
වැඩිහිටියෝ
කොල්ලො
කෙල්ලො
යක්කු
හිමිවරු
ශ්‍රමිකයෝ
කම්කරුවෝ
ශ්‍රමනයෝ
ශ්‍රමණයෝ
ගණිකාවො
කාන්තාවෝ
පිරිමි
අය
නාටිකාන්ගනාවෝ
අන්ගනාවො
කසකරුවෝ
ගිනිබෝලකරුවෝ
විදේශිකයෝ
පෙම්වත්තු
පෙම්වතියෝ
ප්‍රේමවන්තයෝ
ධීවරයෝ
පෙදරේරුවෝ
කාර්මිකයෝ
තරඟකරුවෝ
ජයග්‍රාහකයො
පරාජිකයෝ
ක්‍රීඩකයෝ
දායකයෝ
පාලකයෝ
දේශපාලකයෝ
සිරකරුවෝ
නිලදාරීවරු
වරු
පිරිස
පාහරයෝ
ආක්‍රමණිකයෝ
ද්‍රෝහියෝ
සේවකයෝ
සහෝදරියෝ
සහෝදරයෝ
අම්මලා
තාත්තලා
කුරුමිනියෝ
\ No newline at end of file
import pkg_resources
PROJECT_PATH = pkg_resources.resource_filename('sinlingua', '')
RESOURCE_PATH = pkg_resources.resource_filename('sinlingua', 'resources')
\ No newline at end of file
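pkg_resources works here but has been deprecated in recent setuptools releases; an equivalent lookup with the standard library, assuming Python 3.9+, would be:
from importlib.resources import files

PROJECT_PATH = str(files('sinlingua'))
RESOURCE_PATH = str(files('sinlingua') / 'resources')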
@@ -4,7 +4,7 @@ import openai
import json
import time
import requests
from sinhala_data_processor.config import RESOURCE_PATH
from sinlingua.config import RESOURCE_PATH
class LLMConfig:
......
from sinhala_data_processor.grammar_rule.grammar_rules import GrammarRules
from sinhala_data_processor.grammar_rule.rule_based_1 import FirstPerson
from sinhala_data_processor.grammar_rule.rule_based_2 import SecondPersonSingular
from sinhala_data_processor.grammar_rule.rule_based_3 import SecondPersonPlural
from sinhala_data_processor.grammar_rule.rule_based_4 import FourthPerson
from sinhala_data_processor.grammar_rule.rule_based_future_1 import FirstPersonFuture
from sinhala_data_processor.grammar_rule.rule_based_plural import PluralSubject
from sinhala_data_processor.grammar_rule.rule_based_plural_past import PluralSubjectPast
from sinhala_data_processor.grammar_rule.rule_based_singular import SingularSubject
from sinhala_data_processor.grammar_rule.rule_based_past_1 import PastFirstPerson
from sinhala_data_processor.grammar_rule.rule_based_past_2 import PastSecondPersonSingular
from sinhala_data_processor.grammar_rule.rule_based_past_3 import PastSecondPersonPlural
from sinhala_data_processor.grammar_rule.mask import PredictNoun
from sinlingua.grammar_rule.grammar_rules import GrammarRules
from sinlingua.grammar_rule.rule_based_1 import FirstPerson
from sinlingua.grammar_rule.rule_based_2 import SecondPersonSingular
from sinlingua.grammar_rule.rule_based_3 import SecondPersonPlural
from sinlingua.grammar_rule.rule_based_4 import FourthPerson
from sinlingua.grammar_rule.rule_based_future_1 import FirstPersonFuture
from sinlingua.grammar_rule.rule_based_plural import PluralSubject
from sinlingua.grammar_rule.rule_based_plural_past import PluralSubjectPast
from sinlingua.grammar_rule.rule_based_singular import SingularSubject
from sinlingua.grammar_rule.rule_based_past_1 import PastFirstPerson
from sinlingua.grammar_rule.rule_based_past_2 import PastSecondPersonSingular
from sinlingua.grammar_rule.rule_based_past_3 import PastSecondPersonPlural
from sinlingua.grammar_rule.mask import PredictNoun
class GrammarMain:
......