Commit 49a6cf2a authored by Perera M.A.L.L.P IT20120320's avatar Perera M.A.L.L.P IT20120320

Merge branch 'master' of http://gitlab.sliit.lk/2023-165/2023-165 into...

Merge branch 'master' of http://gitlab.sliit.lk/2023-165/2023-165 into feature/water-consumption-management
parents 02f7b4d2 82344197
{
"ExpandedNodes": [
""
],
"PreviewInSolutionExplorer": false
}
\ No newline at end of file
File added
Key Category
DIALOG Telecomunication
KEELS Grocery
RESTAURANT Dining out
PHARMACY Medical
TELECOM Telecomunication
RAFFLES Dining out
BURGER Dining out
PETRO Gasoline
EATS Dining out
CARGILLS Grocery
LAUGFS Grocery
SEN SAAL Dining out
HUTCHISON Telecomunication
SPOTIFY Entertainment
ARPICO Grocery
AUTO Vehical
FOODS Dining out
SINGER Home Supplies
import fitz # PyMuPDF
import csv
from path import Path
import os
class PDFDataExtractor:
    """Extract text rows from ``pdf_files/Temp.pdf`` and write them to
    ``ExtractedCSV/Temp.csv``.

    Each page's text is split into lines, and each line is carved into
    ``num_columns`` fixed-width slices (widths are percentages of the
    line length).
    """

    def __init__(self, num_columns=1):
        # With the default of one 100%-wide column, each text line maps
        # to a single one-element CSV row.
        self.num_columns = num_columns
        self.column_widths = [100.0] * self.num_columns
        self.script_directory = os.path.dirname(os.path.abspath(__file__))
        self.output_path = Path(
            Path(self.script_directory) / "ExtractedCSV/Temp.csv")
        self.pdf_path = os.path.join(self.script_directory, 'pdf_files', 'Temp.pdf')

    def extract_data(self):
        """Return a list of rows, one per text line of the PDF.

        Each row is a list of ``num_columns`` stripped string slices.
        """
        print(self.pdf_path)  # trace which file is being processed
        doc = fitz.open(self.pdf_path)
        try:
            extracted_data = []
            for page_num in range(doc.page_count):
                page = doc[page_num]
                text = page.get_text("text")
                lines = text.split('\n')
                page_data = []
                for line in lines:
                    columns = []
                    start = 0
                    # Slice the line proportionally to each column width.
                    for width in self.column_widths:
                        end = start + int(len(line) * width / 100)
                        columns.append(line[start:end].strip())
                        start = end
                    if len(columns) == self.num_columns:
                        page_data.append(columns)
                extracted_data.extend(page_data)
        finally:
            # BUG FIX: close the document even when extraction raises,
            # so the underlying file handle is not leaked.
            doc.close()
        return extracted_data

    def save_to_csv(self):
        """Extract the PDF and write the rows to ``output_path`` as CSV."""
        data = self.extract_data()
        # BUG FIX: create the output directory if it does not exist yet;
        # the original open() failed on a fresh checkout.
        os.makedirs(os.path.dirname(str(self.output_path)), exist_ok=True)
        with open(self.output_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            for row in data:
                csv_writer.writerow(row)
if __name__ == "__main__":
    # BUG FIX: save_to_csv() writes the CSV as a side effect and returns
    # None, so the original `extracted_data = ...` captured nothing useful.
    PDFDataExtractor().save_to_csv()
import re
from decimal import Decimal
from PDFDataExtractor import PDFDataExtractor
import csv
from path import Path
import os
import datetime
from textblob import TextBlob
class TransactionExtractor:
    """Parse the raw extracted statement CSV (``ExtractedCSV/Temp.csv``)
    into transaction rows and statement summary values."""

    def __init__(self):
        self.script_directory = os.path.dirname(os.path.abspath(__file__))
        self.csvPath = Path(
            Path(self.script_directory) / "ExtractedCSV/Temp.csv")

    @staticmethod
    def clean_transaction(transaction):
        """Collapse newlines and runs of whitespace into single spaces."""
        transaction_remove_newline = transaction.strip().replace('\n', ' ')
        cleaned_transaction = re.sub(r'\s+', ' ', transaction_remove_newline)
        return cleaned_transaction

    @staticmethod
    def direction(amount, balance):
        """Return *amount* as a Decimal, negated when the row is a credit.

        ``balance == 'CR'`` marks credits (negative); anything else is
        treated as a debit and stays positive.
        """
        num_amount = Decimal(amount)
        if balance == 'CR':
            num_amount = num_amount * -1
        return num_amount

    def save_to_csv(self, rows=None):
        """Write *rows* back to ``csvPath``.

        BUG FIX: the original always read ``self.rawdata``, an attribute
        that is never assigned anywhere in this class, so the method could
        only raise AttributeError. Rows may now be passed explicitly; the
        legacy attribute is still honoured when *rows* is omitted.
        """
        if rows is None:
            rows = self.rawdata  # legacy path: attribute set by a caller
        output_path = self.csvPath
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            for row in rows:
                csv_writer.writerow(row)

    def extract_data(self):
        """Return ``(transactions, summary)`` parsed from the CSV text.

        ``transactions`` is a list of ``[date1, date2, description,
        signed_amount]`` rows; ``summary`` maps statement field names to
        their extracted string values.
        """
        with open(self.csvPath, 'r', encoding='utf-8') as file:
            data = file.read()
        data_without_quotes = data.replace('"', '')
        # Two dates, a free-form description, an amount, then either a
        # newline or a two-letter direction marker (e.g. CR).
        tran_pattern = r'(\d{2} [A-Za-z]{3})\n+(\d{2} [A-Za-z]{3})+\n+([\s\S]*?)\n+([\d,]+\.\d{2})+(\n|[A-Z]{2})'
        tran_matches = re.findall(tran_pattern, data_without_quotes)
        tran_matches_extracted_data = [
            [date1, date2, self.clean_transaction(transaction), self.direction(amount.replace(',', ''), balance)]
            for date1, date2, transaction, amount, balance in tran_matches
        ]
        extraction_patterns = {
            'username': r'Colombo\.\n\D+Statement',
            'Pre_Statement_Bal': r'Previous Statement Balance\s*([\d,.]+)',
            'Pay_Cre_Bal': r'Payments & Credits\s*([\d,.]+)',
            'Pay_Deb_Bal': r'Purchases & Debits\s*([\d,.]+)',
            'Tol_ACC_Bal': r'Total Account Balance\s*([\d,.]+)',
            'payment_due_date': r'Payment Due Date\s*([\d]{1,2} [A-Za-z]{3} [\d]{4})',
            'current_due': r'Current Due\s*([\d,.]+)',
            'overlimit_due': r'Overlimit Due Amount\s*([\d,.]+)',
            'minimum_payment': r'Minimum Payment\s*([\d,.]+)',
            'credit_limit': r'Credit Limit \(LKR\)\s*([\d,.]+)',
            'cash_limit': r'Cash Limit \(LKR\)\s*([\d,.]+)',
            'available_credit': r'Available Credit \(LKR\)\s*([\d,.]+)',
            'annual_interest_rate': r'Annual Interest Rate\s+Purchase\s+([\d.]+%)',
            'cash_advance_rate': r'Cash Advance\s+([\d.]+%)'
        }
        extracted_values = {}
        for key, pattern in extraction_patterns.items():
            matches = re.findall(pattern, data_without_quotes)
            if matches:
                patternValue = matches[0].replace(',', '')
                # The 'username' pattern has no capture group, so the whole
                # match comes back; strip its surrounding boilerplate.
                patternValue = patternValue.replace('Colombo.\n','')
                patternValue = patternValue.replace('\nStatement','')
                extracted_values[key] = patternValue
        return tran_matches_extracted_data, extracted_values
class TransactionCategorizer:
    """Assign a spending category to each extracted transaction using the
    keyword -> category map in ``Category.csv``."""

    def __init__(self):
        self.script_directory = os.path.dirname(os.path.abspath(__file__))

    def categorize_transactions(self):
        """Return ``(categorized_transactions, extracted_data, category_sum)``.

        Each transaction row gains a fifth element — its category — and
        ``category_sum`` maps each category to the Decimal total of its
        transaction amounts.
        """
        categories = {}
        # BUG FIX: the original opened 'Category.csv' relative to the
        # current working directory even though script_directory was
        # computed for exactly this purpose; resolve it next to the script.
        category_path = os.path.join(self.script_directory, 'Category.csv')
        with open(category_path, 'r') as csvfile:
            csvreader = csv.reader(csvfile, delimiter='\t')  # tab-separated file
            next(csvreader)  # skip the "Key Category" header row
            for row in csvreader:
                keyword = row[0].lower()  # match case-insensitively
                category = row[1]
                categories.setdefault(category, []).append(keyword)

        tran_matches_extracted_data, extracted_data = TransactionExtractor().extract_data()
        categorized_transactions = []
        category_sum = {}  # running Decimal total per category
        for extracted_row in tran_matches_extracted_data:
            transaction = extracted_row[2]
            amount = extracted_row[3]
            # NOTE(review): `keyword in blob` on a TextBlob appears to act
            # as a substring test on the lowercased text — a plain
            # `keyword in transaction.lower()` would likely behave the
            # same; confirm TextBlob is actually needed here.
            transaction_blob = TextBlob(transaction.lower())
            predicted_category = "Other"  # fallback when no keyword matches
            for category_label, keywords in categories.items():
                if any(keyword in transaction_blob for keyword in keywords):
                    predicted_category = category_label
                    break
            extracted_row.append(predicted_category)
            categorized_transactions.append(extracted_row)
            if predicted_category in category_sum:
                category_sum[predicted_category] += Decimal(amount)
            else:
                category_sum[predicted_category] = Decimal(amount)
        return categorized_transactions, extracted_data, category_sum
if __name__ == "__main__":
    transaction_categorizer = TransactionCategorizer()
    categorized_transactions, extracted_data, category_sum = transaction_categorizer.categorize_transactions()
    # BUG FIX: a statement with no 'Medical' transactions made the original
    # subscript raise KeyError; default to zero instead.
    print(category_sum.get('Medical', Decimal('0')))
\ No newline at end of file
from flask import Flask, request, jsonify
import os
from TransactionExtractor import TransactionCategorizer
from PDFDataExtractor import PDFDataExtractor
import shutil
app = Flask(__name__)

# Folder where the uploaded statement PDF is staged as Temp.pdf.
UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'pdf_files')
# makedirs(exist_ok=True) replaces the exists-then-create pair, which was
# racy and more verbose.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Folder holding the intermediate extracted CSV.
EXTRACTED_CSV_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ExtractedCSV')
os.makedirs(EXTRACTED_CSV_FOLDER, exist_ok=True)

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
@app.route('/extract_data', methods=['POST'])
def extract_data():
    """Accept a statement PDF upload, run extraction and categorisation,
    and return the categorised data as JSON.

    Expects a multipart form field named ``pdf``. The scratch folders are
    emptied after each request so the fixed Temp.* filenames start clean.
    """
    try:
        pdf_file = request.files['pdf']
        pdf_data = pdf_file.read()
        # Stage the upload under the fixed name the extractors expect.
        pdf_filename = os.path.join(app.config['UPLOAD_FOLDER'], 'Temp.pdf')
        with open(pdf_filename, 'wb') as pdf_output:
            pdf_output.write(pdf_data)

        # PDF -> raw CSV on disk, then CSV -> categorised transactions.
        PDFDataExtractor().save_to_csv()
        extracted_data = TransactionCategorizer().categorize_transactions()

        # Remove the per-request scratch files.
        for file in os.listdir(UPLOAD_FOLDER):
            os.remove(os.path.join(UPLOAD_FOLDER, file))
        for file in os.listdir(EXTRACTED_CSV_FOLDER):
            os.remove(os.path.join(EXTRACTED_CSV_FOLDER, file))

        # NOTE(review): the result tuple contains Decimal amounts — confirm
        # the installed Flask JSON provider can serialise Decimal.
        return jsonify(extracted_data)
    except Exception as e:
        # BUG FIX: the original returned the error payload with HTTP 200,
        # so clients could not detect failure; signal it explicitly.
        return jsonify({"error": str(e)}), 500
if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug debugger and reloader —
    # development only; never deploy with this flag set.
    app.run(debug=True)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment