File size: 2,870 Bytes
fb845b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import json
import requests
import pandas as pd
import re
# Load the classifier model and its tokenizer.
# Both are resolved by repo id through `from_pretrained` when this module is imported.
tokenizer = AutoTokenizer.from_pretrained('alfiinyang/txt2xl_classifier_model')
model = AutoModelForSequenceClassification.from_pretrained('alfiinyang/txt2xl_classifier_model')

# Fetch the JSON mapping that `classify_` indexes with the predicted class
# index (as a string key) to obtain the category label.
# NOTE(review): the label map comes from the 'txt2xl_classifier' repo while the
# model comes from 'txt2xl_classifier_model' - confirm the two are meant to differ.
url = "https://huggingface.co/alfiinyang/txt2xl_classifier/resolve/main/label_map.json"
# A timeout stops a stalled connection from hanging the import indefinitely.
response = requests.get(url, timeout=30)
# Fail with a clear HTTP error instead of trying to JSON-decode an error page.
response.raise_for_status()
label_map = json.loads(response.text)
# Define a function to classify a new description
def classify_(description: str):
    """Return the predicted category label for one transaction description.

    The text is tokenized with the module-level ``tokenizer``, scored by the
    module-level ``model``, and the highest-scoring class index is looked up
    in the module-level ``label_map``.

    Args:
        description: Text of the transaction to classify.

    Returns:
        The ``label_map`` entry whose key is the predicted class index
        rendered as a string.

    Raises:
        KeyError: If the predicted index has no entry in ``label_map``.
    """
    # Calling the tokenizer directly replaces the deprecated ``encode_plus``;
    # the arguments are unchanged: the text is padded/truncated to exactly
    # 45 tokens and returned as PyTorch tensors.
    encoding = tokenizer(
        description,
        add_special_tokens=True,
        max_length=45,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        outputs = model(
            encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
        )

    # One description is encoded per call, so there is a single row of logits
    # and ``.item()`` can unwrap the winning class index.
    prediction = torch.argmax(outputs.logits, dim=1)
    return label_map[str(prediction.item())]
# Patterns used to pick apart the free-text ledger.
_DATE_RE = re.compile(r'\d{2}/\d{2}/\d{4}')
_DESC_RE = re.compile(r'([\w\s,()]+)')
_DESC_COST_RE = re.compile(r'([\w\s,()]+) - ([\d, ]+)')
_COLUMNS = ['DATE', 'COMMENT', 'CREDIT', 'DEBIT', 'SOURCE', 'CATEGORY']


def _parse_entry(entry):
    """Split one transaction line into a ``(description, cost)`` pair of strings."""
    priced = _DESC_COST_RE.search(entry)
    if priced:
        desc, cost = priced.groups()
        # An all-blank amount would otherwise produce an empty ``=SUM()``.
        return desc.strip(), cost.strip() or '0'

    found = _DESC_RE.search(entry)
    # Fall back to the raw text when the line holds none of the allowed characters.
    desc = found.group(1).strip() if found else entry.strip()
    if desc.lower().endswith('cash out'):
        # Cash-out amounts are written in thousands: the first number found
        # gets '000' appended. Without any number the cost defaults to '0'.
        digits = re.search(r'\d+', entry)
        return 'POS cash out', (digits.group() + '000') if digits else '0'
    return desc, '0'


def txt2xl(input_text: str) -> pd.DataFrame:
    """Convert a free-text transaction log into a spreadsheet-ready DataFrame.

    ``input_text`` is split into blocks on blank lines. Each block must
    contain a ``DD/MM/YYYY`` date; the text after the date holds one entry
    per line:

    * ``<description> - <amounts>`` becomes a DEBIT row with the formula
      ``=SUM(<amounts>)``.
    * A line ending in ``cash out`` (case-insensitive) becomes a CREDIT row
      described as ``POS cash out``; its amount is the first number on the
      line with ``000`` appended.
    * Any other line becomes a DEBIT row with ``=SUM(0)``.

    Blank blocks and blank entry lines are ignored, Windows line endings are
    accepted, and descriptions/amounts are stripped of surrounding whitespace.
    Text that precedes the date inside a block is discarded.

    Args:
        input_text: The raw ledger text.

    Returns:
        A DataFrame with columns DATE, COMMENT, CREDIT, DEBIT, SOURCE and
        CATEGORY. DATE holds ``datetime.date`` values parsed day-first,
        SOURCE is left blank, and CATEGORY is filled by ``classify_``.
        When the text contains no entries, an empty frame with the same
        columns is returned and no classification is attempted.

    Raises:
        ValueError: If a non-blank block contains no ``DD/MM/YYYY`` date.
    """
    rows = []
    # Normalise CRLF so the blank-line split also works on Windows-style text.
    text = (input_text or '').replace('\r\n', '\n')

    for chunk in text.split('\n\n'):
        if not chunk.strip():
            continue

        date_match = _DATE_RE.search(chunk)
        if date_match is None:
            raise ValueError(
                f"no DD/MM/YYYY date found in block: {chunk.strip()!r}"
            )
        date = date_match.group()

        # Slice from the end of the matched date (plus its one separator
        # character) rather than assuming the date starts at column 0.
        body = chunk[date_match.end() + 1:]
        for entry in body.split('\n'):
            if not entry.strip():
                continue
            desc, cost = _parse_entry(entry)
            total_cost = '=SUM(' + cost + ')'
            if desc == 'POS cash out':
                # Cash-outs are money coming in: record under CREDIT.
                rows.append([date, desc, total_cost, '', '', ''])
            else:
                rows.append([date, desc, '', total_cost, '', ''])

    new_df = pd.DataFrame(rows, columns=_COLUMNS)
    if new_df.empty:
        # Nothing to convert or classify.
        return new_df

    new_df['DATE'] = pd.to_datetime(new_df.DATE, dayfirst=True)
    new_df['DATE'] = new_df.DATE.dt.date

    # Classify transactions
    new_df['CATEGORY'] = new_df.COMMENT.map(classify_)
    return new_df
|