|
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
import torch |
|
import json |
|
import requests |
|
import pandas as pd |
|
import re |
|
|
|
|
|
# Tokenizer and fine-tuned sequence-classification model, loaded once at
# import time from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained('alfiinyang/txt2xl_classifier_model')
model = AutoModelForSequenceClassification.from_pretrained('alfiinyang/txt2xl_classifier_model')

# JSON mapping from a predicted class index (stored as a string key, see
# classify_) to its category label.
# NOTE(review): this is fetched from 'alfiinyang/txt2xl_classifier', a
# different repo id than the model above -- confirm that is intentional.
url = "https://huggingface.co/alfiinyang/txt2xl_classifier/resolve/main/label_map.json"

# A timeout keeps the import from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of letting an error
# page reach json.loads and fail with a misleading decode error.
response = requests.get(url, timeout=30)
response.raise_for_status()
label_map = json.loads(response.text)
|
|
|
|
|
def classify_(description):
    """Return the category label the classifier predicts for one description.

    The text is tokenised into a fixed window of 45 tokens (padded or
    truncated as needed), passed through the module-level ``model``, and the
    index of the highest logit is looked up in ``label_map``.
    """
    # Tokenisation involves no gradients, so it can live outside no_grad().
    encoded = tokenizer.encode_plus(
        description,
        add_special_tokens=True,
        max_length=45,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        ).logits

    # label_map keys are strings, so the winning class index is stringified.
    predicted_index = logits.argmax(dim=1).item()
    return label_map[str(predicted_index)]
|
|
|
def _split_entry(entry):
    """Split one entry line into a ``(description, cost)`` pair of strings."""
    # "<description> - <amount(s)>": the amount text is kept verbatim.
    priced = re.search(r'([\w\s,()]+) - ([\d, ]+)', entry)
    if priced:
        return priced.groups()

    described = re.search(r'([\w\s,()]+)', entry)
    if described is None:
        raise ValueError(f'cannot parse a description from entry line: {entry!r}')

    desc = described.group(1)
    if desc.lower().strip().endswith('cash out'):
        # Cash-out shorthand: the first run of digits on the line gets '000'
        # appended (i.e. it is read as thousands). With no digits at all,
        # fall back to '0' like any other entry without an amount.
        digits = re.search(r'\d+', entry)
        return 'POS cash out', (digits.group() + '000') if digits else '0'

    # No amount given for an ordinary entry.
    return desc, '0'


def txt2xl(input_text):
    """Convert free-text ledger notes into a spreadsheet-ready DataFrame.

    ``input_text`` is split into blocks on blank lines (``'\\n\\n'``). Each
    block must contain a ``DD/MM/YYYY`` date; the lines after it are entries,
    one per line:

    * ``"<description> - <amounts>"`` becomes a DEBIT row.
    * a line ending in ``"cash out"`` (no ``" - "`` amount) becomes a
      ``'POS cash out'`` CREDIT row.
    * any other line becomes a DEBIT row with cost ``'0'``.

    Amounts are emitted as ``'=SUM(<amounts>)'`` formula strings.

    Args:
        input_text: The raw notes text.

    Returns:
        A DataFrame with columns DATE (``datetime.date``), COMMENT, CREDIT,
        DEBIT, SOURCE (left empty) and CATEGORY (from ``classify_``).

    Raises:
        ValueError: If a non-blank block has no ``DD/MM/YYYY`` date, or an
            entry line has no parsable description.
    """
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}')
    rows = []

    for block in input_text.split('\n\n'):
        # Empty input or stray blank lines yield empty blocks; skip them
        # instead of crashing on the date lookup.
        if not block.strip():
            continue

        found = date_re.search(block)
        if found is None:
            raise ValueError(f'no DD/MM/YYYY date found in entry block: {block!r}')
        date = found.group()

        # Slice from where the date actually ends (plus the one separator
        # character after it), so leading whitespace or text before the date
        # cannot shift the cut into the date itself.
        remainder = block[found.end() + 1:]

        for entry in remainder.strip().split('\n'):
            # A date-only block or a blank line carries no entry.
            if not entry.strip():
                continue

            item, cost = _split_entry(entry)
            total_cost = '=SUM(' + cost + ')'
            if item == 'POS cash out':
                rows.append([date, item, total_cost, '', '', ''])
            else:
                rows.append([date, item, '', total_cost, '', ''])

    new_df = pd.DataFrame(
        rows,
        columns=['DATE', 'COMMENT', 'CREDIT', 'DEBIT', 'SOURCE', 'CATEGORY'],
    )
    # Dates are written day-first; keep only the calendar date.
    new_df['DATE'] = pd.to_datetime(new_df.DATE, dayfirst=True)
    new_df['DATE'] = new_df.DATE.dt.date

    new_df['CATEGORY'] = new_df.COMMENT.map(classify_)

    return new_df
|
|