# import libraries
import json
import re

import pandas as pd
import requests
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# load the model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained('alfiinyang/txt2xl_classifier_model')
model = AutoModelForSequenceClassification.from_pretrained('alfiinyang/txt2xl_classifier_model')

# fetch the JSON map used to translate a predicted class index into a label
url = "https://huggingface.co/alfiinyang/txt2xl_classifier/resolve/main/label_map.json"
# A timeout keeps the import from hanging forever; raise_for_status() stops an
# HTTP error page from being handed to the JSON parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
label_map = json.loads(response.text)

# Patterns used by txt2xl(), compiled once instead of on every call.
_DATE_RE = re.compile(r'\d{2}/\d{2}/\d{4}')
_PLAIN_ENTRY_RE = re.compile(r'([\w\s,()]+)')
_PRICED_ENTRY_RE = re.compile(r'([\w\s,()]+) - ([\d, ]+)')
_DIGITS_RE = re.compile(r'\d+')


# Define a function to classify a new description
def classify_(description):
    """Classify a description with the loaded sequence-classification model.

    Args:
        description: Text to classify. It is tokenized with padding and
            truncation to 45 tokens.

    Returns:
        The ``label_map`` entry whose key is the string form of the index of
        the highest logit.
    """
    with torch.no_grad():
        encoding = tokenizer.encode_plus(
            description,
            add_special_tokens=True,
            max_length=45,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        outputs = model(input_ids, attention_mask=attention_mask)
        _, prediction = torch.max(outputs.logits, dim=1)
    return label_map[str(prediction.item())]


def _parse_entry(entry):
    """Split one entry line into ``(description, cost)``.

    Returns ``None`` when the line is blank or contains nothing that the
    description pattern can match.
    """
    if not entry.strip():
        return None

    # "<description> - <amounts>" form: both parts come straight from the line.
    priced = _PRICED_ENTRY_RE.search(entry)
    if priced is not None:
        return priced.group(1), priced.group(2)

    plain = _PLAIN_ENTRY_RE.search(entry)
    if plain is None:
        return None

    desc = plain.group(1)
    if desc.lower().strip().endswith('cash out'):
        # The first run of digits gets '000' appended (the amount is
        # presumably written in thousands -- confirm with the data owner).
        # A cash-out line without digits falls back to '0' instead of crashing.
        digits = _DIGITS_RE.search(entry)
        cost = digits.group() + '000' if digits is not None else '0'
        return 'POS cash out', cost

    # No amount on the line.
    return desc, '0'


def txt2xl(input_text):
    """Convert dated free-text transaction notes into a classified DataFrame.

    The text is split into chunks on blank lines (``'\\n\\n'``). Each chunk is
    expected to contain a ``dd/mm/yyyy`` date followed by one entry per line,
    either ``<description> - <amounts>`` or a bare description. Chunks without
    a date and blank or unparseable entry lines are skipped.

    Args:
        input_text: The raw notes.

    Returns:
        A ``pandas.DataFrame`` with columns ``DATE``, ``COMMENT``, ``CREDIT``,
        ``DEBIT``, ``SOURCE`` and ``CATEGORY``. Each amount is written as an
        ``=SUM(...)`` formula string, placed in ``CREDIT`` for
        ``'POS cash out'`` rows and in ``DEBIT`` otherwise. ``CATEGORY`` is
        filled by ``classify_`` applied to ``COMMENT``.
    """
    data = []

    # Extract entries by date
    for chunk in input_text.split('\n\n'):
        date_match = _DATE_RE.search(chunk)
        if date_match is None:
            # e.g. an empty chunk produced by trailing blank lines
            continue
        date = date_match.group()

        # Slice from where the date actually ends (plus the one separator
        # character the notes put after it), not from an assumed column 0.
        body = chunk[date_match.end() + 1:].strip()

        for entry in body.split('\n'):
            parsed = _parse_entry(entry)
            if parsed is None:
                continue
            item, cost = parsed

            total_cost = '=SUM(' + cost + ')'
            if item == 'POS cash out':
                data.append([date, item, total_cost, '', '', ''])
            else:
                data.append([date, item, '', total_cost, '', ''])

    new_df = pd.DataFrame(
        data,
        columns=['DATE', 'COMMENT', 'CREDIT', 'DEBIT', 'SOURCE', 'CATEGORY'],
    )
    new_df['DATE'] = pd.to_datetime(new_df.DATE, dayfirst=True)
    new_df['DATE'] = new_df.DATE.dt.date

    # Classify Transactions
    new_df['CATEGORY'] = new_df.COMMENT.map(classify_)

    return new_df