File size: 2,870 Bytes
fb845b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import json
import requests
import pandas as pd
import re

# Load the pretrained classifier and its tokenizer
tokenizer = AutoTokenizer.from_pretrained('alfiinyang/txt2xl_classifier_model')
model = AutoModelForSequenceClassification.from_pretrained('alfiinyang/txt2xl_classifier_model')

# Download the JSON mapping from predicted class index (string key) to label.
url = "https://huggingface.co/alfiinyang/txt2xl_classifier/resolve/main/label_map.json"
# The timeout stops module import from hanging on a stalled connection, and
# raise_for_status() reports an HTTP failure directly instead of letting an
# error page reach json.loads and fail with a misleading decode error.
response = requests.get(url, timeout=30)
response.raise_for_status()
label_map = json.loads(response.text)

# Define a function to classify a new description
def classify_(description):
    """Return the predicted label for a single description.

    The text is tokenized with the module-level ``tokenizer`` (padded or
    truncated to 45 tokens), run through the module-level ``model`` with
    gradient tracking disabled, and the index of the largest logit is looked
    up in ``label_map``.

    Args:
        description: The text to classify.

    Returns:
        The ``label_map`` value stored under the predicted class index
        (the index is converted to a string before the lookup).

    Raises:
        KeyError: If the predicted index has no entry in ``label_map``.
    """
    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        encoding = tokenizer.encode_plus(
            description,
            add_special_tokens=True,
            max_length=45,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        outputs = model(input_ids, attention_mask=attention_mask)
        # torch.max over dim=1 returns (values, indices); only the index of
        # the winning class is needed.
        _, prediction = torch.max(outputs.logits, dim=1)
        # label_map is loaded from JSON, so its keys are strings.
        return label_map[str(prediction.item())]

# Compiled once at import time instead of on every call / every entry.
_DATE_RE = re.compile(r'\d{2}/\d{2}/\d{4}')
_DESC_RE = re.compile(r'([\w\s,()]+)')
_DESC_COST_RE = re.compile(r'([\w\s,()]+) - ([\d, ]+)')


def _parse_entry(entry):
    """Split one entry line into a ``(description, cost)`` pair of strings."""
    priced = _DESC_COST_RE.findall(entry)
    if priced:
        # "<description> - <amounts>": findall yields (description, amounts).
        return priced[0]

    found = _DESC_RE.findall(entry)
    # Fall back to the raw text when the line contains none of the characters
    # the description pattern accepts, rather than failing on an empty list.
    desc = found[0] if found else entry.strip()
    if desc.lower().strip().endswith('cash out'):
        digits = re.findall(r'\d+', entry)
        # The first number on the line gets '000' appended (presumably the
        # amount is written in thousands -- TODO confirm). A line with no
        # number at all gets a zero cost instead of raising IndexError.
        cost = digits[0] + '000' if digits else '0'
        return 'POS cash out', cost
    return desc, '0'


def txt2xl(input_text):
    """Convert free-text transaction notes into a categorised DataFrame.

    ``input_text`` is split into blocks on blank lines (``'\\n\\n'``). Each
    block must contain a ``DD/MM/YYYY`` date; the lines after the date are
    read as one entry each:

    * ``<description> - <amounts>`` becomes a row with that description and
      the amounts wrapped in ``=SUM(...)``.
    * A line whose description ends in ``cash out`` (case-insensitive) becomes
      a ``'POS cash out'`` row costing the first number on the line followed
      by ``000``.
    * Any other line becomes a row with cost ``0``.

    Blank blocks and blank entry lines are ignored.

    Args:
        input_text: The raw notes text.

    Returns:
        A ``pandas.DataFrame`` with columns ``DATE`` (``datetime.date``,
        parsed day-first), ``COMMENT``, ``CREDIT`` (filled only for
        ``'POS cash out'`` rows), ``DEBIT`` (filled for every other row),
        ``SOURCE`` (always empty) and ``CATEGORY`` (``classify_`` applied to
        ``COMMENT``).

    Raises:
        ValueError: If a non-blank block contains no ``DD/MM/YYYY`` date.
    """
    data = []

    for chunk in input_text.split('\n\n'):
        # Trailing newlines or extra blank lines produce empty chunks.
        if not chunk.strip():
            continue

        date_match = _DATE_RE.search(chunk)
        if date_match is None:
            raise ValueError(f'No DD/MM/YYYY date found in block: {chunk!r}')
        date = date_match.group()

        # Skip the date and the one character after it (normally the
        # newline). Slicing from the match end rather than from len(date)
        # stays correct when the date is not at the very start of the block.
        body = chunk[date_match.end() + 1:].strip()

        for entry in body.split('\n'):
            # A date with nothing under it, or a stray blank line.
            if not entry.strip():
                continue

            item, cost = _parse_entry(entry)
            # Stored as a spreadsheet formula string; a comma-separated list
            # of amounts is passed through to SUM unchanged.
            total_cost = '=SUM(' + cost + ')'
            if item == 'POS cash out':
                data.append([date, item, total_cost, '', '', ''])
            else:
                data.append([date, item, '', total_cost, '', ''])

    new_df = pd.DataFrame(data, columns=['DATE', 'COMMENT', 'CREDIT', 'DEBIT', 'SOURCE', 'CATEGORY'])
    new_df['DATE'] = pd.to_datetime(new_df.DATE, dayfirst=True)
    new_df['DATE'] = new_df.DATE.dt.date

    # Classify transactions from their description text.
    new_df['CATEGORY'] = new_df.COMMENT.map(classify_)

    return new_df