File size: 3,612 Bytes
97beac5
4be804d
 
70f757d
4be804d
97beac5
4be804d
 
 
 
aff4412
4be804d
 
 
 
 
 
c344f79
4be804d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c344f79
2b6b5fd
26e951a
 
0314595
 
 
 
 
 
 
 
 
c344f79
 
0314595
0c7ed46
 
0314595
 
 
 
 
0c7ed46
 
0314595
 
a02613f
8cd9729
 
529aa95
4be804d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import streamlit as st
import torch
import pandas as pd
from io import StringIO
from transformers import AutoTokenizer,  AutoModelForSeq2SeqLM

class preProcess:
    """Read a CSV file, validate its column count, and linearize its
    contents into the `labels ... values ...` text format expected by
    the chart-to-text model.
    """

    def __init__(self, filename, titlename):
        # filename may be a path or a file-like object (e.g. a Streamlit upload).
        self.filename = filename
        # Newline keeps the title visually separated when joined with the data.
        self.title = titlename + '\n'

    def read_data(self):
        """Load the CSV into a pandas DataFrame and return it."""
        return pd.read_csv(self.filename)

    def check_columns(self, df):
        """Return True when df has 1-4 columns; otherwise display a
        Streamlit error and return False."""
        if len(df.columns) > 4:
            # Message fixed to match the actual check (> 4) and the app's
            # stated limit of four columns; also corrects the spelling.
            st.error('File has more than 4 columns.')
            return False
        if len(df.columns) == 0:
            st.error('File has no column.')
            return False
        return True

    def format_data(self, df):
        """Linearize the table row-wise:
        ' labels col1 - col2 values v11 v12 , v21 v22 ...'
        """
        # Collect each column's values, then transpose to iterate rows.
        columns = [list(df[col]) for col in df.columns]
        rows = [' '.join(map(str, cells)) for cells in zip(*columns)]
        return ' labels ' + ' - '.join(list(df.columns)) + ' values ' + ' , '.join(rows)

    def combine_title_data(self, df):
        """Return the chart title followed by the linearized data."""
        return ' '.join([self.title, self.format_data(df)])
      
class Model:
    """Wrapper around a pretrained T5 chart-to-text checkpoint.

    mode selects the checkpoint: 'simple' -> saadob12/t5_C2T_big,
    'analytical' -> saadob12/t5_C2T_autochart (case-insensitive).

    Raises:
        ValueError: if mode is neither 'simple' nor 'analytical'.
    """

    def __init__(self, text, mode):
        self.padding = 'max_length'
        self.truncation = True
        self.prefix = 'C2T: '
        # Use the first GPU when available, otherwise fall back to CPU.
        # (Original had a stray chained assignment `device = ...` here.)
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.text = text
        mode = mode.lower()
        if mode == 'simple':
            checkpoint = 'saadob12/t5_C2T_big'
        elif mode == 'analytical':
            checkpoint = 'saadob12/t5_C2T_autochart'
        else:
            # Fail fast: the original silently left tokenizer/model unset,
            # which surfaced later as a confusing AttributeError in generate().
            raise ValueError("mode must be 'simple' or 'analytical', got: " + mode)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(self.device)

    def generate(self):
        """Tokenize prefix + text, beam-search decode, and return the
        cleaned summary string."""
        tokens = self.tokenizer.encode(
            self.prefix + self.text,
            truncation=self.truncation,
            padding=self.padding,
            return_tensors='pt',
        ).to(self.device)
        generated = self.model.generate(tokens, num_beams=4, max_length=256)
        tgt_text = self.tokenizer.decode(
            generated[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        # Strip stray bracket/quote artifacts from the decoded string.
        return str(tgt_text).strip('[]""')
      
      
# --- Streamlit front end: collect inputs, preprocess the CSV, run the model ---
st.write('This application generates a summary of a datafile (.csv). Right now, it only generates summaries of files with maximum of four columns. If the file contains more than four columns, the app will throw an error.')

summary_kind = st.selectbox('What kind of summary do you want?', ('Simple', 'Analytical'))
st.write('You selected: ' + summary_kind + ' summary.')

chart_title = st.text_input('Title of the .csv file', 'State minimum wage rates in the United States as of January 1 , 2020 , by state ( in U.S. dollars )')
st.write('Title of the file is: ' + chart_title)

csv_upload = st.file_uploader("Upload only .csv file")

# Proceed only once every input widget has produced a value.
if csv_upload is not None and summary_kind is not None and chart_title is not None:
    st.write('Preprocessing file...')
    preprocessor = preProcess(csv_upload, chart_title)
    frame = preprocessor.read_data()
    if preprocessor.check_columns(frame):
        st.write('Your file contents:\n')
        st.write(frame)

        # Show the linearized text the model will receive.
        linearized = preprocessor.combine_title_data(frame)
        st.write('Linearized input format of the data file:\n ')
        st.markdown('**' + linearized + '**')

        st.write('Loading model...')
        generator = Model(linearized, summary_kind)
        st.write('Model loading done!\nGenerating Summary...')
        summary_text = generator.generate()
        st.write('Generated Summary:\n')
        st.markdown('**' + summary_text + '**')