hertogateis committed on
Commit
e4382ce
·
verified ·
1 Parent(s): 7ffe460

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -44
app.py CHANGED
@@ -1,13 +1,13 @@
1
  import os
2
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
3
-
4
  import streamlit as st
5
  from st_aggrid import AgGrid
6
- import pandas as pd
7
  from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
8
 
 
9
  st.set_page_config(layout="wide")
10
 
 
11
  style = '''
12
  <style>
13
  body {background-color: #F5F5F5; color: #000000;}
@@ -23,45 +23,22 @@ style = '''
23
  '''
24
  st.markdown(style, unsafe_allow_html=True)
25
 
26
- st.markdown('<p style="font-family:sans-serif;font-size: 1.9rem;"> HertogAI Q&A table V1 using TAPAS and Text Generated</p>', unsafe_allow_html=True)
27
  st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)
28
 
29
- # Initialize TAPAS and Hugging Face Model (T5 for NLP generation)
30
  tqa = pipeline(task="table-question-answering",
31
  model="google/tapas-large-finetuned-wtq",
32
  device="cpu")
33
 
34
- model_name = "t5-small" # You can use a larger model or GPT as needed
35
- tokenizer = T5Tokenizer.from_pretrained(model_name)
36
- model = T5ForConditionalGeneration.from_pretrained(model_name)
37
-
38
- # Function to generate natural language from TAPAS output
39
- def generate_nlp_from_tapas(tapas_output, df):
40
- """
41
- Use Hugging Face's T5 model to generate natural language text from TAPAS output.
42
- """
43
- try:
44
- # Construct prompt using TAPAS output
45
- answer = tapas_output['answer']
46
- coordinates = tapas_output['coordinates']
47
- answer_data = [df.iloc[row, col] for row, col in coordinates]
48
-
49
- # Format the prompt for NLP model
50
- prompt = f"Answer: {answer}. Data Location: Rows {coordinates}, Values: {answer_data}. Please summarize this information in a natural language sentence."
51
-
52
- # Tokenize input and generate response
53
- inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
54
- outputs = model.generate(inputs, max_length=100, num_beams=5, early_stopping=True)
55
-
56
- # Decode and return the generated response
57
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
58
- return response
59
- except Exception as e:
60
- return f"Error generating response: {str(e)}"
61
-
62
 
 
63
  file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
64
 
 
65
  if file_name is None:
66
  st.markdown('<p class="font">Please upload an excel or csv file </p>', unsafe_allow_html=True)
67
  else:
@@ -88,6 +65,7 @@ else:
88
  df_numeric = df.copy()
89
  df = df.astype(str)
90
 
 
91
  grid_response = AgGrid(
92
  df.head(5),
93
  columns_auto_size_mode='FIT_CONTENTS',
@@ -99,24 +77,50 @@ else:
99
  except Exception as e:
100
  st.error(f"Error reading file: {str(e)}")
101
 
 
102
  question = st.text_input('Type your question')
103
-
 
104
  with st.spinner():
105
- if(st.button('Answer')):
106
  try:
107
  # Get the raw answer from TAPAS
108
  raw_answer = tqa(table=df, query=question, truncation=True)
109
 
110
- st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result: </p>", unsafe_allow_html=True)
 
111
  st.success(raw_answer)
112
 
113
- # Use Hugging Face's T5 model to generate NLP text from TAPAS output
114
- final_answer = generate_nlp_from_tapas(raw_answer, df)
115
-
116
- # Display the generated answer in a simple format
117
- st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Generated Answer: </p>", unsafe_allow_html=True)
118
- st.success(final_answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  except Exception as e:
121
- st.warning(f"Error: {str(e)} - Please retype your question and ensure it is correctly formatted.")
122
-
 
1
  import os
 
 
2
  import streamlit as st
3
  from st_aggrid import AgGrid
4
+ import pandas as pd
5
  from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
6
 
7
+ # Set the page layout for Streamlit
8
  st.set_page_config(layout="wide")
9
 
10
+ # CSS styling
11
  style = '''
12
  <style>
13
  body {background-color: #F5F5F5; color: #000000;}
 
23
  '''
24
  st.markdown(style, unsafe_allow_html=True)
25
 
26
+ st.markdown('<p style="font-family:sans-serif;font-size: 1.9rem;"> HertogAI Question Answering using TAPAS</p>', unsafe_allow_html=True)
27
  st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'>Pre-trained TAPAS model runs on max 64 rows and 32 columns data. Make sure the file data doesn't exceed these dimensions.</p>", unsafe_allow_html=True)
28
 
29
+ # Initialize TAPAS pipeline
30
  tqa = pipeline(task="table-question-answering",
31
  model="google/tapas-large-finetuned-wtq",
32
  device="cpu")
33
 
34
+ # Initialize T5 tokenizer and model for text generation
35
+ t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
36
+ t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # File uploader in the sidebar
39
  file_name = st.sidebar.file_uploader("Upload file:", type=['csv', 'xlsx'])
40
 
41
+ # File processing and question answering
42
  if file_name is None:
43
  st.markdown('<p class="font">Please upload an excel or csv file </p>', unsafe_allow_html=True)
44
  else:
 
65
  df_numeric = df.copy()
66
  df = df.astype(str)
67
 
68
+ # Display the first 5 rows of the dataframe in an editable grid
69
  grid_response = AgGrid(
70
  df.head(5),
71
  columns_auto_size_mode='FIT_CONTENTS',
 
77
  except Exception as e:
78
  st.error(f"Error reading file: {str(e)}")
79
 
80
+ # User input for the question
81
  question = st.text_input('Type your question')
82
+
83
+ # Process the answer using TAPAS and T5
84
  with st.spinner():
85
+ if st.button('Answer'):
86
  try:
87
  # Get the raw answer from TAPAS
88
  raw_answer = tqa(table=df, query=question, truncation=True)
89
 
90
+ st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Raw Result: </p>",
91
+ unsafe_allow_html=True)
92
  st.success(raw_answer)
93
 
94
+ # Extract relevant information from the TAPAS result
95
+ answer = raw_answer['answer']
96
+ aggregator = raw_answer.get('aggregator', '')
97
+ coordinates = raw_answer.get('coordinates', [])
98
+ cells = raw_answer.get('cells', [])
99
+
100
+ # Create a base sentence
101
+ if aggregator:
102
+ base_sentence = f"The {aggregator.lower()} of the selected data is {answer}."
103
+ if coordinates and cells:
104
+ rows_info = [f"Row {coordinate[0]+1}, Column {df.columns[coordinate[1]]} with value {cell}"
105
+ for coordinate, cell in zip(coordinates, cells)]
106
+ rows_description = " and ".join(rows_info)
107
+ base_sentence += f" This includes the following data: {rows_description}."
108
+ else:
109
+ base_sentence = f"The answer is: {answer}"
110
+
111
+ # Construct the full input for T5 model by including the original question
112
+ input_text = f"Given the question: '{question}', generate a more human-readable response: {base_sentence}"
113
+
114
+ # Tokenize the input and generate a fluent response using T5
115
+ inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
116
+ summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
117
+
118
+ # Decode the generated text
119
+ generated_text = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
120
+
121
+ # Display the final generated response
122
+ st.markdown("<p style='font-family:sans-serif;font-size: 0.9rem;'> Final Generated Response: </p>", unsafe_allow_html=True)
123
+ st.success(generated_text)
124
 
125
  except Exception as e:
126
+ st.warning("Please retype your question and make sure to use the column name and cell value correctly.")