PierreBrunelle committed on
Commit
cb808f6
·
verified ·
1 Parent(s): 61df238

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -27
app.py CHANGED
@@ -1,15 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
- """LLM Comparison
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/156SKaX3DY6jwOhcpwZVM5AiLscOAbNNJ
8
- """
9
-
10
- # Commented out IPython magic to ensure Python compatibility.
11
- # %pip install -qU pixeltable gradio sentence-transformers tiktoken openai openpyxl
12
-
13
  import gradio as gr
14
  import pandas as pd
15
  import pixeltable as pxt
@@ -50,22 +38,29 @@ def create_prompt(top_k_list: list[dict], question: str) -> str:
50
 
51
  {question}'''
52
 
 
53
  def process_files(ground_truth_file, pdf_files):
54
- # Process ground truth file
 
 
 
 
 
55
  if ground_truth_file.name.endswith('.csv'):
56
  queries_t = pxt.io.import_csv('rag_demo.queries', ground_truth_file.name)
57
  else:
58
  queries_t = pxt.io.import_excel('rag_demo.queries', ground_truth_file.name)
59
 
60
- # Process PDF files
61
  documents_t = pxt.create_table(
62
  'rag_demo.documents',
63
  {'document': pxt.DocumentType()}
64
  )
65
 
 
66
  documents_t.insert({'document': file.name} for file in pdf_files if file.name.endswith('.pdf'))
67
 
68
- # Create chunks view
69
  chunks_t = pxt.create_view(
70
  'rag_demo.chunks',
71
  documents_t,
@@ -76,10 +71,10 @@ def process_files(ground_truth_file, pdf_files):
76
  )
77
  )
78
 
79
- # Add embedding index
80
  chunks_t.add_embedding_index('text', string_embed=e5_embed)
81
 
82
- # Create top_k query
83
  @chunks_t.query
84
  def top_k(query_text: str):
85
  sim = chunks_t.text.similarity(query_text)
@@ -89,13 +84,13 @@ def process_files(ground_truth_file, pdf_files):
89
  .limit(5)
90
  )
91
 
92
- # Add computed columns to queries_t
93
  queries_t['question_context'] = chunks_t.top_k(queries_t.Question)
94
  queries_t['prompt'] = create_prompt(
95
  queries_t.question_context, queries_t.Question
96
  )
97
 
98
- # Prepare messages for OpenAI
99
  messages = [
100
  {
101
  'role': 'system',
@@ -109,17 +104,18 @@ def process_files(ground_truth_file, pdf_files):
109
 
110
  # Add OpenAI response column
111
  queries_t['response'] = openai.chat_completions(
112
- model='gpt-4o-mini-2024-07-18', messages=messages
113
  )
114
 
115
- queries_t['answer'] = queries_t.response.choices[0].message.content
 
116
 
 
117
  df_output = queries_t.select(queries_t.Question, queries_t.correct_answer, queries_t.answer).collect().to_pandas()
118
 
119
  try:
120
- #Display content
121
- return df_output
122
-
123
  except Exception as e:
124
  return f"An error occurred: {str(e)}", None
125
 
@@ -127,18 +123,21 @@ def process_files(ground_truth_file, pdf_files):
127
  with gr.Blocks() as demo:
128
  gr.Markdown("# RAG Demo App")
129
 
 
130
  with gr.Row():
131
  ground_truth_file = gr.File(label="Upload Ground Truth (CSV or XLSX)", file_count="single")
132
  pdf_files = gr.File(label="Upload PDF Documents", file_count="multiple")
133
 
134
- process_button = gr.Button("Process Files")
 
135
 
 
136
  df_output = gr.DataFrame(label="Pixeltable Table")
137
 
 
138
  #question_input = gr.Textbox(label="Enter your question")
139
  #query_button = gr.Button("Query LLM")
140
-
141
- process_button.click(process_files, inputs=[ground_truth_file, pdf_files], outputs=df_output)
142
  #query_button.click(query_llm, inputs=question_input, outputs=output_dataframe)
143
 
144
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import pixeltable as pxt
 
38
 
39
  {question}'''
40
 
41
+ # Gradio Application
42
  def process_files(ground_truth_file, pdf_files):
43
+ # Ensure a clean slate for the demo by removing and recreating the 'rag_demo' directory
44
+ pxt.drop_dir('rag_demo', force=True)
45
+ pxt.create_dir('rag_demo')
46
+
47
+ # Process the ground truth file, which contains questions and correct answers
48
+ # Import as CSV or Excel depending on the file extension
49
  if ground_truth_file.name.endswith('.csv'):
50
  queries_t = pxt.io.import_csv('rag_demo.queries', ground_truth_file.name)
51
  else:
52
  queries_t = pxt.io.import_excel('rag_demo.queries', ground_truth_file.name)
53
 
54
+ # Create a table to store the uploaded PDF documents
55
  documents_t = pxt.create_table(
56
  'rag_demo.documents',
57
  {'document': pxt.DocumentType()}
58
  )
59
 
60
+ # Insert the PDF files into the documents table
61
  documents_t.insert({'document': file.name} for file in pdf_files if file.name.endswith('.pdf'))
62
 
63
+ # Create a view that splits the documents into smaller chunks
64
  chunks_t = pxt.create_view(
65
  'rag_demo.chunks',
66
  documents_t,
 
71
  )
72
  )
73
 
74
+ # Add an embedding index to the chunks for similarity search
75
  chunks_t.add_embedding_index('text', string_embed=e5_embed)
76
 
77
+ # Define a query function to retrieve the top-k most similar chunks for a given question
78
  @chunks_t.query
79
  def top_k(query_text: str):
80
  sim = chunks_t.text.similarity(query_text)
 
84
  .limit(5)
85
  )
86
 
87
+ # Add computed columns to the queries table for context retrieval and prompt creation
88
  queries_t['question_context'] = chunks_t.top_k(queries_t.Question)
89
  queries_t['prompt'] = create_prompt(
90
  queries_t.question_context, queries_t.Question
91
  )
92
 
93
+ # Prepare messages for the OpenAI API, including system instructions and user prompt
94
  messages = [
95
  {
96
  'role': 'system',
 
104
 
105
  # Add OpenAI response column
106
  queries_t['response'] = openai.chat_completions(
107
+ model='gpt-4o-mini-2024-07-18', messages=messages
108
  )
109
 
110
+ # Extract the answer text from the API response
111
+ queries_t['answer'] = queries_t.response.choices[0].message.content.astype(pxt.StringType())
112
 
113
+ # Prepare the output dataframe with questions, correct answers, and model-generated answers
114
  df_output = queries_t.select(queries_t.Question, queries_t.correct_answer, queries_t.answer).collect().to_pandas()
115
 
116
  try:
117
+ # Return the output dataframe for display
118
+ return df_output
 
119
  except Exception as e:
120
  return f"An error occurred: {str(e)}", None
121
 
 
123
  with gr.Blocks() as demo:
124
  gr.Markdown("# RAG Demo App")
125
 
126
+ # File upload components for ground truth and PDF documents
127
  with gr.Row():
128
  ground_truth_file = gr.File(label="Upload Ground Truth (CSV or XLSX)", file_count="single")
129
  pdf_files = gr.File(label="Upload PDF Documents", file_count="multiple")
130
 
131
+ # Button to trigger file processing
132
+ process_button = gr.Button("Process Files and Generate Outputs")
133
 
134
+ # Output component to display the results
135
  df_output = gr.DataFrame(label="Pixeltable Table")
136
 
137
+ process_button.click(process_files, inputs=[ground_truth_file, pdf_files], outputs=df_output)
138
  #question_input = gr.Textbox(label="Enter your question")
139
  #query_button = gr.Button("Query LLM")
140
+
 
141
  #query_button.click(query_llm, inputs=question_input, outputs=output_dataframe)
142
 
143
  if __name__ == "__main__":