jojortz committed on
Commit
9fe1137
•
1 Parent(s): 626460e

add sample files

Browse files
data/GamingRevenueReport.pdf ADDED
Binary file (115 kB). View file
 
data/SamplePortfolioStatement.pdf ADDED
Binary file (351 kB). View file
 
tabs/arena_sxs.py CHANGED
@@ -2,6 +2,11 @@ import gradio as gr
2
  from utils.pdf_utils import update_page, load_pdf
3
  from extractors.model_runner import models, run_extract_parallel
4
 
 
 
 
 
 
5
  def update_dropdowns(model_a_choice, model_b_choice):
6
  if model_a_choice == model_b_choice:
7
  options_b = [m for m in models if m != model_a_choice]
@@ -11,8 +16,7 @@ def update_dropdowns(model_a_choice, model_b_choice):
11
  return gr.update(choices=options_b, value=model_b_choice)
12
 
13
  def sync_models(model_a_choice, model_b_choice):
14
- updated_dropdown_b = update_dropdowns(model_a_choice, model_b_choice)
15
- return updated_dropdown_b
16
 
17
  def dummy_function_a(model_a_choice):
18
  return f"Model A selected: {model_a_choice}"
@@ -30,20 +34,27 @@ def update_vote_button(output):
30
  def clear_outputs():
31
  return "", ""
32
 
 
 
 
 
33
  def arena_sxs():
34
  with gr.Blocks() as arena_block:
35
  gr.Markdown("# Rules")
36
  gr.Markdown("- Upload a PDF file to extract with two chosen models (e.g., Llama, Unstructured, ChatGPT, Claude) and vote for the better one!")
37
  gr.Markdown("- You can upload multiple files until you identify a winner.")
38
 
39
- gr.Markdown("## 1. Upload a file.")
40
  gr.Markdown("Only PDF files supported.")
41
  with gr.Row():
42
  with gr.Column(scale=2):
43
- pdf = gr.File(type="filepath", label="Upload PDF", file_types=[".pdf"])
 
 
44
  pdf_image = gr.Image(label="PDF Page")
45
  page_info = gr.Textbox(label="")
46
  current_page = gr.State(value=0)
 
47
  with gr.Row():
48
  prev_button = gr.Button("Previous")
49
  next_button = gr.Button("Next")
@@ -63,37 +74,55 @@ def arena_sxs():
63
  with gr.Row():
64
  extract_button = gr.Button("Parse", interactive=False)
65
 
66
- # with gr.Row():
67
- # vote_model_a_button = gr.Button("👈 A is better", interactive=False)
68
- # vote_model_b_button = gr.Button("👉 B is better", interactive=False)
69
- # vote_tie_button = gr.Button("🤝 Tie", interactive=False)
70
- # vote_bad_button = gr.Button("👎 Both are bad", interactive=False)
71
-
72
  # Handlers
73
- pdf.change(load_pdf, inputs=[pdf], outputs=[pdf_image, page_info, current_page])
74
- pdf.change(fn=update_button, inputs=pdf, outputs=extract_button)
75
- model_a.change(sync_models, inputs=[model_a, model_b], outputs=model_b)
76
- model_a.change(dummy_function_a, inputs=model_a)
77
- model_b.change(dummy_function_b, inputs=model_b)
78
- # output_a.change(fn=update_vote_button, inputs=output_a, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])
79
- # output_b.change(fn=update_vote_button, inputs=output_b, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])
80
 
81
- # # Button Handlers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  prev_button.click(
83
  fn=lambda file_path, page: update_page(file_path, page, -1),
84
- inputs=[pdf, current_page],
85
- outputs=[pdf_image, page_info, current_page],
86
  )
87
 
88
  next_button.click(
89
  fn=lambda file_path, page: update_page(file_path, page, 1),
90
- inputs=[pdf, current_page],
91
- outputs=[pdf_image, page_info, current_page],
92
  )
93
 
94
  extract_button.click(
95
  fn=run_extract_parallel,
96
- inputs=[model_a, model_b, pdf],
97
  outputs=[output_a, output_b]
98
  )
99
 
@@ -102,9 +131,21 @@ def arena_sxs():
102
  outputs=[output_a, output_b]
103
  )
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  # vote_model_a_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.GOOD, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
106
  # vote_model_b_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.GOOD), inputs=[model_a, model_b], outputs=[output_a, output_b])
107
  # vote_tie_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
108
  # vote_bad_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.BAD, model_b, Vote.BAD), inputs=[model_a, model_b], outputs=[output_a, output_b])
109
 
110
- return arena_block
 
2
  from utils.pdf_utils import update_page, load_pdf
3
  from extractors.model_runner import models, run_extract_parallel
4
 
5
+ sample_files = {
6
+ "Portfolio Statement": "data/SamplePortfolioStatement.pdf",
7
+ "Gaming Revenue Report": "data/GamingRevenueReport.pdf",
8
+ }
9
+
10
  def update_dropdowns(model_a_choice, model_b_choice):
11
  if model_a_choice == model_b_choice:
12
  options_b = [m for m in models if m != model_a_choice]
 
16
  return gr.update(choices=options_b, value=model_b_choice)
17
 
18
  def sync_models(model_a_choice, model_b_choice):
19
+ return update_dropdowns(model_a_choice, model_b_choice)
 
20
 
21
  def dummy_function_a(model_a_choice):
22
  return f"Model A selected: {model_a_choice}"
 
34
  def clear_outputs():
35
  return "", ""
36
 
37
+ def load_selected_pdf(file_path):
38
+ img, page_info, page_num = load_pdf(file_path)
39
+ return img, page_info, page_num
40
+
41
  def arena_sxs():
42
  with gr.Blocks() as arena_block:
43
  gr.Markdown("# Rules")
44
  gr.Markdown("- Upload a PDF file to extract with two chosen models (e.g., Llama, Unstructured, ChatGPT, Claude) and vote for the better one!")
45
  gr.Markdown("- You can upload multiple files until you identify a winner.")
46
 
47
+ gr.Markdown("## 1. Upload a file or choose a sample.")
48
  gr.Markdown("Only PDF files supported.")
49
  with gr.Row():
50
  with gr.Column(scale=2):
51
+ with gr.Row():
52
+ pdf_radio = gr.Radio(choices=list(sample_files.keys()), label="Choose a Sample PDF")
53
+ pdf_file = gr.File(type="filepath", label="Upload PDF", file_types=[".pdf"])
54
  pdf_image = gr.Image(label="PDF Page")
55
  page_info = gr.Textbox(label="")
56
  current_page = gr.State(value=0)
57
+ file_path_state = gr.State(value=None) # Unified file path state
58
  with gr.Row():
59
  prev_button = gr.Button("Previous")
60
  next_button = gr.Button("Next")
 
74
  with gr.Row():
75
  extract_button = gr.Button("Parse", interactive=False)
76
 
 
 
 
 
 
 
77
  # Handlers
78
+ def update_pdf_from_source(file_path):
79
+ img, page_info, page_num = load_pdf(file_path)
80
+ return img, page_info, page_num
 
 
 
 
81
 
82
+ def update_pdf_and_button(file=None, radio=None):
83
+ file_path = file if file else sample_files.get(radio)
84
+ if file_path:
85
+ img, page_info, page_num = update_pdf_from_source(file_path)
86
+ return img, page_info, 0, file_path, update_button(file_path) # Include file_path in the outputs
87
+ return None, "No file selected", 0, None, update_button(None) # Include file_path in the outputs
88
+
89
+ def handle_file_change(file):
90
+ return update_pdf_and_button(file=file)
91
+
92
+ def handle_radio_change(radio):
93
+ return update_pdf_and_button(radio=radio)
94
+
95
+
96
+ # Handle file uploads
97
+ pdf_file.change(
98
+ fn=handle_file_change,
99
+ inputs=pdf_file,
100
+ outputs=[pdf_image, page_info, current_page, file_path_state, extract_button]
101
+ )
102
+
103
+ # Handle sample PDF selection from the radio
104
+ pdf_radio.change(
105
+ fn=handle_radio_change,
106
+ inputs=pdf_radio,
107
+ outputs=[pdf_image, page_info, current_page, file_path_state, extract_button]
108
+ )
109
+
110
+ # Button interactions
111
  prev_button.click(
112
  fn=lambda file_path, page: update_page(file_path, page, -1),
113
+ inputs=[file_path_state, current_page],
114
+ outputs=[pdf_image, page_info, current_page]
115
  )
116
 
117
  next_button.click(
118
  fn=lambda file_path, page: update_page(file_path, page, 1),
119
+ inputs=[file_path_state, current_page],
120
+ outputs=[pdf_image, page_info, current_page]
121
  )
122
 
123
  extract_button.click(
124
  fn=run_extract_parallel,
125
+ inputs=[model_a, model_b, file_path_state], # Use the unified file path state
126
  outputs=[output_a, output_b]
127
  )
128
 
 
131
  outputs=[output_a, output_b]
132
  )
133
 
134
+ return arena_block
135
+
136
+
137
+ # with gr.Row():
138
+ # vote_model_a_button = gr.Button("👈 A is better", interactive=False)
139
+ # vote_model_b_button = gr.Button("👉 B is better", interactive=False)
140
+ # vote_tie_button = gr.Button("🤝 Tie", interactive=False)
141
+ # vote_bad_button = gr.Button("👎 Both are bad", interactive=False)
142
+ # output_a.change(fn=update_vote_button, inputs=output_a, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])
143
+ # output_b.change(fn=update_vote_button, inputs=output_b, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])
144
+
145
+ # # Button Handlers
146
+
147
  # vote_model_a_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.GOOD, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
148
  # vote_model_b_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.GOOD), inputs=[model_a, model_b], outputs=[output_a, output_b])
149
  # vote_tie_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
150
  # vote_bad_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.BAD, model_b, Vote.BAD), inputs=[model_a, model_b], outputs=[output_a, output_b])
151
 
 
utils/pdf_utils.py CHANGED
@@ -1,22 +1,43 @@
1
- import fitz
2
  from PIL import Image
3
 
4
- def update_page(file_path, page_num, direction):
5
- if page_num is None:
6
- page_num = 0
7
- new_page_num = page_num + direction
8
- img, actual_page_num, total_pages = get_pdf_page(file_path, new_page_num)
9
- return img, f"Page {actual_page_num + 1} of {total_pages}", actual_page_num
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def get_pdf_page(file_path, page_num):
12
- doc = fitz.open(file_path)
13
- page_count = len(doc)
14
- page_num = max(0, min(page_num, page_count - 1)) # Ensure page_num is within bounds
15
- page = doc.load_page(page_num)
16
- pix = page.get_pixmap()
17
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
18
- return img, page_num, page_count
 
 
 
 
 
19
 
20
  def load_pdf(file_path):
21
  img, page_num, total_pages = get_pdf_page(file_path, 0)
22
- return img, f"Page {page_num + 1} of {total_pages}", page_num
 
 
1
+ import fitz # PyMuPDF
2
  from PIL import Image
3
 
4
def update_page(file_path, current_page, direction):
    """Move the PDF preview by ``direction`` pages and return the new view.

    Args:
        file_path: Path of the PDF being previewed; falsy when nothing is
            selected yet.
        current_page: Zero-based index of the page currently shown. ``None``
            is treated as page 0.
        direction: Signed page offset to apply (the buttons pass -1 or 1).

    Returns:
        A ``(image, page_info, page_index)`` tuple: the rendered page (or
        ``None`` when nothing can be shown), the status text for the page
        label, and the zero-based index to store as the new current page.
    """
    if not file_path:
        # Nothing to page through; hand the stored index back untouched.
        return None, "No file selected", current_page

    # The page state can still be unset before a document has been loaded.
    if current_page is None:
        current_page = 0

    # get_pdf_page clamps the requested index into the document's range and
    # reports the page count, so one call both renders the target page and
    # tells us the bounds (no separate first-page render is needed).
    img, new_page, total_pages = get_pdf_page(file_path, current_page + direction)

    if img is None or total_pages <= 0:
        # Rendering failed: keep the previous index rather than storing an
        # out-of-range page or showing a "Page 0 of 0" label.
        return None, "Unable to load PDF", current_page

    return img, f"Page {new_page + 1} of {total_pages}", new_page
24
+
25
 
26
def get_pdf_page(file_path, page_num):
    """Render one page of a PDF as a PIL image.

    Args:
        file_path: Path of the PDF to open.
        page_num: Requested zero-based page index; it is clamped into the
            document's valid range before rendering.

    Returns:
        ``(image, page_index, page_count)`` on success, where ``page_index``
        is the clamped index that was actually rendered. If anything goes
        wrong the error is printed and ``(None, page_num, 0)`` is returned.
    """
    try:
        # Open the document in a ``with`` block so it is closed after every
        # render, including when rendering raises.
        with fitz.open(file_path) as doc:
            page_count = len(doc)
            page_num = max(0, min(page_num, page_count - 1))  # Ensure page_num is within bounds
            pix = doc.load_page(page_num).get_pixmap()
            # Build the image while the document is still open.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        return img, page_num, page_count
    except Exception as e:
        # Best-effort boundary for the UI: report the problem and signal
        # failure through the return value instead of raising.
        print(f"Error loading PDF page: {e}")
        return None, page_num, 0
38
+
39
 
40
def load_pdf(file_path):
    """Render the first page of a PDF and describe it.

    Args:
        file_path: Path of the PDF to open.

    Returns:
        ``(image, page_info, total_pages)``. Note that the third element is
        the document's page count, not the index of the page shown (which is
        always the first page here). When the page cannot be rendered the
        image is ``None`` and ``page_info`` carries an error message.
    """
    img, page_num, total_pages = get_pdf_page(file_path, 0)
    if img is None:
        # Avoid a misleading "Page 1 of 0" label when rendering failed.
        return None, "Unable to load PDF", total_pages
    return img, f"Page {page_num + 1} of {total_pages}", total_pages
43
+