Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import fitz
|
3 |
+
from PIL import Image
|
4 |
+
import pytesseract
|
5 |
+
import gradio as gr
|
6 |
+
import pandas as pd
|
7 |
+
import os
|
8 |
+
|
9 |
+
# Options handed to pytesseract for every crop: page-segmentation mode 6 and a
# character whitelist limited to digits and the punctuation found in amounts.
config_val = "--psm 6 -c tessedit_char_whitelist=0123456789,.-+"

# Each entry is a four-corner quadrilateral [(x, y), ...] outlining one numeric
# field. Only the axis-aligned bounding box of the corners is used downstream
# (see get_bounding_rect), where it becomes the clip rectangle for rendering.

# Form 1040, page 1 fields.
page1_rects = [
    [(464, 399), (576, 399), (575, 409), (462, 410)],
    [(462, 519), (577, 518), (577, 531), (463, 529)],
    [(225, 517), (340, 518), (339, 530), (224, 530)],
    [(225, 530), (339, 532), (340, 541), (225, 542)],
    [(464, 531), (576, 531), (576, 542), (464, 542)],
    [(464, 589), (578, 589), (577, 602), (464, 602)],
    [(463, 624), (578, 626), (576, 639), (464, 637)],
    [(462, 652), (576, 651), (577, 661), (464, 663)],
    [(463, 661), (578, 664), (578, 676), (462, 674)],
    [(464, 699), (578, 684), (578, 699), (464, 699)],
]

# Form 1040, page 2 fields.
page2_rects = [
    [(462, 15), (575, 15), (576, 26), (463, 26)],
    [(462, 62), (577, 63), (579, 75), (462, 73)],
    [(463, 98), (576, 98), (578, 110), (462, 110)],
    [(461, 111), (576, 111), (578, 123), (459, 122)],
]

# Schedule 1 fields (used as-is, without the vertical shift below).
schedule1_rects = [
    [(470, 204), (579, 203), (577, 216), (471, 216)],  # Schedule 1 Line 3
    [(470, 228), (577, 229), (576, 240), (470, 240)],  # Schedule 1 Line 5
    [(362, 274), (466, 274), (468, 288), (360, 288)],  # Schedule 1 Line 8
]

# Vertical shift applied to every Form 1040 corner before use.
# NOTE(review): presumably compensates for an offset between where the
# coordinates were measured and the pages being processed - confirm.
_Y_SHIFT = 23

adjusted_page1_rects = [
    [(px, py + _Y_SHIFT) for px, py in quad] for quad in page1_rects
]
adjusted_page2_rects = [
    [(px, py + _Y_SHIFT) for px, py in quad] for quad in page2_rects
]
|
39 |
+
|
40 |
+
def get_bounding_rect(points):
    """Return the axis-aligned ``fitz.Rect`` enclosing all of *points*.

    *points* is a sequence of ``(x, y)`` pairs; only the first two items of
    each point are read. The rectangle spans from the smallest x/y to the
    largest x/y found among them.
    """
    x_coords, y_coords = [], []
    for vertex in points:
        x_coords.append(vertex[0])
        y_coords.append(vertex[1])

    left, top = min(x_coords), min(y_coords)
    right, bottom = max(x_coords), max(y_coords)
    return fitz.Rect(left, top, right, bottom)
|
44 |
+
|
45 |
+
|
46 |
+
def _open_pdf(source):
    """Open *source* (a filesystem path or a readable binary file object) with PyMuPDF."""
    if isinstance(source, str):
        return fitz.open(source)
    source.seek(0)
    return fitz.open(stream=source.read(), filetype="pdf")


def _ocr_page_regions(page, regions, zoom, draw_bboxes, box_color):
    """OCR every region of *page* and return one cleaned numeric string per region.

    Each region is a list of corner points; its bounding box is rendered at
    *zoom*, the right-hand 60% of the crop is passed to Tesseract, and every
    character other than digits, ``,``, ``.``, ``-`` and ``+`` is stripped.
    When *draw_bboxes* is true the boxes are outlined in *box_color* on the
    page, but only after all regions have been read.
    """
    values = []
    rects = []
    for region_points in regions:
        rect = get_bounding_rect(region_points)
        rects.append(rect)
        pix = page.get_pixmap(matrix=zoom, clip=rect)
        # NOTE(review): "RGB" assumes the pixmap carries no alpha channel.
        cropped_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        w, h = cropped_img.size
        # Discard the left 40% of the crop; only the remainder is OCR'd.
        val_img = cropped_img.crop((int(0.4 * w), 0, w, h))
        raw = pytesseract.image_to_string(val_img, config=config_val).strip()
        values.append(re.sub(r"[^\d,.\-+]", "", raw))

    if draw_bboxes:
        # Draw once OCR is finished: several regions touch or overlap, and an
        # outline added mid-loop could be rendered into a later crop.
        for rect in rects:
            page.draw_rect(rect, color=box_color, width=1.5)
    return values


def extract_numeric_values(pdf_file, schedule1_file=None, client_name="Unknown Client", draw_bboxes=False):
    """Extract the numeric fields of a Form 1040 (and optional Schedule 1) via OCR.

    Args:
        pdf_file: Path or readable binary file object of the main Form 1040
            PDF; its first two pages are read.
        schedule1_file: Optional path or file object of a Schedule 1 PDF; its
            first page is read.
        client_name: Name stored with the extracted row; must be non-blank.
        draw_bboxes: When true, the OCR regions are outlined and the annotated
            documents are saved as ``output_shifted.pdf`` and
            ``schedule1_bbox.pdf`` in the working directory.

    Returns:
        Always a 4-tuple ``(text, csv_path, form_1040_bbox_pdf,
        schedule1_bbox_pdf)`` so that it lines up with the four outputs of the
        Gradio interface. On failure ``text`` holds the error message and the
        other three items are ``None``.
    """
    try:
        if not client_name or client_name.strip() == "":
            return "Error: Client name is required.", None, None, None
        if pdf_file is None:
            return "Error: Main Form 1040 PDF is required.", None, None, None

        zoom = fitz.Matrix(2, 2)

        output_pdf_path = None
        doc = _open_pdf(pdf_file)
        try:
            if len(doc) < 2:
                return "Error: Main PDF must have at least 2 pages.", None, None, None

            page1_values = _ocr_page_regions(
                doc[0], adjusted_page1_rects, zoom, draw_bboxes, (1, 0, 0)
            )
            page2_values = _ocr_page_regions(
                doc[1], adjusted_page2_rects, zoom, draw_bboxes, (0, 0, 1)
            )
            if draw_bboxes:
                output_pdf_path = "output_shifted.pdf"
                doc.save(output_pdf_path)
        finally:
            # Release the document on every path, including early returns.
            doc.close()

        all_extracted_values = page1_values + page2_values
        output = [
            f"1040 Value {i+1}: {val}" for i, val in enumerate(all_extracted_values)
        ]

        schedule1_values = []
        schedule1_pdf_path = None
        if schedule1_file:
            schedule1_doc = _open_pdf(schedule1_file)
            try:
                if len(schedule1_doc) >= 1:
                    schedule1_values = _ocr_page_regions(
                        schedule1_doc[0], schedule1_rects, zoom, draw_bboxes, (0, 1, 0)
                    )
                    if draw_bboxes:
                        schedule1_pdf_path = "schedule1_bbox.pdf"
                        schedule1_doc.save(schedule1_pdf_path)
            finally:
                schedule1_doc.close()

            # The three regions correspond to Schedule 1 lines 3, 5 and 8.
            output += [
                f"Schedule 1 Line {line_no}: {val}"
                for line_no, val in zip((3, 5, 8), schedule1_values)
            ]

        csv_path = "./Clients_Output_Data_Form_1040.csv"
        save_to_csv_flat(
            all_extracted_values,
            schedule1_values,
            client_name=client_name,
            csv_path=csv_path,
        )

        return "\n".join(output), csv_path, output_pdf_path, schedule1_pdf_path

    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"Error occurred:\n{str(e)}", None, None, None
|
149 |
+
|
150 |
+
|
151 |
+
def save_to_csv_flat(all_extracted_values, schedule1_values, client_name="Unknown Client", csv_path=None):
    """Append one client's extracted values to the flat CSV ledger.

    Args:
        all_extracted_values: Form 1040 values in extraction order (page 1
            regions followed by page 2 regions); position ``i`` is written to
            the column that ``line_mapping`` assigns to index ``i``.
        schedule1_values: Up to three Schedule 1 values, in the order
            line 3, line 5, line 8. May be empty.
        client_name: Value stored in the "Client Name" column.
        csv_path: Destination file; defaults to
            ``./Clients_Output_Data_Form_1040.csv``.

    Empty extracted strings are stored as ``'0'``. Columns with no extracted
    value (for example "Gross Comp - W2 Box 5") are left blank. A row that is
    identical to one already in the file is not appended again. The file is
    (re)written on every call.
    """
    if csv_path is None:
        csv_path = "./Clients_Output_Data_Form_1040.csv"

    # Two header levels that are merged into a single flat header row.
    header_level_1 = [
        "Client Name", "Gross Comp", "Taxable Wages", "Taxable Interest Income: Sch. B", "Tax- Exempt Interest",
        "Qualified Dividends", "Ordinary Dividends", "Long Term Capital Gain or Loss",
        "Other Adjustments (from Schedule 1)", "Business Income or Loss (Schedule C)",
        "Rent/ Royalty (Schedule E)", "Other Income", "Standard Deduction", "Qualified Business Income Deduction",
        "Taxable Income", "Tax", "", "", "Total Tax",
    ]
    header_level_2 = [
        "", "W2 Box 5", "Line 1", "Line 2b", "Line 2a", "Line 3a", "Line 3b", "Line 7",
        "Line 10", "Schedule 1, Line 3", "Schedule 1, Line 5", "Schedule 1, Line 8",
        "Line 12", "Line 13", "Line 15", "Line 16", "Line 20, Schedule 3", "Line 23, Schedule 2", "Line 24",
    ]

    # "<level 1> - <level 2>" when both parts exist, otherwise whichever is set.
    flat_columns = [
        f"{h1.strip()} - {h2.strip()}" if h1.strip() and h2.strip()
        else (h1.strip() + h2.strip())
        for h1, h2 in zip(header_level_1, header_level_2)
    ]

    if os.path.exists(csv_path):
        print(f"Reading existing CSV file: {csv_path}")
        # Read everything back as text: letting pandas infer dtypes rewrites
        # stored values on each round trip (e.g. "0" next to a blank -> 0.0).
        df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        print(f"CSV columns: {df.columns.tolist()}")
        print(f"CSV rows before append: {len(df)}")
    else:
        print("CSV does not exist. Creating new DataFrame.")
        df = pd.DataFrame(columns=flat_columns)

    # Start from an all-blank row; the first column holds the client name.
    new_row = dict.fromkeys(flat_columns, "")
    new_row[flat_columns[0]] = client_name

    # Column name -> index into all_extracted_values.
    line_mapping = {
        "Taxable Wages - Line 1": 0,
        "Taxable Interest Income: Sch. B - Line 2b": 1,
        "Tax- Exempt Interest - Line 2a": 2,
        "Qualified Dividends - Line 3a": 3,
        "Ordinary Dividends - Line 3b": 4,
        "Long Term Capital Gain or Loss - Line 7": 5,
        "Other Adjustments (from Schedule 1) - Line 10": 6,
        "Standard Deduction - Line 12": 7,
        "Qualified Business Income Deduction - Line 13": 8,
        "Taxable Income - Line 15": 9,
        "Tax - Line 16": 10,
        "Line 20, Schedule 3": 11,
        "Line 23, Schedule 2": 12,
        "Total Tax - Line 24": 13,
    }

    extracted = list(all_extracted_values or [])
    for key, idx in line_mapping.items():
        if idx < len(extracted):
            new_row[key] = extracted[idx] if extracted[idx] != '' else '0'

    # zip() stops at the shorter side, so a short list cannot raise IndexError.
    schedule1_columns = (
        "Business Income or Loss (Schedule C) - Schedule 1, Line 3",
        "Rent/ Royalty (Schedule E) - Schedule 1, Line 5",
        "Other Income - Schedule 1, Line 8",
    )
    for column, value in zip(schedule1_columns, schedule1_values or []):
        new_row[column] = value if value != '' else '0'

    # Append unless an identical row is already stored. Compare as text, column
    # by column, against the whole candidate row (not just the client name).
    row_frame = pd.DataFrame([new_row], columns=flat_columns)
    existing = df.reindex(columns=flat_columns).fillna("").astype(str)
    candidate = row_frame.iloc[0].astype(str)
    already_present = bool(existing.eq(candidate, axis="columns").all(axis=1).any())

    if already_present:
        print("Identical row already present; skipping append.")
    else:
        df = pd.concat([df, row_frame], ignore_index=True)
    print(f"CSV rows after append: {len(df)}")
    df.to_csv(csv_path, index=False)
    print(f" Data saved to CSV: {csv_path}")
|
229 |
+
|
230 |
+
# Gradio UI
|
231 |
+
# Gradio UI: four inputs feed extract_numeric_values positionally
# (main PDF, Schedule 1 PDF, client name, draw-boxes flag) and its four
# return values fill the four outputs in order.
iface = gr.Interface(
    fn=extract_numeric_values,
    inputs=[
        gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
        gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
        gr.Textbox(label="Client Name", placeholder="Enter client name"),
        gr.Checkbox(label="Draw Bounding Boxes on Form 1040 and Schedule 1?"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Numeric Values", lines=20),
        # The file handed back is Clients_Output_Data_Form_1040.csv, so the
        # label says CSV rather than Excel.
        gr.File(label="Download CSV Output"),
        gr.File(label="1040 Bounding Boxes PDF"),
        gr.File(label="Schedule 1 Bounding Boxes PDF"),
    ],
    title="Form 1040 & Schedule 1 Extractor",
    description="Upload a 2-page 1040 Form and (optionally) Schedule 1 to extract numeric fields. Optionally draw bounding boxes.",
)
|
249 |
+
|
250 |
+
# with gr.Blocks(title="Tax PDF Extractor") as demo:
|
251 |
+
# gr.Markdown("## Tax PDF Extractor")
|
252 |
+
# gr.Markdown("Upload Form 1040 (at least 2 pages). Optionally upload Schedule 1 for extra fields.")
|
253 |
+
|
254 |
+
# client_name = gr.Textbox(label="Client Name (Required)", placeholder="Enter your full name")
|
255 |
+
|
256 |
+
# form_1040 = gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"])
|
257 |
+
|
258 |
+
# has_schedule1 = gr.Radio(
|
259 |
+
# choices=["Yes", "No"],
|
260 |
+
# label="Do you have Schedule 1?",
|
261 |
+
# value="No"
|
262 |
+
# )
|
263 |
+
|
264 |
+
# schedule1 = gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"], visible=False)
|
265 |
+
|
266 |
+
# # Show/hide schedule1 upload box
|
267 |
+
# def toggle_schedule1(choice):
|
268 |
+
# return gr.update(visible=choice == "Yes")
|
269 |
+
|
270 |
+
# has_schedule1.change(fn=toggle_schedule1, inputs=has_schedule1, outputs=schedule1)
|
271 |
+
|
272 |
+
# output_text = gr.Textbox(label="Extracted Numeric Values", lines=20)
|
273 |
+
# output_file = gr.File(label="Download Excel Output")
|
274 |
+
|
275 |
+
# def wrapper_extract(main_pdf, schedule1_pdf, client_name):
|
276 |
+
# if not client_name:
|
277 |
+
# return "Error: Client name is required.", None
|
278 |
+
# return extract_numeric_values(main_pdf, schedule1_pdf)
|
279 |
+
|
280 |
+
# submit_btn = gr.Button("Extract Data")
|
281 |
+
|
282 |
+
# submit_btn.click(
|
283 |
+
# fn=wrapper_extract,
|
284 |
+
# inputs=[form_1040, schedule1, client_name],
|
285 |
+
# outputs=[output_text, output_file]
|
286 |
+
# )
|
287 |
+
|
288 |
+
|
289 |
+
# Start the web server only when this file is executed as a script, so that
# importing the module (e.g. to reuse the extraction functions) has no
# side effects.
if __name__ == "__main__":
    iface.launch(share=True)
|
290 |
+
|