Ayesha352 committed on
Commit
078ebba
·
verified ·
1 Parent(s): d4bc0a5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +290 -0
app.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import fitz
3
+ from PIL import Image
4
+ import pytesseract
5
+ import gradio as gr
6
+ import pandas as pd
7
+ import os
8
+
9
# Tesseract options: page segmentation mode 6 plus a character whitelist
# limited to digits and the punctuation , . - +
config_val = "--psm 6 -c tessedit_char_whitelist=0123456789,.-+"

# Field regions on the forms. Each region is a quadrilateral given as four
# (x, y) corner points; only its axis-aligned bounding box is used for OCR.

# Rectangles for Form 1040 Pages 1 & 2
page1_rects = [
    [(464, 399), (576, 399), (575, 409), (462, 410)],
    [(462, 519), (577, 518), (577, 531), (463, 529)],
    [(225, 517), (340, 518), (339, 530), (224, 530)],
    [(225, 530), (339, 532), (340, 541), (225, 542)],
    [(464, 531), (576, 531), (576, 542), (464, 542)],
    [(464, 589), (578, 589), (577, 602), (464, 602)],
    [(463, 624), (578, 626), (576, 639), (464, 637)],
    [(462, 652), (576, 651), (577, 661), (464, 663)],
    [(463, 661), (578, 664), (578, 676), (462, 674)],
    # NOTE(review): the first corner (464, 699) looks like a typo for
    # (464, 684); the bounding box is the same either way, so it is left as is.
    [(464, 699), (578, 684), (578, 699), (464, 699)],
]
page2_rects = [
    [(462, 15), (575, 15), (576, 26), (463, 26)],
    [(462, 62), (577, 63), (579, 75), (462, 73)],
    [(463, 98), (576, 98), (578, 110), (462, 110)],
    [(461, 111), (576, 111), (578, 123), (459, 122)],
]

schedule1_rects = [
    [(470, 204), (579, 203), (577, 216), (471, 216)],  # Schedule 1 Line 3
    [(470, 228), (577, 229), (576, 240), (470, 240)],  # Schedule 1 Line 5
    [(362, 274), (466, 274), (468, 288), (360, 288)],  # Schedule 1 Line 8
]

# Vertical offset added to every Form 1040 corner point before use.
# NOTE(review): presumably compensates for a layout difference between the
# form the rectangles were measured on and the PDFs being processed - confirm.
FIELD_Y_OFFSET = 23

adjusted_page1_rects = [
    [(x, y + FIELD_Y_OFFSET) for (x, y) in rect] for rect in page1_rects
]
adjusted_page2_rects = [
    [(x, y + FIELD_Y_OFFSET) for (x, y) in rect] for rect in page2_rects
]
39
+
40
def get_bounding_rect(points):
    """Return the axis-aligned bounding box of *points* as a ``fitz.Rect``.

    Args:
        points: Non-empty iterable of ``(x, y)`` pairs.

    Returns:
        ``fitz.Rect(min_x, min_y, max_x, max_y)``.

    Raises:
        ValueError: If *points* is empty (raised by ``min``/``max``).
    """
    xs = [pt[0] for pt in points]
    ys = [pt[1] for pt in points]
    return fitz.Rect(min(xs), min(ys), max(xs), max(ys))
44
+
45
+
46
def _open_pdf(source):
    """Open *source* (a path string or a binary file-like object) with PyMuPDF."""
    if isinstance(source, str):
        return fitz.open(source)
    # File-like upload: rewind first, it may already have been read.
    source.seek(0)
    return fitz.open(stream=source.read(), filetype="pdf")


def _ocr_rect_values(page, rects, zoom, box_color=None):
    """OCR the numeric text inside each region of *rects* on *page*.

    All regions are rendered and OCR'd before any box is drawn. Drawing a
    border first would put it on the page, so it could be rendered into the
    clip of a neighbouring region and disturb that region's OCR result.

    Args:
        page: PyMuPDF page to read from (and draw on when *box_color* is set).
        rects: Iterable of corner-point lists; each is reduced to its
            bounding box with ``get_bounding_rect``.
        zoom: ``fitz.Matrix`` used to scale the rendered clip.
        box_color: RGB tuple for the outline to draw, or ``None`` to draw
            nothing.

    Returns:
        One cleaned string per region, holding only digits and ``, . - +``.
    """
    bounds = [get_bounding_rect(points) for points in rects]

    values = []
    for rect in bounds:
        pix = page.get_pixmap(matrix=zoom, clip=rect)
        cropped_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        w, h = cropped_img.size
        # Only the right-hand 60% of the clip is passed to OCR.
        # NOTE(review): presumably the left part holds a label or line
        # number rather than the amount - confirm.
        val_img = cropped_img.crop((int(0.4 * w), 0, w, h))
        raw = pytesseract.image_to_string(val_img, config=config_val).strip()
        values.append(re.sub(r"[^\d,.\-+]", "", raw))

    if box_color is not None:
        for rect in bounds:
            page.draw_rect(rect, color=box_color, width=1.5)

    return values


def extract_numeric_values(pdf_file, schedule1_file=None, client_name="Unknown Client", draw_bboxes=False):
    """Extract numeric fields from a Form 1040 PDF and an optional Schedule 1.

    The values are OCR'd from fixed page regions, appended to the clients CSV
    through ``save_to_csv_flat``, and returned as display text.

    Args:
        pdf_file: Main Form 1040 PDF, as a path string or a binary file-like
            object. Must contain at least two pages.
        schedule1_file: Optional Schedule 1 PDF, same accepted types.
        client_name: Name stored with the extracted row; must be non-blank.
        draw_bboxes: When true, outline the OCR regions and save annotated
            copies of the PDFs.

    Returns:
        A 4-tuple ``(text, csv_path, form_1040_pdf_path, schedule1_pdf_path)``.
        The two PDF paths are ``None`` unless *draw_bboxes* is set (and, for
        the last one, a Schedule 1 was supplied). On any failure the tuple is
        ``(error_message, None, None, None)``, so the number of returned
        values is the same on every path.
    """
    try:
        if not client_name or not client_name.strip():
            return "Error: Client name is required.", None, None, None

        if pdf_file is None:
            return "Error: Main Form 1040 PDF is required.", None, None, None

        zoom = fitz.Matrix(2, 2)

        output_pdf_path = None
        doc = _open_pdf(pdf_file)
        try:
            if len(doc) < 2:
                return "Error: Main PDF must have at least 2 pages.", None, None, None

            page1_values = _ocr_rect_values(
                doc[0], adjusted_page1_rects, zoom,
                box_color=(1, 0, 0) if draw_bboxes else None,
            )
            page2_values = _ocr_rect_values(
                doc[1], adjusted_page2_rects, zoom,
                box_color=(0, 0, 1) if draw_bboxes else None,
            )

            if draw_bboxes:
                # NOTE: fixed file name in the working directory; concurrent
                # requests overwrite each other's annotated copy.
                output_pdf_path = "output_shifted.pdf"
                doc.save(output_pdf_path)
        finally:
            doc.close()

        all_extracted_values = page1_values + page2_values
        output = [f"1040 Value {i+1}: {val}" for i, val in enumerate(all_extracted_values)]

        schedule1_values = []
        schedule1_pdf_path = None

        if schedule1_file:
            schedule1_doc = _open_pdf(schedule1_file)
            try:
                if len(schedule1_doc) >= 1:
                    schedule1_values = _ocr_rect_values(
                        schedule1_doc[0], schedule1_rects, zoom,
                        box_color=(0, 1, 0) if draw_bboxes else None,  # green boxes
                    )
                    if draw_bboxes:
                        schedule1_pdf_path = "schedule1_bbox.pdf"
                        schedule1_doc.save(schedule1_pdf_path)
            finally:
                schedule1_doc.close()

            # The three regions are Schedule 1 lines 3, 5 and 8, in that order.
            output += [
                f"Schedule 1 Line {line_no}: {val}"
                for line_no, val in zip((3, 5, 8), schedule1_values)
            ]

        csv_output_path = "./Clients_Output_Data_Form_1040.csv"
        save_to_csv_flat(
            all_extracted_values,
            schedule1_values,
            client_name=client_name,
            csv_path=csv_output_path,
        )

        return "\n".join(output), csv_output_path, output_pdf_path, schedule1_pdf_path

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error occurred:\n{str(e)}", None, None, None
149
+
150
+
151
def save_to_csv_flat(all_extracted_values, schedule1_values, client_name="Unknown Client", csv_path=None):
    """Append one client's extracted values as a row of the flat CSV.

    The CSV is read and written as text, so stored values are not re-typed on
    the way through (for example "0" is not rewritten as "0.0"). A row that is
    identical to one already in the file, client name and every value, is not
    appended again.

    Args:
        all_extracted_values: Form 1040 values in extraction order; index ``i``
            fills the column mapped to ``i`` below. Missing trailing values
            leave their columns blank.
        schedule1_values: Up to three values for Schedule 1 lines 3, 5 and 8,
            in that order. May be empty.
        client_name: Value written to the "Client Name" column.
        csv_path: Destination file; defaults to
            ``./Clients_Output_Data_Form_1040.csv``.

    Returns:
        None. The CSV at *csv_path* is created or updated as a side effect.
    """
    if csv_path is None:
        csv_path = "./Clients_Output_Data_Form_1040.csv"

    # Header components
    header_level_1 = [
        "Client Name", "Gross Comp", "Taxable Wages", "Taxable Interest Income: Sch. B", "Tax- Exempt Interest",
        "Qualified Dividends", "Ordinary Dividends", "Long Term Capital Gain or Loss",
        "Other Adjustments (from Schedule 1)", "Business Income or Loss (Schedule C)",
        "Rent/ Royalty (Schedule E)", "Other Income", "Standard Deduction", "Qualified Business Income Deduction",
        "Taxable Income", "Tax", "", "", "Total Tax",
    ]
    header_level_2 = [
        "", "W2 Box 5", "Line 1", "Line 2b", "Line 2a", "Line 3a", "Line 3b", "Line 7",
        "Line 10", "Schedule 1, Line 3", "Schedule 1, Line 5", "Schedule 1, Line 8",
        "Line 12", "Line 13", "Line 15", "Line 16", "Line 20, Schedule 3", "Line 23, Schedule 2", "Line 24",
    ]

    # Flatten headers for CSV: "<level 1> - <level 2>", or whichever level is
    # non-empty when the other one is blank.
    flat_columns = [
        f"{h1.strip()} - {h2.strip()}" if h1.strip() and h2.strip()
        else (h1.strip() + h2.strip())
        for h1, h2 in zip(header_level_1, header_level_2)
    ]

    if os.path.exists(csv_path):
        print(f"Reading existing CSV file: {csv_path}")
        try:
            # Text-only read: no dtype inference, blank cells stay "".
            df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # Zero-byte file: treat it like a file that does not exist yet.
            df = pd.DataFrame(columns=flat_columns)
        print(f"CSV columns: {df.columns.tolist()}")
        print(f"CSV rows before append: {len(df)}")
    else:
        print("CSV does not exist. Creating new DataFrame.")
        df = pd.DataFrame(columns=flat_columns)

    # New row: every column blank until filled below.
    new_row = dict.fromkeys(flat_columns, "")
    new_row[flat_columns[0]] = client_name

    # Map Page 1-2 values: column name -> index into all_extracted_values.
    line_mapping = {
        "Taxable Wages - Line 1": 0,
        "Taxable Interest Income: Sch. B - Line 2b": 1,
        "Tax- Exempt Interest - Line 2a": 2,
        "Qualified Dividends - Line 3a": 3,
        "Ordinary Dividends - Line 3b": 4,
        "Long Term Capital Gain or Loss - Line 7": 5,
        "Other Adjustments (from Schedule 1) - Line 10": 6,
        "Standard Deduction - Line 12": 7,
        "Qualified Business Income Deduction - Line 13": 8,
        "Taxable Income - Line 15": 9,
        "Tax - Line 16": 10,
        "Line 20, Schedule 3": 11,
        "Line 23, Schedule 2": 12,
        "Total Tax - Line 24": 13,
    }

    for key, idx in line_mapping.items():
        if idx < len(all_extracted_values):
            value = all_extracted_values[idx]
            # An empty OCR result is recorded as zero.
            new_row[key] = value if value != '' else '0'

    # Add Schedule 1 values; zip stops early when fewer than three are given.
    schedule1_columns = (
        "Business Income or Loss (Schedule C) - Schedule 1, Line 3",
        "Rent/ Royalty (Schedule E) - Schedule 1, Line 5",
        "Other Income - Schedule 1, Line 8",
    )
    for key, value in zip(schedule1_columns, schedule1_values or ()):
        new_row[key] = value if value != '' else '0'

    # Duplicate check on the text form of every expected column.
    candidate = [str(new_row[col]) for col in flat_columns]
    aligned = df.reindex(columns=flat_columns)
    existing_rows = [
        ["" if pd.isna(cell) else str(cell) for cell in record]
        for record in aligned.itertuples(index=False, name=None)
    ]

    # Append and save
    if candidate in existing_rows:
        print(f"Identical row already present; CSV left unchanged: {csv_path}")
        return

    row_df = pd.DataFrame([new_row], columns=flat_columns)
    df = row_df if df.empty else pd.concat([df, row_df], ignore_index=True)
    print(f"CSV rows after append: {len(df)}")
    df.to_csv(csv_path, index=False)
    print(f" Data saved to CSV: {csv_path}")
229
+
230
+ # Gradio UI
231
# The callback receives the four inputs in the order listed, and its return
# values fill the outputs in the order listed, so it must return one value
# per output component.
iface = gr.Interface(
    fn=extract_numeric_values,
    inputs=[
        gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
        gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
        gr.Textbox(label="Client Name", placeholder="Enter client name"),
        gr.Checkbox(label="Draw Bounding Boxes on Form 1040 and Schedule 1?"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Numeric Values", lines=20),
        # The downloadable file is the clients CSV, not an Excel workbook.
        gr.File(label="Download CSV Output"),
        gr.File(label="1040 Bounding Boxes PDF"),
        gr.File(label="Schedule 1 Bounding Boxes PDF"),
    ],
    title="Form 1040 & Schedule 1 Extractor",
    description="Upload a 2-page 1040 Form and (optionally) Schedule 1 to extract numeric fields. Optionally draw bounding boxes.",
)
249
+
250
+ # with gr.Blocks(title="Tax PDF Extractor") as demo:
251
+ # gr.Markdown("## Tax PDF Extractor")
252
+ # gr.Markdown("Upload Form 1040 (at least 2 pages). Optionally upload Schedule 1 for extra fields.")
253
+
254
+ # client_name = gr.Textbox(label="Client Name (Required)", placeholder="Enter your full name")
255
+
256
+ # form_1040 = gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"])
257
+
258
+ # has_schedule1 = gr.Radio(
259
+ # choices=["Yes", "No"],
260
+ # label="Do you have Schedule 1?",
261
+ # value="No"
262
+ # )
263
+
264
+ # schedule1 = gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"], visible=False)
265
+
266
+ # # Show/hide schedule1 upload box
267
+ # def toggle_schedule1(choice):
268
+ # return gr.update(visible=choice == "Yes")
269
+
270
+ # has_schedule1.change(fn=toggle_schedule1, inputs=has_schedule1, outputs=schedule1)
271
+
272
+ # output_text = gr.Textbox(label="Extracted Numeric Values", lines=20)
273
+ # output_file = gr.File(label="Download Excel Output")
274
+
275
+ # def wrapper_extract(main_pdf, schedule1_pdf, client_name):
276
+ # if not client_name:
277
+ # return "Error: Client name is required.", None
278
+ # return extract_numeric_values(main_pdf, schedule1_pdf)
279
+
280
+ # submit_btn = gr.Button("Extract Data")
281
+
282
+ # submit_btn.click(
283
+ # fn=wrapper_extract,
284
+ # inputs=[form_1040, schedule1, client_name],
285
+ # outputs=[output_text, output_file]
286
+ # )
287
+
288
+
289
# Start the web UI only when this file is run as a script, so importing the
# module (for tests or reuse) does not launch a server.
if __name__ == "__main__":
    iface.launch(share=True)
290
+