Spaces:

MLBench
/

Tax_Document_Analysis

Running

App Files Community

Ayesha352 committed 11 days ago

Commit

078ebba

verified ·

1 Parent(s): d4bc0a5

Create app.py

Browse files

Files changed (1) hide show

app.py +290 -0

app.py ADDED Viewed

	@@ -0,0 +1,290 @@

+import re
+import fitz
+from PIL import Image
+import pytesseract
+import gradio as gr
+import pandas as pd
+import os
+# Tesseract options shared by every OCR call: page-segmentation mode 6, with
+# recognition restricted to the characters 0-9 , . - +
+config_val = "--psm 6 -c tessedit_char_whitelist=0123456789,.-+"
+# Rectangles for Form 1040 Pages 1 & 2
+# Each entry is a four-corner quadrilateral of (x, y) points. Only its
+# axis-aligned bounding box is used (see get_bounding_rect), so slightly
+# skewed corners are harmless.
+# The ORDER of the entries matters: save_to_csv_flat assigns values to CSV
+# columns by their position in page1_rects + page2_rects.
+# Coordinates are presumably PDF points, since they end up in a fitz.Rect
+# used as a page clip -- TODO confirm.
+page1_rects = [
+    [(464, 399), (576, 399), (575, 409), (462, 410)],
+    [(462, 519), (577, 518), (577, 531), (463, 529)],
+    [(225, 517), (340, 518), (339, 530), (224, 530)],
+    [(225, 530), (339, 532), (340, 541), (225, 542)],
+    [(464, 531), (576, 531), (576, 542), (464, 542)],
+    [(464, 589), (578, 589), (577, 602), (464, 602)],
+    [(463, 624), (578, 626), (576, 639), (464, 637)],
+    [(462, 652), (576, 651), (577, 661), (464, 663)],
+    [(463, 661), (578, 664), (578, 676), (462, 674)],
+    # NOTE(review): the first corner's y (699) looks like a typo; the bounding
+    # box is unaffected because the second corner supplies the top edge (684).
+    [(464, 699), (578, 684), (578, 699), (464, 699)]
+]
+page2_rects = [
+    [(462, 15), (575, 15), (576, 26), (463, 26)],
+    [(462, 62), (577, 63), (579, 75), (462, 73)],
+    [(463, 98), (576, 98), (578, 110), (462, 110)],
+    [(461, 111), (576, 111), (578, 123), (459, 122)]
+]
+# Rectangles for the first page of the optional Schedule 1 upload; these are
+# used as-is (no vertical shift is applied to them).
+schedule1_rects = [
+    [(470, 204), (579, 203), (577, 216), (471, 216)],  # Schedule 1 Line 3
+    [(470, 228), (577, 229), (576, 240), (470, 240)],  # Schedule 1 Line 5
+    [(362, 274), (466, 274), (468, 288), (360, 288)]   # Schedule 1 Line 8
+]
+# The 1040 rectangles are moved down by 23 units before use; these shifted
+# copies are what extract_numeric_values actually reads.
+adjusted_page1_rects = [[(x, y + 23) for (x, y) in rect] for rect in page1_rects]
+adjusted_page2_rects = [[(x, y + 23) for (x, y) in rect] for rect in page2_rects]
+def get_bounding_rect(points):
+    """Return the smallest axis-aligned fitz.Rect that encloses *points*.
+
+    points: sequence of (x, y) pairs, e.g. the four corners of one field box.
+    """
+    x_coords = [corner[0] for corner in points]
+    y_coords = [corner[1] for corner in points]
+    left, top = min(x_coords), min(y_coords)
+    right, bottom = max(x_coords), max(y_coords)
+    return fitz.Rect(left, top, right, bottom)
+def _open_pdf(source):
+    """Open *source* with PyMuPDF; it is either a path string or a seekable binary file object."""
+    if isinstance(source, str):
+        return fitz.open(source)
+    source.seek(0)
+    return fitz.open(stream=source.read(), filetype="pdf")
+
+
+def _ocr_fields(page, rects, zoom):
+    """OCR every rectangle in *rects* on *page*.
+
+    Returns (boxes, values): the fitz.Rect bounding boxes and, in the same
+    order, the recognised text reduced to digits and the characters , . - +
+    """
+    boxes = [get_bounding_rect(corners) for corners in rects]
+    values = []
+    for box in boxes:
+        pix = page.get_pixmap(matrix=zoom, clip=box)
+        field_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        width, height = field_img.size
+        # Only the right-hand 60% of each box is OCR'd; the left 40% is
+        # discarded (presumably it holds the printed line label, not the amount).
+        value_img = field_img.crop((int(0.4 * width), 0, width, height))
+        raw = pytesseract.image_to_string(value_img, config=config_val).strip()
+        values.append(re.sub(r"[^\d,.\-+]", "", raw))
+    return boxes, values
+
+
+def _outline_fields(page, boxes, color):
+    """Draw a 1.5-wide outline in *color* around each box on *page*."""
+    for box in boxes:
+        page.draw_rect(box, color=color, width=1.5)
+
+
+def extract_numeric_values(pdf_file, schedule1_file=None, client_name="Unknown Client", draw_bboxes=False):
+    """OCR the fixed Form 1040 / Schedule 1 fields and append them to the client CSV.
+
+    pdf_file: main Form 1040 PDF, as a path string or a seekable binary file
+        object; it must have at least two pages.
+    schedule1_file: optional Schedule 1 PDF (same accepted types); only its
+        first page is read.
+    client_name: required, non-blank; stored in the first CSV column.
+    draw_bboxes: when true, the OCR'd regions are outlined and the annotated
+        PDFs are saved as "output_shifted.pdf" and "schedule1_bbox.pdf".
+
+    Always returns a 4-tuple, one item per Gradio output component:
+    (report text, CSV path, annotated 1040 PDF path, annotated Schedule 1 PDF
+    path). Paths that do not apply are None. On any failure the first item is
+    an error message and the other three are None; no exception escapes.
+
+    NOTE: all output files use fixed names in the working directory, so
+    concurrent calls overwrite each other's files.
+    """
+    try:
+        if not client_name or client_name.strip() == "":
+            return "Error: Client name is required.", None, None, None
+        if pdf_file is None:
+            return "Error: Main Form 1040 PDF is required.", None, None, None
+        zoom = fitz.Matrix(2, 2)
+        all_extracted_values = []
+        output_pdf_path = None
+        # The context manager closes the document on every exit path,
+        # including the early return and exceptions.
+        with _open_pdf(pdf_file) as doc:
+            if len(doc) < 2:
+                return "Error: Main PDF must have at least 2 pages.", None, None, None
+            page_jobs = (
+                (doc[0], adjusted_page1_rects, (1, 0, 0)),  # page 1, red boxes
+                (doc[1], adjusted_page2_rects, (0, 0, 1)),  # page 2, blue boxes
+            )
+            for page, rects, color in page_jobs:
+                boxes, values = _ocr_fields(page, rects, zoom)
+                all_extracted_values.extend(values)
+                # Outline only after the whole page has been OCR'd, so a box
+                # stroke can never show up inside a neighbouring field's clip.
+                if draw_bboxes:
+                    _outline_fields(page, boxes, color)
+            if draw_bboxes:
+                output_pdf_path = "output_shifted.pdf"
+                doc.save(output_pdf_path)
+        output = [f"1040 Value {i+1}: {val}" for i, val in enumerate(all_extracted_values)]
+        schedule1_values = []
+        schedule1_pdf_path = None
+        if schedule1_file:
+            with _open_pdf(schedule1_file) as schedule1_doc:
+                if len(schedule1_doc) >= 1:
+                    page = schedule1_doc[0]
+                    boxes, schedule1_values = _ocr_fields(page, schedule1_rects, zoom)
+                    if draw_bboxes:
+                        _outline_fields(page, boxes, (0, 1, 0))  # green boxes
+                        schedule1_pdf_path = "schedule1_bbox.pdf"
+                        schedule1_doc.save(schedule1_pdf_path)
+            # Form line numbers of the schedule1_rects entries, in order.
+            schedule1_lines = (3, 5, 8)
+            output += [
+                f"Schedule 1 Line {line}: {val}"
+                for line, val in zip(schedule1_lines, schedule1_values)
+            ]
+        csv_path = "./Clients_Output_Data_Form_1040.csv"
+        save_to_csv_flat(all_extracted_values, schedule1_values, client_name=client_name, csv_path=csv_path)
+        return "\n".join(output), csv_path, output_pdf_path, schedule1_pdf_path
+    except Exception as e:
+        # UI boundary: report the failure as text instead of raising.
+        return f"Error occurred:\n{str(e)}", None, None, None
+def save_to_csv_flat(all_extracted_values, schedule1_values, client_name="Unknown Client", csv_path=None):
+    """Append one client's extracted values to the flat results CSV.
+
+    all_extracted_values: strings for the Form 1040 fields, ordered as
+        page 1 values followed by page 2 values; extra items are ignored and
+        missing trailing items leave their columns empty.
+    schedule1_values: strings for Schedule 1 lines 3, 5 and 8, in that order;
+        may be empty or shorter than three items.
+    client_name: written to the "Client Name" column.
+    csv_path: file to create or extend; defaults to
+        "./Clients_Output_Data_Form_1040.csv".
+
+    An empty-string value is stored as '0'. Columns that have no source value
+    (e.g. "Gross Comp - W2 Box 5") are left empty. A row that is identical to
+    one already in the file is not appended a second time.
+    """
+    if csv_path is None:
+        csv_path = "./Clients_Output_Data_Form_1040.csv"
+    # Two header levels, flattened below into single "Level 1 - Level 2" names.
+    header_level_1 = [
+        "Client Name","Gross Comp", "Taxable Wages", "Taxable Interest Income: Sch. B", "Tax- Exempt Interest",
+        "Qualified Dividends", "Ordinary Dividends", "Long Term Capital Gain or Loss",
+        "Other Adjustments (from Schedule 1)", "Business Income or Loss (Schedule C)",
+        "Rent/ Royalty (Schedule E)", "Other Income", "Standard Deduction", "Qualified Business Income Deduction",
+        "Taxable Income", "Tax", "", "", "Total Tax"
+    ]
+    header_level_2 = [
+        "","W2 Box 5", "Line 1", "Line 2b", "Line 2a", "Line 3a", "Line 3b", "Line 7",
+        "Line 10", "Schedule 1, Line 3", "Schedule 1, Line 5", "Schedule 1, Line 8",
+        "Line 12", "Line 13", "Line 15", "Line 16", "Line 20, Schedule 3", "Line 23, Schedule 2", "Line 24"
+    ]
+    # When one level is blank the other is used alone, without the separator.
+    flat_columns = [
+        " - ".join(part for part in (h1.strip(), h2.strip()) if part)
+        for h1, h2 in zip(header_level_1, header_level_2)
+    ]
+    df = None
+    if os.path.exists(csv_path):
+        print(f"Reading existing CSV file: {csv_path}")
+        try:
+            # Read every cell as text and keep blanks as "" so that earlier
+            # rows are written back exactly as stored; type inference would
+            # turn e.g. "10" into "10.0" once the column also has a blank.
+            df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
+        except pd.errors.EmptyDataError:
+            df = None  # zero-byte file: start over as if it did not exist
+        else:
+            print(f"CSV columns: {df.columns.tolist()}")
+            print(f"CSV rows before append: {len(df)}")
+    if df is None:
+        print("CSV does not exist. Creating new DataFrame.")
+        df = pd.DataFrame(columns=flat_columns)
+    # Start with every column empty, then fill in what was extracted.
+    new_row = dict.fromkeys(flat_columns)
+    new_row[flat_columns[0]] = client_name
+    # Target column for each Form 1040 value, by position in all_extracted_values.
+    form_1040_columns = [
+        "Taxable Wages - Line 1",
+        "Taxable Interest Income: Sch. B - Line 2b",
+        "Tax- Exempt Interest - Line 2a",
+        "Qualified Dividends - Line 3a",
+        "Ordinary Dividends - Line 3b",
+        "Long Term Capital Gain or Loss - Line 7",
+        "Other Adjustments (from Schedule 1) - Line 10",
+        "Standard Deduction - Line 12",
+        "Qualified Business Income Deduction - Line 13",
+        "Taxable Income - Line 15",
+        "Tax - Line 16",
+        "Line 20, Schedule 3",
+        "Line 23, Schedule 2",
+        "Total Tax - Line 24",
+    ]
+    # Target column for each Schedule 1 value, by position in schedule1_values.
+    schedule1_columns = [
+        "Business Income or Loss (Schedule C) - Schedule 1, Line 3",
+        "Rent/ Royalty (Schedule E) - Schedule 1, Line 5",
+        "Other Income - Schedule 1, Line 8",
+    ]
+    # zip() stops at the shorter sequence, so short inputs cannot raise.
+    for column, value in zip(form_1040_columns, all_extracted_values):
+        new_row[column] = value if value != '' else '0'
+    for column, value in zip(schedule1_columns, schedule1_values):
+        new_row[column] = value if value != '' else '0'
+
+    def as_text(value):
+        # Missing cells (None / NaN) and blanks compare equal.
+        return "" if pd.isna(value) else str(value)
+
+    # Skip the append only when an identical row is already stored. The
+    # comparison is done cell by cell on text, over the expected columns.
+    new_cells = [as_text(new_row[column]) for column in flat_columns]
+    stored = df.reindex(columns=flat_columns)
+    already_saved = any(
+        [as_text(cell) for cell in record] == new_cells
+        for record in stored.itertuples(index=False, name=None)
+    )
+    if not already_saved:
+        df = pd.concat([df, pd.DataFrame([new_row], columns=flat_columns)], ignore_index=True)
+    print(f"CSV rows after append: {len(df)}")
+    df.to_csv(csv_path, index=False)
+    print(f" Data saved to CSV: {csv_path}")
+# Gradio UI
+# The inputs map, in order, onto the parameters of extract_numeric_values.
+# The outputs map, in order, onto the (report text, CSV path, 1040 PDF path,
+# Schedule 1 PDF path) tuple that it returns on success.
+iface = gr.Interface(
+    fn=extract_numeric_values,
+    inputs=[
+        gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
+        gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
+        gr.Textbox(label="Client Name", placeholder="Enter client name"),
+        gr.Checkbox(label="Draw Bounding Boxes on Form 1040 and Schedule 1?")
+    ],
+    outputs=[
+        gr.Textbox(label="Extracted Numeric Values", lines=20),
+        # The downloadable file is the accumulated .csv, not an Excel workbook.
+        gr.File(label="Download CSV Output"),
+        gr.File(label="1040 Bounding Boxes PDF"),
+        gr.File(label="Schedule 1 Bounding Boxes PDF")
+    ],
+    title="Form 1040 & Schedule 1 Extractor",
+    description="Upload a 2-page 1040 Form and (optionally) Schedule 1 to extract numeric fields. Optionally draw bounding boxes."
+)
+# Start the web server; share=True also requests a public share link.
+iface.launch(share=True)