Sandy2636 committed on
Commit 40edde0 · Parent: d1c0f9b

New Update

Files changed (2)
  1. app.py +272 -8
  2. requirements.txt +6 -2
app.py CHANGED
@@ -1,6 +1,6 @@
- import os
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress TensorFlow INFO and WARNING messages
- os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
  import gradio as gr
  import base64
  import requests
@@ -9,13 +9,30 @@ import re
  import os
  import uuid
  from datetime import datetime
  import time # For potential sleeps if needed, or timing

  # Attempt to import deepface and handle import error gracefully
  try:
      from deepface import DeepFace
-     from deepface.commons import functions as deepface_functions
      DEEPFACE_AVAILABLE = True
  except ImportError:
      DEEPFACE_AVAILABLE = False
      print("Warning: deepface library not found. Facial recognition features will be disabled.")
@@ -48,6 +65,99 @@ processed_files_data = []
  person_profiles = {}

  # --- Helper Functions ---
  def extract_json_from_text(text):
      if not text:
          return {"error": "Empty text provided for JSON extraction."}
@@ -343,7 +453,7 @@ def format_dataframe_data(current_files_data):
      df_rows = []
      for f_data in current_files_data:
          entities = f_data.get("entities") or {}
-         face_info = f_data.get("face_analysis_result", {})
          face_detected_status = "Y" if face_info.get("count", 0) > 0 else "N"
          if "error" in face_info : face_detected_status = "Error"
          elif "message" in face_info and "No face detected" in face_info["message"]: face_detected_status = "N"
@@ -388,7 +498,7 @@ def format_persons_markdown(current_persons_data, current_files_data):
      md_parts.append("\n---\n")
      return "\n".join(md_parts)

- def process_uploaded_files(files_list, progress=gr.Progress(track_tqdm=True)):
      global processed_files_data, person_profiles
      processed_files_data = []
      person_profiles = {}
@@ -485,6 +595,160 @@ def process_uploaded_files(files_list, progress=gr.Progress(track_tqdm=True)):
      final_persons_md = format_persons_markdown(person_profiles, processed_files_data)
      yield (final_df_data, final_persons_md, "{}", f"All {len(processed_files_data)} documents processed.")

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown("# 📄 Intelligent Document Processor & Classifier v2 (with Face ID)")
@@ -498,7 +762,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
      with gr.Row():
          with gr.Column(scale=1):
              files_input = gr.Files(label="Upload Document Images (Bulk)", file_count="multiple", type="filepath")
-             process_button = gr.Button("🚀 Process Uploaded Documents", variant="primary")
          with gr.Column(scale=2):
              overall_status_textbox = gr.Textbox(label="Current Task & Overall Progress", interactive=False, lines=2)
@@ -508,7 +772,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
      document_status_df = gr.Dataframe(
          headers=dataframe_headers, datatype=["str"] * len(dataframe_headers),
          label="Individual Document Status & Extracted Entities",
-         row_count=(1, "dynamic"), col_count=(len(dataframe_headers), "fixed"), wrap=True, height=400
      )

      with gr.Accordion("Selected Document Full OCR JSON", open=False):
 
+ # import os
+ # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress TensorFlow INFO and WARNING messages
+ # os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
  import gradio as gr
  import base64
  import requests
 
  import os
  import uuid
  from datetime import datetime
+ import tempfile # ✅ Add this
+ import shutil
  import time # For potential sleeps if needed, or timing

  # Attempt to import deepface and handle import error gracefully
+ try:
+     import fitz # PyMuPDF
+     PYMUPDF_AVAILABLE = True
+ except ImportError:
+     PYMUPDF_AVAILABLE = False
+     print("Warning: PyMuPDF not found. PDF processing will be disabled.")
+
+ try:
+     import docx
+     from PIL import Image, ImageDraw, ImageFont
+     DOCX_AVAILABLE = True
+ except ImportError:
+     DOCX_AVAILABLE = False
+     print("Warning: python-docx or Pillow not found. DOCX processing will be disabled.")
  try:
      from deepface import DeepFace
+     # from deepface.commons import functions as deepface_functions
      DEEPFACE_AVAILABLE = True
+     print(f"Got DeepFace")
  except ImportError:
      DEEPFACE_AVAILABLE = False
      print("Warning: deepface library not found. Facial recognition features will be disabled.")
 
  person_profiles = {}

  # --- Helper Functions ---
+
+ def render_text_to_image(text, output_path):
+     """Renders a string of text onto a new image file."""
+     if not DOCX_AVAILABLE:
+         raise ImportError("Pillow or python-docx is not installed.")
+
+     try:
+         # Use a built-in font if available, otherwise this might fail on minimal OS
+         font = ImageFont.truetype("DejaVuSans.ttf", 15)
+     except IOError:
+         print("Default font not found, using basic PIL font.")
+         font = ImageFont.load_default()
+
+     padding = 20
+     image_width = 800
+
+     # Simple text wrapping
+     lines = []
+     for paragraph in text.split('\n'):
+         words = paragraph.split()
+         line = ""
+         for word in words:
+             # Use getbbox for more accurate width calculation if available (Pillow >= 9.2.0)
+             if hasattr(font, 'getbbox'):
+                 box = font.getbbox(line + word)
+                 line_width = box[2] - box[0]
+             else: # Fallback for older Pillow
+                 line_width = font.getsize(line + word)[0]
+
+             if line_width <= image_width - 2 * padding:
+                 line += word + " "
+             else:
+                 lines.append(line.strip())
+                 line = word + " "
+         lines.append(line.strip())
+
+     # Calculate image height
+     _, top, _, bottom = font.getbbox("A")
+     line_height = bottom - top + 5 # Add some line spacing
+     image_height = len(lines) * line_height + 2 * padding
+
+     img = Image.new('RGB', (image_width, int(image_height)), color='white')
+     draw = ImageDraw.Draw(img)
+
+     y = padding
+     for line in lines:
+         draw.text((padding, y), line, font=font, fill='black')
+         y += line_height
+
+     img.save(output_path, format='PNG')
+
+
+ def convert_file_to_images(original_filepath, temp_output_dir):
+     """
+     Converts an uploaded file (PDF, DOCX) into one or more images.
+     If the file is already an image, it returns its own path.
+     Returns a list of dictionaries, each with 'path' and 'page' keys.
+     """
+     filename_lower = original_filepath.lower()
+     output_paths = []
+
+     if filename_lower.endswith('.pdf'):
+         if not PYMUPDF_AVAILABLE:
+             raise RuntimeError("PDF processing is disabled (PyMuPDF not installed).")
+         doc = fitz.open(original_filepath)
+         for i, page in enumerate(doc):
+             pix = page.get_pixmap(dpi=200) # Render page to image
+             output_filepath = os.path.join(temp_output_dir, f"{os.path.basename(original_filepath)}_page_{i+1}.png")
+             pix.save(output_filepath)
+             output_paths.append({"path": output_filepath, "page": i + 1})
+         doc.close()
+
+     elif filename_lower.endswith('.docx'):
+         if not DOCX_AVAILABLE:
+             raise RuntimeError("DOCX processing is disabled (python-docx or Pillow not installed).")
+         doc = docx.Document(original_filepath)
+         full_text = "\n".join([para.text for para in doc.paragraphs])
+         if not full_text.strip():
+             full_text = "--- Document is empty or contains only images/tables ---"
+         output_filepath = os.path.join(temp_output_dir, f"{os.path.basename(original_filepath)}.png")
+         render_text_to_image(full_text, output_filepath)
+         output_paths.append({"path": output_filepath, "page": 1})
+
+     elif filename_lower.endswith(('.png', '.jpg', '.jpeg', '.webp', '.bmp', '.tiff')):
+         # File is already an image, just return its path
+         output_paths.append({"path": original_filepath, "page": 1})
+
+     else:
+         raise TypeError(f"Unsupported file type: {os.path.basename(original_filepath)}")
+
+     return output_paths
+
+
  def extract_json_from_text(text):
      if not text:
          return {"error": "Empty text provided for JSON extraction."}
 
      df_rows = []
      for f_data in current_files_data:
          entities = f_data.get("entities") or {}
+         face_info = f_data.get("face_analysis_result", {}) or {}
          face_detected_status = "Y" if face_info.get("count", 0) > 0 else "N"
          if "error" in face_info : face_detected_status = "Error"
          elif "message" in face_info and "No face detected" in face_info["message"]: face_detected_status = "N"
 
      md_parts.append("\n---\n")
      return "\n".join(md_parts)

+ def process_uploaded_files_old(files_list, progress=gr.Progress(track_tqdm=True)):
      global processed_files_data, person_profiles
      processed_files_data = []
      person_profiles = {}
 
      final_persons_md = format_persons_markdown(person_profiles, processed_files_data)
      yield (final_df_data, final_persons_md, "{}", f"All {len(processed_files_data)} documents processed.")

+ def process_uploaded_files(files_list, progress=gr.Progress(track_tqdm=True)):
+     global processed_files_data, person_profiles
+     processed_files_data = []
+     person_profiles = {}
+     temp_dir = tempfile.mkdtemp() # Create a temporary directory for converted images
+
+     empty_df_row = [["N/A"] * 11] # Match number of headers
+     if not OPENROUTER_API_KEY:
+         yield (empty_df_row, "API Key Missing.", "{}", "Error: API Key not set.")
+         shutil.rmtree(temp_dir)
+         return
+     if not files_list:
+         yield ([], "No files uploaded.", "{}", "Upload files to begin.")
+         shutil.rmtree(temp_dir)
+         return
+
+     # --- Stage 1: Pre-process files into a job queue of images ---
+     job_queue = []
+     for original_file_obj in progress.tqdm(files_list, desc="Pre-processing Files"):
+         try:
+             image_page_list = convert_file_to_images(original_file_obj.name, temp_dir)
+             total_pages = len(image_page_list)
+             for item in image_page_list:
+                 job_queue.append({
+                     "original_filename": os.path.basename(original_file_obj.name),
+                     "page_number": item["page"],
+                     "total_pages": total_pages,
+                     "image_path": item["path"]
+                 })
+         except Exception as e:
+             job_queue.append({"original_filename": os.path.basename(original_file_obj.name), "error": str(e)})
+
+     for job in job_queue:
+         if "error" in job:
+             processed_files_data.append({
+                 "doc_id": str(uuid.uuid4()),
+                 "original_filename": job["original_filename"],
+                 "page_number": 1,
+                 "status": f"Error: {job['error']}"
+             })
+         else:
+             processed_files_data.append({
+                 "doc_id": str(uuid.uuid4()),
+                 "original_filename": job["original_filename"],
+                 "page_number": job["page_number"],
+                 "total_pages": job["total_pages"],
+                 "filepath": job["image_path"],
+                 "status": "Queued",
+                 "ocr_json": None,
+                 "entities": None,
+                 "face_analysis_result": None,
+                 "facial_embeddings": None,
+                 "assigned_person_key": None,
+                 "linking_method": ""
+             })
+
+     initial_df_data = format_dataframe_data(processed_files_data)
+     initial_persons_md = format_persons_markdown(person_profiles, processed_files_data)
+     yield (initial_df_data, initial_persons_md, "{}", f"Pre-processing complete. Analyzing {len(processed_files_data)} pages.")
+
+     # --- Stage 2: Analyze each page ---
+     current_ocr_json_display = "{}"
+     for i, file_data_item in enumerate(progress.tqdm(processed_files_data, desc="Analyzing Pages")):
+         if file_data_item["status"].startswith("Error"):
+             continue
+
+         current_filename = f"{file_data_item['original_filename']} (p.{file_data_item['page_number']})"
+         linking_method_log_for_doc = []
+
+         # 1. OCR
+         file_data_item["status"] = "OCR..."
+         persons_md = format_persons_markdown(person_profiles, processed_files_data)
+         df_data = format_dataframe_data(processed_files_data)
+         yield (df_data, persons_md, current_ocr_json_display, f"OCR: {current_filename}")
+
+         ocr_result = call_openrouter_ocr(file_data_item["filepath"])
+         file_data_item["ocr_json"] = ocr_result
+         current_ocr_json_display = json.dumps(ocr_result, indent=2)
+
+         if "error" in ocr_result:
+             file_data_item["status"] = f"OCR Err: {str(ocr_result['error'])[:30]}.."
+             linking_method_log_for_doc.append("OCR Failed.")
+             file_data_item["linking_method"] = " ".join(linking_method_log_for_doc)
+             persons_md = format_persons_markdown(person_profiles, processed_files_data)
+             df_data = format_dataframe_data(processed_files_data)
+             yield (df_data, persons_md, current_ocr_json_display, f"OCR Err: {current_filename}")
+             continue
+
+         # 2. Entity Extraction
+         file_data_item["status"] = "OCR OK. Entities..."
+         persons_md = format_persons_markdown(person_profiles, processed_files_data)
+         df_data = format_dataframe_data(processed_files_data)
+         yield (df_data, persons_md, current_ocr_json_display, f"Entities: {current_filename}")
+         entities = extract_entities_from_ocr(ocr_result)
+         file_data_item["entities"] = entities
+
+         # 3. Facial Feature Extraction
+         file_data_item["status"] = "Entities OK. Face..."
+         persons_md = format_persons_markdown(person_profiles, processed_files_data)
+         df_data = format_dataframe_data(processed_files_data)
+         yield (df_data, persons_md, current_ocr_json_display, f"Face Detect: {current_filename}")
+         doc_type_lower = (entities.get("doc_type") or "").lower()
+
+         if DEEPFACE_AVAILABLE and (
+             "photo" in doc_type_lower or
+             "passport" in doc_type_lower or
+             "id" in doc_type_lower or
+             "selfie" in doc_type_lower or
+             not doc_type_lower
+         ):
+             face_result = get_facial_embeddings_with_deepface(file_data_item["filepath"])
+             file_data_item["face_analysis_result"] = face_result
+             if "embeddings" in face_result and face_result["embeddings"]:
+                 file_data_item["facial_embeddings"] = face_result["embeddings"]
+                 linking_method_log_for_doc.append(f"{face_result.get('count', 0)} face(s).")
+             elif "error" in face_result:
+                 linking_method_log_for_doc.append("Face Ext. Error.")
+             else:
+                 linking_method_log_for_doc.append("No face det.")
+         else:
+             linking_method_log_for_doc.append("Face Ext. Skipped.")
+
+         file_data_item["status"] = "Face Done. Classify..."
+         persons_md = format_persons_markdown(person_profiles, processed_files_data)
+         df_data = format_dataframe_data(processed_files_data)
+         yield (df_data, persons_md, current_ocr_json_display, f"Classifying: {current_filename}")
+
+         # 4. Person Classification
+         person_key = get_person_id_and_update_profiles(
+             file_data_item["doc_id"],
+             entities,
+             file_data_item.get("facial_embeddings"),
+             person_profiles,
+             linking_method_log_for_doc
+         )
+         file_data_item["assigned_person_key"] = person_key
+         file_data_item["status"] = "Classified"
+         file_data_item["linking_method"] = " ".join(linking_method_log_for_doc)
+
+         persons_md = format_persons_markdown(person_profiles, processed_files_data)
+         df_data = format_dataframe_data(processed_files_data)
+         yield (df_data, persons_md, current_ocr_json_display, f"Done: {current_filename} -> {person_key}")
+
+     # Final Result
+     final_df_data = format_dataframe_data(processed_files_data)
+     final_persons_md = format_persons_markdown(person_profiles, processed_files_data)
+     yield (final_df_data, final_persons_md, "{}", f"All {len(processed_files_data)} pages analyzed.")
+
+     # Cleanup
+     try:
+         shutil.rmtree(temp_dir)
+         print(f"Cleaned up temporary directory: {temp_dir}")
+     except Exception as e:
+         print(f"Error cleaning up temporary directory {temp_dir}: {e}")

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown("# 📄 Intelligent Document Processor & Classifier v2 (with Face ID)")
 
      with gr.Row():
          with gr.Column(scale=1):
              files_input = gr.Files(label="Upload Document Images (Bulk)", file_count="multiple", type="filepath")
+             process_button = gr.Button("Process Uploaded Documents", variant="primary")
          with gr.Column(scale=2):
              overall_status_textbox = gr.Textbox(label="Current Task & Overall Progress", interactive=False, lines=2)
 
      document_status_df = gr.Dataframe(
          headers=dataframe_headers, datatype=["str"] * len(dataframe_headers),
          label="Individual Document Status & Extracted Entities",
+         row_count=(1, "dynamic"), col_count=(len(dataframe_headers), "fixed"), wrap=True
      )

      with gr.Accordion("Selected Document Full OCR JSON", open=False):
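The new convert_file_to_images helper is what feeds Stage 1 of the rewritten process_uploaded_files: PDFs are rasterised page by page with PyMuPDF, DOCX files are flattened to plain text and rendered onto a PNG via render_text_to_image, and ordinary images pass through unchanged. A minimal standalone sketch of the same PDF path, outside the Gradio app (it assumes a reasonably recent PyMuPDF for the dpi= keyword, and "sample.pdf" is only a placeholder filename):

# Standalone sketch of the PDF -> page-image conversion used by convert_file_to_images.
import os
import tempfile

import fitz  # PyMuPDF

def pdf_to_page_images(pdf_path, out_dir, dpi=200):
    """Render every page of pdf_path to a PNG in out_dir; return one dict per page."""
    pages = []
    doc = fitz.open(pdf_path)
    try:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=dpi)  # rasterise the page at the requested resolution
            out_path = os.path.join(out_dir, f"{os.path.basename(pdf_path)}_page_{i + 1}.png")
            pix.save(out_path)
            pages.append({"path": out_path, "page": i + 1})
    finally:
        doc.close()
    return pages

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp_dir:
        for item in pdf_to_page_images("sample.pdf", tmp_dir):  # placeholder input file
            print(item["page"], item["path"])

Unlike this sketch, the app keeps the rendered pages in a directory created with tempfile.mkdtemp() so they survive across the generator's yields, and only removes it with shutil.rmtree() after the final yield.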
requirements.txt CHANGED
@@ -2,6 +2,10 @@ gradio>=4.0.0
  requests>=2.25.0
  Pillow>=9.0.0
  deepface>=0.0.79
- tensorflow-cpu>=2.13.0,<2.15.0 # Loosen to a broader range if needed
  opencv-python-headless>=4.5.0
- retina-face>=0.0.12
 
  requests>=2.25.0
  Pillow>=9.0.0
  deepface>=0.0.79
+ tensorflow>=2.10.0 # Or tensorflow-cpu if GPU is not available/needed
  opencv-python-headless>=4.5.0
+ # retina-face Pypi package for the detector if deepface doesn't pull it correctly
+ retina-face>=0.0.12
+ tf-keras
+ PyMuPDF
+ python-docx
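Since app.py guards every new dependency behind a try/except import, a fresh environment can be checked by simply mirroring those imports. A minimal sketch (module names are taken from the imports in the app.py diff above; tf-keras is left out because it is not imported directly by the app):

# Sanity check that the optional dependencies from requirements.txt are importable.
import importlib

modules = {
    "fitz": "PyMuPDF (PDF rasterisation)",
    "docx": "python-docx (DOCX text extraction)",
    "PIL": "Pillow (text-to-image rendering)",
    "cv2": "opencv-python-headless",
    "deepface": "deepface (facial embeddings)",
}

for module_name, purpose in modules.items():
    try:
        importlib.import_module(module_name)
        print(f"OK       {module_name:<10} {purpose}")
    except ImportError as exc:
        print(f"MISSING  {module_name:<10} {purpose} -> {exc}")

The tf-keras entry is presumably there because recent TensorFlow releases ship Keras 3 while deepface still expects the legacy Keras API, which also fits the switch from the tightly pinned tensorflow-cpu range to the looser tensorflow>=2.10.0 line.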