pierreguillou committed on
Commit
2fcabd1
·
verified ·
1 Parent(s): cb44bd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -28
app.py CHANGED
@@ -9,8 +9,10 @@ import pytesseract
9
  from pytesseract import Output
10
  import zipfile
11
  from pdf2image import convert_from_path
 
 
12
 
13
- # [Keep all the helper functions from the original code]
14
  def convert_to_rgb(image_path):
15
  img = Image.open(image_path)
16
  rgb_img = img.convert("RGB")
@@ -90,38 +92,111 @@ def save_extracted_text(blocks, page_number, output_folder):
90
  f.write(f"[PAGE {page_number}]\n")
91
  for block in blocks:
92
  f.write(block['text'] + "\n")
93
- f.write(f"[FIN DE PAGE {page_number}]\n\n")
94
  return text_file_path
95
 
96
- # Modified process_pdf function with better temp file handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def process_pdf(pdf_file):
98
- # Create unique temporary working directory
99
  temp_dir = os.path.join(os.getcwd(), "temp_processing")
100
  output_dir = os.path.join(temp_dir, 'output_images')
101
 
102
- # Clean up any existing temp directories
103
  if os.path.exists(temp_dir):
104
  shutil.rmtree(temp_dir)
105
-
106
  os.makedirs(output_dir, exist_ok=True)
107
 
108
  try:
109
- # Convert PDF to images
110
  images = convert_from_path(pdf_file.name)
111
-
112
- # Process each image
113
  annotated_images = []
114
  for i, img in enumerate(images):
115
- # Save temporary image
116
  temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
117
  img.save(temp_img_path)
118
-
119
- # Process the image
120
  blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
121
  annotated_images.append(annotated_image_path)
122
  save_extracted_text(blocks, i + 1, output_dir)
123
 
124
- # Create ZIP file of annotated images
125
  zip_path = os.path.join(temp_dir, "annotated_images.zip")
126
  with zipfile.ZipFile(zip_path, 'w') as zipf:
127
  for img_path in annotated_images:
@@ -130,22 +205,20 @@ def process_pdf(pdf_file):
130
  # Get the text file
131
  text_file_path = os.path.join(output_dir, 'extracted_text.txt')
132
 
133
- # Read the files into memory before cleanup
134
- with open(text_file_path, 'rb') as f:
135
- text_content = f.read()
136
- with open(zip_path, 'rb') as f:
137
- zip_content = f.read()
 
 
138
 
139
- return (text_file_path, zip_path)
140
 
141
  except Exception as e:
142
  raise gr.Error(f"Error processing PDF: {str(e)}")
143
 
144
- finally:
145
- # Clean up will be handled by Hugging Face Spaces
146
- pass
147
-
148
- # Create Gradio interface with theme and better styling
149
  css = """
150
  .gradio-container {
151
  font-family: 'IBM Plex Sans', sans-serif;
@@ -158,7 +231,6 @@ css = """
158
  }
159
  """
160
 
161
- # Create Gradio interface
162
  demo = gr.Interface(
163
  fn=process_pdf,
164
  inputs=[
@@ -170,15 +242,17 @@ demo = gr.Interface(
170
  ],
171
  outputs=[
172
  gr.File(label="Extracted Text (TXT)"),
173
- gr.File(label="Annotated Images (ZIP)")
 
174
  ],
175
- title="PDF Text Extraction and Annotation",
176
  description="""
177
  Upload a PDF document to:
178
  1. Extract text content
179
  2. Get annotated images showing detected text blocks
 
180
 
181
- Supports multiple pages and French language text.
182
  """,
183
  article="Created by [Your Name] - [Your GitHub/Profile Link]",
184
  css=css,
 
9
  from pytesseract import Output
10
  import zipfile
11
  from pdf2image import convert_from_path
12
+ import google.generativeai as genai
13
+ import json
14
 
15
+ # Helper Functions
16
  def convert_to_rgb(image_path):
17
  img = Image.open(image_path)
18
  rgb_img = img.convert("RGB")
 
92
  f.write(f"[PAGE {page_number}]\n")
93
  for block in blocks:
94
  f.write(block['text'] + "\n")
95
+ f.write("[FIN DE PAGE]\n\n")
96
  return text_file_path
97
 
98
+ # Gemini Functions
99
def initialize_gemini():
    """Configure the Gemini SDK and return a ready-to-use model.

    Reads the API key from the ``GEMINI_API_KEY`` environment variable.

    Returns:
        genai.GenerativeModel: a "gemini-1.5-pro" model configured for
        plain-text output.

    Raises:
        gr.Error: if the key is missing or the SDK fails to initialize.
    """
    try:
        # BUG FIX: os.environ is a mapping, not a callable —
        # os.environ("GEMINI_API_KEY") raised TypeError on every call.
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise KeyError("GEMINI_API_KEY environment variable is not set")
        genai.configure(api_key=api_key)
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }
        model = genai.GenerativeModel(
            model_name="gemini-1.5-pro",
            generation_config=generation_config,
        )
        return model
    except Exception as e:
        raise gr.Error(f"Error initializing Gemini: {str(e)}")
116
+
117
def create_prompt(extracted_text: str) -> str:
    """Build the French legal-analysis prompt sent to the Gemini model.

    The prompt embeds a JSON template of the fields to extract (court,
    docket number, order date, parties and their counsel) followed by the
    OCR text of the document, and instructs the model to answer with JSON
    only.

    Args:
        extracted_text: raw text extracted from the PDF.

    Returns:
        The fully assembled prompt string.
    """
    # Target schema: scalar fields default to "", list fields to [].
    schema = {
        "tribunal": "",
        "numero_rg": "",
        "date_ordonnance": "",
        "demandeurs": [],
        "defendeurs": [],
        "avocats_demandeurs": [],
        "avocats_defendeurs": []
    }

    return f"""Tu es un assistant juridique expert en analyse de documents judiciaires français.
Je vais te fournir le contenu d'un document judiciaire extrait d'un PDF.
Ta tâche est d'analyser ce texte et d'en extraire les informations suivantes de manière précise :

{json.dumps(schema, indent=2, ensure_ascii=False)}

Voici quelques règles à suivre :
- Si une information n'est pas présente dans le texte, indique "Non spécifié" pour cette catégorie.
- Pour les noms des parties (demandeurs et défendeurs, et leurs avocats), liste tous ceux que tu trouves
- Assure-toi de différencier correctement les demandeurs des défendeurs.
- Si tu n'es pas sûr d'une information, indique-le clairement.

Présente tes résultats sous forme de JSON, en utilisant les catégories mentionnées ci-dessus.

Voici le contenu du document :

{extracted_text.strip()}

Analyse ce texte et fournis-moi les informations demandées au format JSON uniquement.""".strip()
149
+
150
def _parse_json_response(raw_text: str) -> dict:
    """Extract and parse the JSON object from a model reply.

    Handles replies wrapped in Markdown code fences (```json ... ``` or
    plain ``` ... ```) as well as bare JSON.

    Args:
        raw_text: the model's response text.

    Returns:
        The parsed dict, or an error dict carrying the raw response when
        parsing fails.
    """
    json_str = raw_text.strip()
    # BUG FIX: the original split on the word "json", which also matched
    # "json" occurring anywhere in the reply prose. Strip Markdown fences
    # structurally instead.
    if "```" in json_str:
        json_str = json_str.split("```")[1]
        if json_str.lower().startswith("json"):
            json_str = json_str[4:]
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Narrow except (was a bare `except:` that swallowed everything,
        # including KeyboardInterrupt). Preserve the raw reply for debugging.
        return {"error": "Failed to parse JSON response", "raw_response": raw_text}


def extract_data_with_gemini(text_file_path: str) -> dict:
    """Run Gemini over the extracted-text file and return structured data.

    Args:
        text_file_path: path to the UTF-8 text file produced by OCR.

    Returns:
        dict of extracted fields, or an error dict if the model's reply
        was not valid JSON.

    Raises:
        gr.Error: on any failure reading the file or calling the model.
    """
    try:
        model = initialize_gemini()

        with open(text_file_path, 'r', encoding='utf-8') as f:
            extracted_text = f.read()

        prompt = create_prompt(extracted_text)
        response = model.generate_content(prompt)

        return _parse_json_response(response.text)
    except Exception as e:
        raise gr.Error(f"Error in Gemini processing: {str(e)}")
178
+
179
+ # Main Processing Function
180
  def process_pdf(pdf_file):
 
181
  temp_dir = os.path.join(os.getcwd(), "temp_processing")
182
  output_dir = os.path.join(temp_dir, 'output_images')
183
 
 
184
  if os.path.exists(temp_dir):
185
  shutil.rmtree(temp_dir)
 
186
  os.makedirs(output_dir, exist_ok=True)
187
 
188
  try:
189
+ # Convert PDF to images and process
190
  images = convert_from_path(pdf_file.name)
 
 
191
  annotated_images = []
192
  for i, img in enumerate(images):
 
193
  temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
194
  img.save(temp_img_path)
 
 
195
  blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
196
  annotated_images.append(annotated_image_path)
197
  save_extracted_text(blocks, i + 1, output_dir)
198
 
199
+ # Create ZIP file
200
  zip_path = os.path.join(temp_dir, "annotated_images.zip")
201
  with zipfile.ZipFile(zip_path, 'w') as zipf:
202
  for img_path in annotated_images:
 
205
  # Get the text file
206
  text_file_path = os.path.join(output_dir, 'extracted_text.txt')
207
 
208
+ # Process with Gemini
209
+ extracted_data = extract_data_with_gemini(text_file_path)
210
+
211
+ # Save extracted data to JSON file
212
+ json_path = os.path.join(temp_dir, "extracted_data.json")
213
+ with open(json_path, 'w', encoding='utf-8') as f:
214
+ json.dump(extracted_data, f, ensure_ascii=False, indent=2)
215
 
216
+ return text_file_path, zip_path, json_path
217
 
218
  except Exception as e:
219
  raise gr.Error(f"Error processing PDF: {str(e)}")
220
 
221
+ # Gradio Interface
 
 
 
 
222
  css = """
223
  .gradio-container {
224
  font-family: 'IBM Plex Sans', sans-serif;
 
231
  }
232
  """
233
 
 
234
  demo = gr.Interface(
235
  fn=process_pdf,
236
  inputs=[
 
242
  ],
243
  outputs=[
244
  gr.File(label="Extracted Text (TXT)"),
245
+ gr.File(label="Annotated Images (ZIP)"),
246
+ gr.File(label="Extracted Data (JSON)")
247
  ],
248
+ title="PDF Text Extraction and Analysis",
249
  description="""
250
  Upload a PDF document to:
251
  1. Extract text content
252
  2. Get annotated images showing detected text blocks
253
+ 3. Extract structured data using AI analysis
254
 
255
+ Supports multiple pages and French legal documents.
256
  """,
257
  article="Created by [Your Name] - [Your GitHub/Profile Link]",
258
  css=css,