pierreguillou committed on
Commit
2fcabd1
·
verified ·
1 Parent(s): cb44bd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -28
app.py CHANGED
@@ -9,8 +9,10 @@ import pytesseract
9
  from pytesseract import Output
10
  import zipfile
11
  from pdf2image import convert_from_path
 
 
12
 
13
- # [Keep all the helper functions from the original code]
14
  def convert_to_rgb(image_path):
15
  img = Image.open(image_path)
16
  rgb_img = img.convert("RGB")
@@ -90,38 +92,111 @@ def save_extracted_text(blocks, page_number, output_folder):
90
  f.write(f"[PAGE {page_number}]\n")
91
  for block in blocks:
92
  f.write(block['text'] + "\n")
93
- f.write(f"[FIN DE PAGE {page_number}]\n\n")
94
  return text_file_path
95
 
96
- # Modified process_pdf function with better temp file handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def process_pdf(pdf_file):
98
- # Create unique temporary working directory
99
  temp_dir = os.path.join(os.getcwd(), "temp_processing")
100
  output_dir = os.path.join(temp_dir, 'output_images')
101
 
102
- # Clean up any existing temp directories
103
  if os.path.exists(temp_dir):
104
  shutil.rmtree(temp_dir)
105
-
106
  os.makedirs(output_dir, exist_ok=True)
107
 
108
  try:
109
- # Convert PDF to images
110
  images = convert_from_path(pdf_file.name)
111
-
112
- # Process each image
113
  annotated_images = []
114
  for i, img in enumerate(images):
115
- # Save temporary image
116
  temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
117
  img.save(temp_img_path)
118
-
119
- # Process the image
120
  blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
121
  annotated_images.append(annotated_image_path)
122
  save_extracted_text(blocks, i + 1, output_dir)
123
 
124
- # Create ZIP file of annotated images
125
  zip_path = os.path.join(temp_dir, "annotated_images.zip")
126
  with zipfile.ZipFile(zip_path, 'w') as zipf:
127
  for img_path in annotated_images:
@@ -130,22 +205,20 @@ def process_pdf(pdf_file):
130
  # Get the text file
131
  text_file_path = os.path.join(output_dir, 'extracted_text.txt')
132
 
133
- # Read the files into memory before cleanup
134
- with open(text_file_path, 'rb') as f:
135
- text_content = f.read()
136
- with open(zip_path, 'rb') as f:
137
- zip_content = f.read()
 
 
138
 
139
- return (text_file_path, zip_path)
140
 
141
  except Exception as e:
142
  raise gr.Error(f"Error processing PDF: {str(e)}")
143
 
144
- finally:
145
- # Clean up will be handled by Hugging Face Spaces
146
- pass
147
-
148
- # Create Gradio interface with theme and better styling
149
  css = """
150
  .gradio-container {
151
  font-family: 'IBM Plex Sans', sans-serif;
@@ -158,7 +231,6 @@ css = """
158
  }
159
  """
160
 
161
- # Create Gradio interface
162
  demo = gr.Interface(
163
  fn=process_pdf,
164
  inputs=[
@@ -170,15 +242,17 @@ demo = gr.Interface(
170
  ],
171
  outputs=[
172
  gr.File(label="Extracted Text (TXT)"),
173
- gr.File(label="Annotated Images (ZIP)")
 
174
  ],
175
- title="PDF Text Extraction and Annotation",
176
  description="""
177
  Upload a PDF document to:
178
  1. Extract text content
179
  2. Get annotated images showing detected text blocks
 
180
 
181
- Supports multiple pages and French language text.
182
  """,
183
  article="Created by [Your Name] - [Your GitHub/Profile Link]",
184
  css=css,
 
9
  from pytesseract import Output
10
  import zipfile
11
  from pdf2image import convert_from_path
12
+ import google.generativeai as genai
13
+ import json
14
 
15
+ # Helper Functions
16
  def convert_to_rgb(image_path):
17
  img = Image.open(image_path)
18
  rgb_img = img.convert("RGB")
 
92
  f.write(f"[PAGE {page_number}]\n")
93
  for block in blocks:
94
  f.write(block['text'] + "\n")
95
+ f.write("[FIN DE PAGE]\n\n")
96
  return text_file_path
97
 
98
+ # Gemini Functions
99
def initialize_gemini():
    """Configure the Gemini SDK and return a ready-to-use model.

    Reads the API key from the ``GEMINI_API_KEY`` environment variable.

    Returns:
        genai.GenerativeModel: a "gemini-1.5-pro" model configured for
        plain-text output.

    Raises:
        gr.Error: if the key is missing or the SDK fails to initialize.
    """
    try:
        # BUG FIX: os.environ is a mapping, not a callable —
        # os.environ("GEMINI_API_KEY") raised TypeError on every call.
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise KeyError("GEMINI_API_KEY environment variable is not set")
        genai.configure(api_key=api_key)
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }
        model = genai.GenerativeModel(
            model_name="gemini-1.5-pro",
            generation_config=generation_config,
        )
        return model
    except Exception as e:
        raise gr.Error(f"Error initializing Gemini: {str(e)}")
116
+
117
def create_prompt(extracted_text: str) -> str:
    """Build the French legal-analysis prompt sent to the Gemini model.

    The prompt embeds a JSON template of the fields to extract (court,
    docket number, order date, parties and their counsel) followed by the
    OCR text of the document, and instructs the model to answer with JSON
    only.

    Args:
        extracted_text: raw text extracted from the PDF.

    Returns:
        The fully assembled prompt string.
    """
    # Target schema: scalar fields default to "", list fields to [].
    schema = {
        "tribunal": "",
        "numero_rg": "",
        "date_ordonnance": "",
        "demandeurs": [],
        "defendeurs": [],
        "avocats_demandeurs": [],
        "avocats_defendeurs": []
    }

    return f"""Tu es un assistant juridique expert en analyse de documents judiciaires français.
Je vais te fournir le contenu d'un document judiciaire extrait d'un PDF.
Ta tâche est d'analyser ce texte et d'en extraire les informations suivantes de manière précise :

{json.dumps(schema, indent=2, ensure_ascii=False)}

Voici quelques règles à suivre :
- Si une information n'est pas présente dans le texte, indique "Non spécifié" pour cette catégorie.
- Pour les noms des parties (demandeurs et défendeurs, et leurs avocats), liste tous ceux que tu trouves
- Assure-toi de différencier correctement les demandeurs des défendeurs.
- Si tu n'es pas sûr d'une information, indique-le clairement.

Présente tes résultats sous forme de JSON, en utilisant les catégories mentionnées ci-dessus.

Voici le contenu du document :

{extracted_text.strip()}

Analyse ce texte et fournis-moi les informations demandées au format JSON uniquement.""".strip()
149
+
150
def _parse_json_response(raw_text: str) -> dict:
    """Extract and parse the JSON object from a model reply.

    Handles replies wrapped in Markdown code fences (```json ... ``` or
    plain ``` ... ```) as well as bare JSON.

    Args:
        raw_text: the model's response text.

    Returns:
        The parsed dict, or an error dict carrying the raw response when
        parsing fails.
    """
    json_str = raw_text.strip()
    # BUG FIX: the original split on the word "json", which also matched
    # "json" occurring anywhere in the reply prose. Strip Markdown fences
    # structurally instead.
    if "```" in json_str:
        json_str = json_str.split("```")[1]
        if json_str.lower().startswith("json"):
            json_str = json_str[4:]
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Narrow except (was a bare `except:` that swallowed everything,
        # including KeyboardInterrupt). Preserve the raw reply for debugging.
        return {"error": "Failed to parse JSON response", "raw_response": raw_text}


def extract_data_with_gemini(text_file_path: str) -> dict:
    """Run Gemini over the extracted-text file and return structured data.

    Args:
        text_file_path: path to the UTF-8 text file produced by OCR.

    Returns:
        dict of extracted fields, or an error dict if the model's reply
        was not valid JSON.

    Raises:
        gr.Error: on any failure reading the file or calling the model.
    """
    try:
        model = initialize_gemini()

        with open(text_file_path, 'r', encoding='utf-8') as f:
            extracted_text = f.read()

        prompt = create_prompt(extracted_text)
        response = model.generate_content(prompt)

        return _parse_json_response(response.text)
    except Exception as e:
        raise gr.Error(f"Error in Gemini processing: {str(e)}")
178
+
179
+ # Main Processing Function
180
  def process_pdf(pdf_file):
 
181
  temp_dir = os.path.join(os.getcwd(), "temp_processing")
182
  output_dir = os.path.join(temp_dir, 'output_images')
183
 
 
184
  if os.path.exists(temp_dir):
185
  shutil.rmtree(temp_dir)
 
186
  os.makedirs(output_dir, exist_ok=True)
187
 
188
  try:
189
+ # Convert PDF to images and process
190
  images = convert_from_path(pdf_file.name)
 
 
191
  annotated_images = []
192
  for i, img in enumerate(images):
 
193
  temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
194
  img.save(temp_img_path)
 
 
195
  blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
196
  annotated_images.append(annotated_image_path)
197
  save_extracted_text(blocks, i + 1, output_dir)
198
 
199
+ # Create ZIP file
200
  zip_path = os.path.join(temp_dir, "annotated_images.zip")
201
  with zipfile.ZipFile(zip_path, 'w') as zipf:
202
  for img_path in annotated_images:
 
205
  # Get the text file
206
  text_file_path = os.path.join(output_dir, 'extracted_text.txt')
207
 
208
+ # Process with Gemini
209
+ extracted_data = extract_data_with_gemini(text_file_path)
210
+
211
+ # Save extracted data to JSON file
212
+ json_path = os.path.join(temp_dir, "extracted_data.json")
213
+ with open(json_path, 'w', encoding='utf-8') as f:
214
+ json.dump(extracted_data, f, ensure_ascii=False, indent=2)
215
 
216
+ return text_file_path, zip_path, json_path
217
 
218
  except Exception as e:
219
  raise gr.Error(f"Error processing PDF: {str(e)}")
220
 
221
+ # Gradio Interface
 
 
 
 
222
  css = """
223
  .gradio-container {
224
  font-family: 'IBM Plex Sans', sans-serif;
 
231
  }
232
  """
233
 
 
234
  demo = gr.Interface(
235
  fn=process_pdf,
236
  inputs=[
 
242
  ],
243
  outputs=[
244
  gr.File(label="Extracted Text (TXT)"),
245
+ gr.File(label="Annotated Images (ZIP)"),
246
+ gr.File(label="Extracted Data (JSON)")
247
  ],
248
+ title="PDF Text Extraction and Analysis",
249
  description="""
250
  Upload a PDF document to:
251
  1. Extract text content
252
  2. Get annotated images showing detected text blocks
253
+ 3. Extract structured data using AI analysis
254
 
255
+ Supports multiple pages and French legal documents.
256
  """,
257
  article="Created by [Your Name] - [Your GitHub/Profile Link]",
258
  css=css,