Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,7 @@ import sys
|
|
13 |
import traceback
|
14 |
import io
|
15 |
import os
|
|
|
16 |
|
17 |
class PDFAnalyzer:
|
18 |
def __init__(self, file_path: str):
|
@@ -32,11 +33,11 @@ class PDFAnalyzer:
|
|
32 |
|
33 |
def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
    """Report, for each search term, whether it occurs in the text.

    The check is an exact (case-sensitive) substring test against
    ``self.full_text``.

    Args:
        search_terms: Terms to look for.

    Returns:
        A mapping from each term to ``True`` if it is a substring of
        ``self.full_text`` and ``False`` otherwise.
    """
    presence = {}
    for wanted in search_terms:
        presence[wanted] = wanted in self.full_text
    return presence
|
36 |
|
37 |
def label_authors(self) -> str:
|
38 |
"""Label authors in the text with 'Authors:' if not already labeled."""
|
39 |
-
author_line_regex = r"^(?:.*\n)(.*?)(?:\n\
|
40 |
match = re.search(author_line_regex, self.full_text, re.MULTILINE)
|
41 |
if match:
|
42 |
authors = match.group(1).strip()
|
@@ -314,49 +315,62 @@ class PDFAnalyzer:
|
|
314 |
|
315 |
def analyze_pdf(file):
|
316 |
try:
|
317 |
-
#
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
|
344 |
except Exception as e:
|
345 |
error_message = {
|
346 |
"error": str(e),
|
347 |
"traceback": traceback.format_exc()
|
348 |
}
|
349 |
-
return json.dumps(error_message, indent=2)
|
350 |
-
finally:
|
351 |
-
# Clean up the temporary file
|
352 |
-
if os.path.exists(temp_path):
|
353 |
-
os.remove(temp_path)
|
354 |
|
355 |
# Create Gradio interface
|
356 |
iface = gr.Interface(
|
357 |
fn=analyze_pdf,
|
358 |
inputs=gr.File(label="Upload PDF"),
|
359 |
-
outputs=
|
|
|
|
|
|
|
360 |
title="PDF Analyzer",
|
361 |
description="Upload a PDF document to analyze its structure, references, language, and more.",
|
362 |
)
|
|
|
13 |
import traceback
|
14 |
import io
|
15 |
import os
|
16 |
+
import tempfile
|
17 |
|
18 |
class PDFAnalyzer:
|
19 |
def __init__(self, file_path: str):
|
|
|
33 |
|
34 |
def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
    """Checks for the presence of required terms in the text, ignoring case.

    Args:
        search_terms: Terms to look for.

    Returns:
        A mapping from each term (as given) to ``True`` if its lower-cased
        form is a substring of the lower-cased ``self.full_text``.
    """
    # Lower-case the document once instead of once per search term.
    haystack = self.full_text.lower()
    return {term: term.lower() in haystack for term in search_terms}
|
37 |
|
38 |
def label_authors(self) -> str:
|
39 |
"""Label authors in the text with 'Authors:' if not already labeled."""
|
40 |
+
author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
|
41 |
match = re.search(author_line_regex, self.full_text, re.MULTILINE)
|
42 |
if match:
|
43 |
authors = match.group(1).strip()
|
|
|
315 |
|
316 |
def _read_upload_bytes(file):
    """Return the raw bytes of an upload, whichever form it arrives in.

    Accepts raw bytes, a filesystem path (``str`` / ``os.PathLike``), an
    object exposing the upload's path as ``.name``, or a readable file-like
    object.

    Raises:
        FileNotFoundError: If a path was given but no such file exists.
        TypeError: If the upload is in none of the supported forms.
    """
    if isinstance(file, (bytes, bytearray)):
        return bytes(file)

    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
        if not os.path.isfile(source_path):
            raise FileNotFoundError(f"Uploaded file not found: {source_path}")
    else:
        source_path = getattr(file, "name", None)

    # Prefer the on-disk path over the handle: depending on the Gradio
    # version the handle may be a wrapper whose own stream is not the
    # uploaded content, while ``.name`` points at the real file.
    if isinstance(source_path, (str, bytes)) and os.path.isfile(source_path):
        with open(source_path, "rb") as src:
            return src.read()

    if hasattr(file, "read"):
        data = file.read()
        return data.encode("utf-8") if isinstance(data, str) else data

    raise TypeError(f"Unsupported upload type: {type(file).__name__}")


def analyze_pdf(file):
    """Analyze an uploaded PDF and return the outputs for the Gradio UI.

    Args:
        file: The upload as delivered by the ``gr.File`` input. It may be a
            path string, an object with a ``.name`` path, a readable
            file-like object, raw bytes, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(results_json, annotated_pdf_path)`` tuple. ``results_json`` is a
        JSON string holding the analysis results (every expected section is
        present, empty if the analyzer did not produce it) or, on failure, an
        ``error`` message with its ``traceback``. ``annotated_pdf_path`` is
        the path of a copy of the annotated PDF, or ``None`` if none was
        produced or an error occurred. A path is returned rather than raw
        bytes because the ``gr.File`` output component expects a file path.
    """
    try:
        if file is None:
            raise ValueError("No PDF file was uploaded.")

        # Create a temporary directory to store files
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save the uploaded file temporarily
            temp_path = os.path.join(temp_dir, "uploaded.pdf")
            with open(temp_path, "wb") as f:
                f.write(_read_upload_bytes(file))

            analyzer = PDFAnalyzer(temp_path)
            results = analyzer.analyze()

            # Ensure all keys are present in the results, even if they're empty
            default_results = {
                "annotated_pdf_path": "",
                "metadata": {},
                "disclosures": {},
                "figures_and_tables": {},
                "figure_order": {},
                "references": {},
                "reference_order": {},
                "reference_style": {},
                "structure": {},
                "language": {},
            }

            # Update default_results with actual results
            default_results.update(results)

            # The annotated PDF is returned separately, so take its path out
            # of the JSON payload.
            annotated_pdf_path = default_results.pop("annotated_pdf_path", "")

            annotated_output_path = None
            if annotated_pdf_path and os.path.exists(annotated_pdf_path):
                with open(annotated_pdf_path, "rb") as f:
                    annotated_pdf_bytes = f.read()
                # Copy the annotated PDF to a file that outlives temp_dir
                # (the analyzer may have written it there). The copy is
                # intentionally not deleted here: the UI reads it after this
                # function returns.
                fd, annotated_output_path = tempfile.mkstemp(
                    prefix="annotated_", suffix=".pdf"
                )
                with os.fdopen(fd, "wb") as out:
                    out.write(annotated_pdf_bytes)

            return (
                json.dumps(default_results, indent=2, default=str),
                annotated_output_path,
            )

    except Exception as e:
        # Top-level UI boundary: report the failure in the JSON output
        # instead of letting the interface crash.
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return json.dumps(error_message, indent=2), None
|
|
|
|
|
|
|
|
|
365 |
|
366 |
# Create Gradio interface
|
367 |
iface = gr.Interface(
    # Page heading and blurb shown above the form.
    title="PDF Analyzer",
    description="Upload a PDF document to analyze its structure, references, language, and more.",
    # analyze_pdf takes the single upload and returns two values, which are
    # routed to the two output components in order.
    fn=analyze_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.JSON(label="Analysis Results"),
        gr.File(label="Annotated PDF"),
    ],
)
|