samyak152002 committed on
Commit
3700c3a
·
verified ·
1 Parent(s): 6ecdc78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -34
app.py CHANGED
@@ -13,6 +13,7 @@ import sys
13
  import traceback
14
  import io
15
  import os
 
16
 
17
  class PDFAnalyzer:
18
  def __init__(self, file_path: str):
@@ -32,11 +33,11 @@ class PDFAnalyzer:
32
 
33
  def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
34
  """Checks for the presence of required terms in the text."""
35
- return {term: term in self.full_text for term in search_terms}
36
 
37
  def label_authors(self) -> str:
38
  """Label authors in the text with 'Authors:' if not already labeled."""
39
- author_line_regex = r"^(?:.*\n)(.*?)(?:\n\nNetaji Subhas University of Technology, Dwarka, Delhi, 110078, India)"
40
  match = re.search(author_line_regex, self.full_text, re.MULTILINE)
41
  if match:
42
  authors = match.group(1).strip()
@@ -314,49 +315,62 @@ class PDFAnalyzer:
314
 
315
  def analyze_pdf(file):
316
  try:
317
- # Save the uploaded file temporarily
318
- temp_path = "temp_uploaded.pdf"
319
- with open(temp_path, "wb") as f:
320
- f.write(file.read())
321
-
322
- analyzer = PDFAnalyzer(temp_path)
323
- results = analyzer.analyze()
324
-
325
- # Ensure all keys are present in the results, even if they're empty
326
- default_results = {
327
- "annotated_pdf_path": "",
328
- "metadata": {},
329
- "disclosures": {},
330
- "figures_and_tables": {},
331
- "figure_order": {},
332
- "references": {},
333
- "reference_order": {},
334
- "reference_style": {},
335
- "structure": {},
336
- "language": {},
337
- }
338
-
339
- # Update default_results with actual results
340
- default_results.update(results)
341
-
342
- return json.dumps(default_results, indent=2, default=str)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
  except Exception as e:
345
  error_message = {
346
  "error": str(e),
347
  "traceback": traceback.format_exc()
348
  }
349
- return json.dumps(error_message, indent=2)
350
- finally:
351
- # Clean up the temporary file
352
- if os.path.exists(temp_path):
353
- os.remove(temp_path)
354
 
355
  # Create Gradio interface
356
  iface = gr.Interface(
357
  fn=analyze_pdf,
358
  inputs=gr.File(label="Upload PDF"),
359
- outputs=gr.JSON(label="Analysis Results"),
 
 
 
360
  title="PDF Analyzer",
361
  description="Upload a PDF document to analyze its structure, references, language, and more.",
362
  )
 
13
  import traceback
14
  import io
15
  import os
16
+ import tempfile
17
 
18
  class PDFAnalyzer:
19
  def __init__(self, file_path: str):
 
33
 
34
  def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
35
  """Checks for the presence of required terms in the text."""
36
+ return {term: term.lower() in self.full_text.lower() for term in search_terms}
37
 
38
  def label_authors(self) -> str:
39
  """Label authors in the text with 'Authors:' if not already labeled."""
40
+ author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
41
  match = re.search(author_line_regex, self.full_text, re.MULTILINE)
42
  if match:
43
  authors = match.group(1).strip()
 
315
 
316
  def analyze_pdf(file):
317
  try:
318
+ # Create a temporary directory to store files
319
+ with tempfile.TemporaryDirectory() as temp_dir:
320
+ # Save the uploaded file temporarily
321
+ temp_path = os.path.join(temp_dir, "uploaded.pdf")
322
+ with open(temp_path, "wb") as f:
323
+ f.write(file.read())
324
+
325
+ analyzer = PDFAnalyzer(temp_path)
326
+ results = analyzer.analyze()
327
+
328
+ # Ensure all keys are present in the results, even if they're empty
329
+ default_results = {
330
+ "annotated_pdf_path": "",
331
+ "metadata": {},
332
+ "disclosures": {},
333
+ "figures_and_tables": {},
334
+ "figure_order": {},
335
+ "references": {},
336
+ "reference_order": {},
337
+ "reference_style": {},
338
+ "structure": {},
339
+ "language": {},
340
+ }
341
+
342
+ # Update default_results with actual results
343
+ default_results.update(results)
344
+
345
+ # Handle the annotated PDF
346
+ annotated_pdf_path = results.get("annotated_pdf_path", "")
347
+ if annotated_pdf_path and os.path.exists(annotated_pdf_path):
348
+ # Read the annotated PDF and return it as bytes
349
+ with open(annotated_pdf_path, "rb") as f:
350
+ annotated_pdf_bytes = f.read()
351
+ else:
352
+ annotated_pdf_bytes = None
353
+
354
+ # Remove the annotated_pdf_path from the results as we're returning the file separately
355
+ default_results.pop("annotated_pdf_path", None)
356
+
357
+ return json.dumps(default_results, indent=2, default=str), annotated_pdf_bytes
358
 
359
  except Exception as e:
360
  error_message = {
361
  "error": str(e),
362
  "traceback": traceback.format_exc()
363
  }
364
+ return json.dumps(error_message, indent=2), None
 
 
 
 
365
 
366
  # Create Gradio interface
367
  iface = gr.Interface(
368
  fn=analyze_pdf,
369
  inputs=gr.File(label="Upload PDF"),
370
+ outputs=[
371
+ gr.JSON(label="Analysis Results"),
372
+ gr.File(label="Annotated PDF")
373
+ ],
374
  title="PDF Analyzer",
375
  description="Upload a PDF document to analyze its structure, references, language, and more.",
376
  )