sblumenf committed on
Commit
41a1dac
·
verified ·
1 Parent(s): e2fb9c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -4
app.py CHANGED
@@ -9,6 +9,7 @@ import pandas as pd
9
  import pdfplumber
10
  import tempfile
11
  import traceback
 
12
 
13
  def save_image(element, images):
14
  try:
@@ -23,6 +24,20 @@ def save_image(element, images):
23
  except Exception as e:
24
  print(f"Error extracting image: {e}")
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
27
  """
28
  Parses a PDF file, extracts text, tables, and images, and formats the output.
@@ -50,6 +65,8 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
50
  print(f"Processing element: {type(element)}")
51
  save_image(element, images)
52
 
 
 
53
  with pdfplumber.open(pdf_file) as pdf:
54
  for page_num, page in enumerate(pdf.pages):
55
  for table in page.extract_tables():
@@ -70,13 +87,13 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
70
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
71
  if output_format == "JSON":
72
  json_data = {
73
- "text": text,
74
  "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
75
  "images": images
76
  }
77
  json.dump(json_data, tmp, ensure_ascii=False, indent=4)
78
  elif output_format == "Markdown":
79
- markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
80
  for i, table in enumerate(tables):
81
  if not table.columns.duplicated().any():
82
  markdown_text += f"## Table {i+1}\n"
@@ -87,7 +104,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
87
  markdown_text += f'![Image]({image_path})\n'
88
  tmp.write(markdown_text)
89
  elif output_format == "HTML":
90
- html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
91
  for i, table in enumerate(tables):
92
  if not table.columns.duplicated().any():
93
  html_text += f"<h2>Table {i+1}</h2>\n"
@@ -99,7 +116,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
99
  tmp.write(html_text)
100
  download_path = tmp.name
101
 
102
- return text, download_path
103
 
104
  except Exception as main_e:
105
  traceback.print_exc() # Print full traceback to console
 
9
  import pdfplumber
10
  import tempfile
11
  import traceback
12
+ import re
13
 
14
  def save_image(element, images):
15
  try:
 
24
  except Exception as e:
25
  print(f"Error extracting image: {e}")
26
 
27
def detect_headers(text):
    """Mark header-looking lines of *text* with a leading ``# ``.

    The text is split on ``'\\n'`` and every line is stripped of surrounding
    whitespace. A stripped line is treated as a header when it starts with
    one of:

    * digits followed by a dot and whitespace (``"1. Overview"``),
    * only uppercase ASCII letters and whitespace (``"INTRODUCTION"``),
    * a capitalised word, whitespace, then a digit (``"Chapter 3"``).

    Args:
        text: The text to scan, as a single string.

    Returns:
        The reformatted text. Every input line, header or not, is emitted
        stripped and terminated by ``'\\n'``, so the result always ends with
        a newline (an empty input yields ``"\\n"``).

    NOTE(review): the patterns are heuristics. They also match short
    uppercase tokens such as ``"A"`` and ordinary sentences such as
    ``"In 2020 ..."``; tighten them if false positives matter.
    """
    # One alternation is equivalent to trying the three patterns in turn,
    # and it is built once per call instead of being looked up per line.
    header_re = re.compile(r"^(?:\d+\.\s|[A-Z\s]+$|[A-Z][a-z]+\s\d)")

    formatted_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        formatted_lines.append(f"# {line}" if header_re.match(line) else line)

    # str.split always yields at least one element, so joining and appending
    # a final newline reproduces "each line followed by '\n'" exactly while
    # avoiding quadratic string concatenation.
    return "\n".join(formatted_lines) + "\n"
40
+
41
  def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
42
  """
43
  Parses a PDF file, extracts text, tables, and images, and formats the output.
 
65
  print(f"Processing element: {type(element)}")
66
  save_image(element, images)
67
 
68
+ formatted_text = detect_headers(text)
69
+
70
  with pdfplumber.open(pdf_file) as pdf:
71
  for page_num, page in enumerate(pdf.pages):
72
  for table in page.extract_tables():
 
87
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
88
  if output_format == "JSON":
89
  json_data = {
90
+ "text": formatted_text,
91
  "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
92
  "images": images
93
  }
94
  json.dump(json_data, tmp, ensure_ascii=False, indent=4)
95
  elif output_format == "Markdown":
96
+ markdown_text = f"# Extracted Text\n\n{formatted_text}\n\n# Tables\n"
97
  for i, table in enumerate(tables):
98
  if not table.columns.duplicated().any():
99
  markdown_text += f"## Table {i+1}\n"
 
104
  markdown_text += f'![Image]({image_path})\n'
105
  tmp.write(markdown_text)
106
  elif output_format == "HTML":
107
+ html_text = f"<p>{formatted_text}</p>\n\n<h2>Tables</h2>\n"
108
  for i, table in enumerate(tables):
109
  if not table.columns.duplicated().any():
110
  html_text += f"<h2>Table {i+1}</h2>\n"
 
116
  tmp.write(html_text)
117
  download_path = tmp.name
118
 
119
+ return formatted_text, download_path
120
 
121
  except Exception as main_e:
122
  traceback.print_exc() # Print full traceback to console