broadfield-dev committed on
Commit
0dd31f7
·
verified ·
1 Parent(s): 77541b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -17
app.py CHANGED
@@ -4,28 +4,28 @@ import PyPDF2
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
7
- import io
8
  import os
9
  from huggingface_hub import HfApi, create_repo
10
  import re
11
  from datetime import datetime
 
12
 
13
  # Initialize Hugging Face API
14
  HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
15
- REPO_NAME = "pdf-images-extracted" # Hugging Face repo for images
16
  hf_api = HfApi()
17
 
18
- def ensure_hf_repo():
19
- """Create or get Hugging Face repository."""
20
  try:
21
- repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
22
  return repo_id
23
  except Exception as e:
24
- return f"Error creating repo: {str(e)}"
25
 
26
  def upload_image_to_hf(image, filename):
27
- """Upload an image to Hugging Face Hub and return its URL."""
28
- repo_id = ensure_hf_repo()
29
  if isinstance(repo_id, str) and repo_id.startswith("Error"):
30
  return repo_id
31
 
@@ -34,11 +34,12 @@ def upload_image_to_hf(image, filename):
34
  temp_path = f"/tmp/temp_{filename}.png"
35
  image.save(temp_path, format="PNG")
36
 
37
- # Upload to Hugging Face
38
  file_url = hf_api.upload_file(
39
  path_or_fileobj=temp_path,
40
  path_in_repo=f"images/{filename}.png",
41
  repo_id=repo_id,
 
42
  token=HF_TOKEN
43
  )
44
  os.remove(temp_path)
@@ -62,7 +63,7 @@ def extract_images_from_pdf(pdf_file):
62
  """Extract images from PDF and convert to PIL images."""
63
  try:
64
  if isinstance(pdf_file, str): # URL case
65
- response = requests.get(pdf_file)
66
  images = convert_from_bytes(response.content)
67
  else: # File upload case
68
  images = convert_from_path(pdf_file.name)
@@ -87,7 +88,7 @@ def format_to_markdown(text, images):
87
  else:
88
  markdown_output += f"{line}\n\n"
89
 
90
- # Add images with Hugging Face URLs
91
  if isinstance(images, list) and images:
92
  markdown_output += "## Extracted Images\n\n"
93
  for i, image in enumerate(images):
@@ -110,11 +111,16 @@ def process_pdf(pdf_input, pdf_url):
110
  if not HF_TOKEN:
111
  return "Error: HF_TOKEN not set in Spaces Secrets."
112
 
 
113
  if pdf_url and pdf_url.strip():
114
- response = requests.head(pdf_url)
115
- if response.status_code != 200:
116
- return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
117
- pdf_file = pdf_url
 
 
 
 
118
  elif pdf_input:
119
  pdf_file = pdf_input
120
  else:
@@ -136,11 +142,11 @@ iface = gr.Interface(
136
  fn=process_pdf,
137
  inputs=[
138
  gr.File(label="Upload PDF File", type="filepath"),
139
- gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
140
  ],
141
  outputs=gr.Markdown(label="Markdown Output"),
142
  title="PDF to Markdown Converter",
143
- description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible. Requires HF_TOKEN in Spaces Secrets.",
144
  )
145
 
146
  if __name__ == "__main__":
 
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
 
7
  import os
8
  from huggingface_hub import HfApi, create_repo
9
  import re
10
  from datetime import datetime
11
+ import urllib.parse
12
 
13
  # Initialize Hugging Face API
14
  HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
15
+ REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
16
  hf_api = HfApi()
17
 
18
+ def ensure_hf_dataset():
19
+ """Create or get Hugging Face dataset repository."""
20
  try:
21
+ repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
22
  return repo_id
23
  except Exception as e:
24
+ return f"Error creating dataset repo: {str(e)}"
25
 
26
  def upload_image_to_hf(image, filename):
27
+ """Upload an image to Hugging Face dataset and return its URL."""
28
+ repo_id = ensure_hf_dataset()
29
  if isinstance(repo_id, str) and repo_id.startswith("Error"):
30
  return repo_id
31
 
 
34
  temp_path = f"/tmp/temp_{filename}.png"
35
  image.save(temp_path, format="PNG")
36
 
37
+ # Upload to Hugging Face dataset
38
  file_url = hf_api.upload_file(
39
  path_or_fileobj=temp_path,
40
  path_in_repo=f"images/{filename}.png",
41
  repo_id=repo_id,
42
+ repo_type="dataset",
43
  token=HF_TOKEN
44
  )
45
  os.remove(temp_path)
 
63
  """Extract images from PDF and convert to PIL images."""
64
  try:
65
  if isinstance(pdf_file, str): # URL case
66
+ response = requests.get(pdf_file, stream=True)
67
  images = convert_from_bytes(response.content)
68
  else: # File upload case
69
  images = convert_from_path(pdf_file.name)
 
88
  else:
89
  markdown_output += f"{line}\n\n"
90
 
91
+ # Add images with Hugging Face dataset URLs
92
  if isinstance(images, list) and images:
93
  markdown_output += "## Extracted Images\n\n"
94
  for i, image in enumerate(images):
 
111
  if not HF_TOKEN:
112
  return "Error: HF_TOKEN not set in Spaces Secrets."
113
 
114
+ # Decode URL-encoded string if provided
115
  if pdf_url and pdf_url.strip():
116
+ pdf_url = urllib.parse.unquote(pdf_url)
117
+ try:
118
+ response = requests.head(pdf_url, allow_redirects=True)
119
+ if response.status_code != 200:
120
+ return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
121
+ pdf_file = pdf_url
122
+ except requests.RequestException as e:
123
+ return f"Error accessing URL: {str(e)}"
124
  elif pdf_input:
125
  pdf_file = pdf_input
126
  else:
 
142
  fn=process_pdf,
143
  inputs=[
144
  gr.File(label="Upload PDF File", type="filepath"),
145
+ gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings)"),
146
  ],
147
  outputs=gr.Markdown(label="Markdown Output"),
148
  title="PDF to Markdown Converter",
149
+ description="Upload a PDF file or provide a PDF URL (including URL-encoded strings) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
150
  )
151
 
152
  if __name__ == "__main__":