Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,28 +4,28 @@ import PyPDF2
|
|
4 |
from pdf2image import convert_from_path, convert_from_bytes
|
5 |
import pytesseract
|
6 |
from PIL import Image
|
7 |
-
import io
|
8 |
import os
|
9 |
from huggingface_hub import HfApi, create_repo
|
10 |
import re
|
11 |
from datetime import datetime
|
|
|
12 |
|
13 |
# Initialize Hugging Face API
|
14 |
HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
|
15 |
-
REPO_NAME = "pdf-images-extracted" # Hugging Face repo
|
16 |
hf_api = HfApi()
|
17 |
|
18 |
-
def
|
19 |
-
"""Create or get Hugging Face repository."""
|
20 |
try:
|
21 |
-
repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
|
22 |
return repo_id
|
23 |
except Exception as e:
|
24 |
-
return f"Error creating repo: {str(e)}"
|
25 |
|
26 |
def upload_image_to_hf(image, filename):
|
27 |
-
"""Upload an image to Hugging Face
|
28 |
-
repo_id =
|
29 |
if isinstance(repo_id, str) and repo_id.startswith("Error"):
|
30 |
return repo_id
|
31 |
|
@@ -34,11 +34,12 @@ def upload_image_to_hf(image, filename):
|
|
34 |
temp_path = f"/tmp/temp_{filename}.png"
|
35 |
image.save(temp_path, format="PNG")
|
36 |
|
37 |
-
# Upload to Hugging Face
|
38 |
file_url = hf_api.upload_file(
|
39 |
path_or_fileobj=temp_path,
|
40 |
path_in_repo=f"images/{filename}.png",
|
41 |
repo_id=repo_id,
|
|
|
42 |
token=HF_TOKEN
|
43 |
)
|
44 |
os.remove(temp_path)
|
@@ -62,7 +63,7 @@ def extract_images_from_pdf(pdf_file):
|
|
62 |
"""Extract images from PDF and convert to PIL images."""
|
63 |
try:
|
64 |
if isinstance(pdf_file, str): # URL case
|
65 |
-
response = requests.get(pdf_file)
|
66 |
images = convert_from_bytes(response.content)
|
67 |
else: # File upload case
|
68 |
images = convert_from_path(pdf_file.name)
|
@@ -87,7 +88,7 @@ def format_to_markdown(text, images):
|
|
87 |
else:
|
88 |
markdown_output += f"{line}\n\n"
|
89 |
|
90 |
-
# Add images with Hugging Face URLs
|
91 |
if isinstance(images, list) and images:
|
92 |
markdown_output += "## Extracted Images\n\n"
|
93 |
for i, image in enumerate(images):
|
@@ -110,11 +111,16 @@ def process_pdf(pdf_input, pdf_url):
|
|
110 |
if not HF_TOKEN:
|
111 |
return "Error: HF_TOKEN not set in Spaces Secrets."
|
112 |
|
|
|
113 |
if pdf_url and pdf_url.strip():
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
|
|
|
|
|
|
118 |
elif pdf_input:
|
119 |
pdf_file = pdf_input
|
120 |
else:
|
@@ -136,11 +142,11 @@ iface = gr.Interface(
|
|
136 |
fn=process_pdf,
|
137 |
inputs=[
|
138 |
gr.File(label="Upload PDF File", type="filepath"),
|
139 |
-
gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
|
140 |
],
|
141 |
outputs=gr.Markdown(label="Markdown Output"),
|
142 |
title="PDF to Markdown Converter",
|
143 |
-
description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face
|
144 |
)
|
145 |
|
146 |
if __name__ == "__main__":
|
|
|
4 |
from pdf2image import convert_from_path, convert_from_bytes
|
5 |
import pytesseract
|
6 |
from PIL import Image
|
|
|
7 |
import os
|
8 |
from huggingface_hub import HfApi, create_repo
|
9 |
import re
|
10 |
from datetime import datetime
|
11 |
+
import urllib.parse
|
12 |
|
13 |
# Initialize Hugging Face API
|
14 |
HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
|
15 |
+
REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
|
16 |
hf_api = HfApi()
|
17 |
|
18 |
+
def ensure_hf_dataset():
|
19 |
+
"""Create or get Hugging Face dataset repository."""
|
20 |
try:
|
21 |
+
repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
22 |
return repo_id
|
23 |
except Exception as e:
|
24 |
+
return f"Error creating dataset repo: {str(e)}"
|
25 |
|
26 |
def upload_image_to_hf(image, filename):
|
27 |
+
"""Upload an image to Hugging Face dataset and return its URL."""
|
28 |
+
repo_id = ensure_hf_dataset()
|
29 |
if isinstance(repo_id, str) and repo_id.startswith("Error"):
|
30 |
return repo_id
|
31 |
|
|
|
34 |
temp_path = f"/tmp/temp_{filename}.png"
|
35 |
image.save(temp_path, format="PNG")
|
36 |
|
37 |
+
# Upload to Hugging Face dataset
|
38 |
file_url = hf_api.upload_file(
|
39 |
path_or_fileobj=temp_path,
|
40 |
path_in_repo=f"images/{filename}.png",
|
41 |
repo_id=repo_id,
|
42 |
+
repo_type="dataset",
|
43 |
token=HF_TOKEN
|
44 |
)
|
45 |
os.remove(temp_path)
|
|
|
63 |
"""Extract images from PDF and convert to PIL images."""
|
64 |
try:
|
65 |
if isinstance(pdf_file, str): # URL case
|
66 |
+
response = requests.get(pdf_file, stream=True)
|
67 |
images = convert_from_bytes(response.content)
|
68 |
else: # File upload case
|
69 |
images = convert_from_path(pdf_file.name)
|
|
|
88 |
else:
|
89 |
markdown_output += f"{line}\n\n"
|
90 |
|
91 |
+
# Add images with Hugging Face dataset URLs
|
92 |
if isinstance(images, list) and images:
|
93 |
markdown_output += "## Extracted Images\n\n"
|
94 |
for i, image in enumerate(images):
|
|
|
111 |
if not HF_TOKEN:
|
112 |
return "Error: HF_TOKEN not set in Spaces Secrets."
|
113 |
|
114 |
+
# Decode URL-encoded string if provided
|
115 |
if pdf_url and pdf_url.strip():
|
116 |
+
pdf_url = urllib.parse.unquote(pdf_url)
|
117 |
+
try:
|
118 |
+
response = requests.head(pdf_url, allow_redirects=True)
|
119 |
+
if response.status_code != 200:
|
120 |
+
return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
|
121 |
+
pdf_file = pdf_url
|
122 |
+
except requests.RequestException as e:
|
123 |
+
return f"Error accessing URL: {str(e)}"
|
124 |
elif pdf_input:
|
125 |
pdf_file = pdf_input
|
126 |
else:
|
|
|
142 |
fn=process_pdf,
|
143 |
inputs=[
|
144 |
gr.File(label="Upload PDF File", type="filepath"),
|
145 |
+
gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings)"),
|
146 |
],
|
147 |
outputs=gr.Markdown(label="Markdown Output"),
|
148 |
title="PDF to Markdown Converter",
|
149 |
+
description="Upload a PDF file or provide a PDF URL (including URL-encoded strings) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
|
150 |
)
|
151 |
|
152 |
if __name__ == "__main__":
|