Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

0dd31f7

verified ·

1 Parent(s): 77541b8

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -17

app.py CHANGED Viewed

@@ -4,28 +4,28 @@ import PyPDF2
 from pdf2image import convert_from_path, convert_from_bytes
 import pytesseract
 from PIL import Image
-import io
 import os
 from huggingface_hub import HfApi, create_repo
 import re
 from datetime import datetime
 # Initialize Hugging Face API
 HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
-REPO_NAME = "pdf-images-extracted"  # Hugging Face repo for images
 hf_api = HfApi()
-def ensure_hf_repo():
-    """Create or get Hugging Face repository."""
     try:
-        repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
         return repo_id
     except Exception as e:
-        return f"Error creating repo: {str(e)}"
 def upload_image_to_hf(image, filename):
-    """Upload an image to Hugging Face Hub and return its URL."""
-    repo_id = ensure_hf_repo()
     if isinstance(repo_id, str) and repo_id.startswith("Error"):
         return repo_id
@@ -34,11 +34,12 @@ def upload_image_to_hf(image, filename):
         temp_path = f"/tmp/temp_{filename}.png"
         image.save(temp_path, format="PNG")
-        # Upload to Hugging Face
         file_url = hf_api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=f"images/{filename}.png",
             repo_id=repo_id,
             token=HF_TOKEN
         )
         os.remove(temp_path)
@@ -62,7 +63,7 @@ def extract_images_from_pdf(pdf_file):
     """Extract images from PDF and convert to PIL images."""
     try:
         if isinstance(pdf_file, str):  # URL case
-            response = requests.get(pdf_file)
             images = convert_from_bytes(response.content)
         else:  # File upload case
             images = convert_from_path(pdf_file.name)
@@ -87,7 +88,7 @@ def format_to_markdown(text, images):
         else:
             markdown_output += f"{line}\n\n"
-    # Add images with Hugging Face URLs
     if isinstance(images, list) and images:
         markdown_output += "## Extracted Images\n\n"
         for i, image in enumerate(images):
@@ -110,11 +111,16 @@ def process_pdf(pdf_input, pdf_url):
     if not HF_TOKEN:
         return "Error: HF_TOKEN not set in Spaces Secrets."
     if pdf_url and pdf_url.strip():
-        response = requests.head(pdf_url)
-        if response.status_code != 200:
-            return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
-        pdf_file = pdf_url
     elif pdf_input:
         pdf_file = pdf_input
     else:
@@ -136,11 +142,11 @@ iface = gr.Interface(
     fn=process_pdf,
     inputs=[
         gr.File(label="Upload PDF File", type="filepath"),
-        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
-    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible. Requires HF_TOKEN in Spaces Secrets.",
 )
 if __name__ == "__main__":

 from pdf2image import convert_from_path, convert_from_bytes
 import pytesseract
 from PIL import Image
 import os
 from huggingface_hub import HfApi, create_repo
 import re
 from datetime import datetime
+import urllib.parse
 # Initialize Hugging Face API
 HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
+REPO_NAME = "pdf-images-extracted"  # Hugging Face dataset repo
 hf_api = HfApi()
+def ensure_hf_dataset():
+    """Create or get Hugging Face dataset repository."""
     try:
+        repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
         return repo_id
     except Exception as e:
+        return f"Error creating dataset repo: {str(e)}"
 def upload_image_to_hf(image, filename):
+    """Upload an image to Hugging Face dataset and return its URL."""
+    repo_id = ensure_hf_dataset()
     if isinstance(repo_id, str) and repo_id.startswith("Error"):
         return repo_id
         temp_path = f"/tmp/temp_{filename}.png"
         image.save(temp_path, format="PNG")
+        # Upload to Hugging Face dataset
         file_url = hf_api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=f"images/{filename}.png",
             repo_id=repo_id,
+            repo_type="dataset",
             token=HF_TOKEN
         )
         os.remove(temp_path)
     """Extract images from PDF and convert to PIL images."""
     try:
         if isinstance(pdf_file, str):  # URL case
+            response = requests.get(pdf_file, stream=True)
             images = convert_from_bytes(response.content)
         else:  # File upload case
             images = convert_from_path(pdf_file.name)
         else:
             markdown_output += f"{line}\n\n"
+    # Add images with Hugging Face dataset URLs
     if isinstance(images, list) and images:
         markdown_output += "## Extracted Images\n\n"
         for i, image in enumerate(images):
     if not HF_TOKEN:
         return "Error: HF_TOKEN not set in Spaces Secrets."
+    # Decode URL-encoded string if provided
     if pdf_url and pdf_url.strip():
+        pdf_url = urllib.parse.unquote(pdf_url)
+        try:
+            response = requests.head(pdf_url, allow_redirects=True)
+            if response.status_code != 200:
+                return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
+            pdf_file = pdf_url
+        except requests.RequestException as e:
+            return f"Error accessing URL: {str(e)}"
     elif pdf_input:
         pdf_file = pdf_input
     else:
     fn=process_pdf,
     inputs=[
         gr.File(label="Upload PDF File", type="filepath"),
+        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings)"),
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
+    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
 )
 if __name__ == "__main__":