Spaces:

Svngoku
/

PDF2Dataset

Running

App Files Community

Svngoku committed on Mar 28

Commit

716b14f

verified ·

1 Parent(s): 4371128

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -11

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
 from langchain.schema import Document
-from typing import List, Dict
 import logging
 import re
 from pathlib import Path
@@ -41,7 +41,7 @@ def replace_images_in_markdown(markdown_str: str, images_dict: Dict[str, str]) -
     return markdown_str
 # Function to combine markdown from OCR response
-def get_combined_markdown(ocr_response) -> tuple[str, str]:
     markdowns = []
     raw_markdowns = []
     image_data = {}  # Collect all image data
@@ -147,7 +147,11 @@ def chunk_markdown(
         # Add images to metadata
         for chunk in chunks:
             image_names = extract_image_names_from_markdown(chunk.page_content)
-            chunk.metadata["images"] = {name: image_data.get(name, None) for name in image_names}
         logger.info(f"Created {len(chunks)} chunks")
         return chunks
@@ -156,13 +160,6 @@ def chunk_markdown(
         logger.error(f"Error processing markdown: {str(e)}")
         raise
-# Placeholder image generation
-def text_to_base64_dummy(text: str, chunk_index: int):
-    img = Image.new('RGB', (200, 200), color='white')
-    buffer = io.BytesIO()
-    img.save(buffer, format="PNG")
-    return base64.b64encode(buffer.getvalue()).decode("utf-8")
 # Process file: OCR -> Chunk -> Save
 def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
     try:
@@ -175,7 +172,7 @@ def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_tok
         chunks = chunk_markdown(combined_markdown, image_data, chunk_size, chunk_overlap, strip_headers)
         # Step 3: Prepare dataset
-        data = {
             "chunk_id": [],
             "content": [],
             "metadata": [],

 import gradio as gr
 from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
 from langchain.schema import Document
+from typing import List, Dict, Any
 import logging
 import re
 from pathlib import Path
     return markdown_str
 # Function to combine markdown from OCR response
+def get_combined_markdown(ocr_response) -> tuple[str, str, Dict[str, str]]:
     markdowns = []
     raw_markdowns = []
     image_data = {}  # Collect all image data
         # Add images to metadata
         for chunk in chunks:
             image_names = extract_image_names_from_markdown(chunk.page_content)
+            images = {name: image_data.get(name, None) for name in image_names}
+            # Add a dummy field if the images dictionary is empty
+            if not images:
+                images = {"dummy": None}
+            chunk.metadata["images"] = images
         logger.info(f"Created {len(chunks)} chunks")
         return chunks
         logger.error(f"Error processing markdown: {str(e)}")
         raise
 # Process file: OCR -> Chunk -> Save
 def process_file_and_save(file, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name):
     try:
         chunks = chunk_markdown(combined_markdown, image_data, chunk_size, chunk_overlap, strip_headers)
         # Step 3: Prepare dataset
+        data: Dict[str, List[Any]] = {
             "chunk_id": [],
             "content": [],
             "metadata": [],