Spaces:

Svngoku
/

PDF2Dataset

Running

App Files Files Community

Svngoku committed on May 16

Commit

84f4f2c

verified ·

1 Parent(s): 93be02b

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -30

app.py CHANGED Viewed

@@ -118,7 +118,7 @@ def get_combined_markdown(ocr_response: Any) -> Tuple[str, str, Dict[str, str]]:
             processed_markdowns.append(current_processed_markdown)
-        logger.info(f"Processed {len(processed_markdowns)} pages with {len(image_data_map)} images.")
         return "\n\n".join(processed_markdowns), "\n\n".join(raw_markdowns), image_data_map
     except Exception as e:
@@ -286,12 +286,12 @@ def get_hf_token(explicit_token: str = None) -> str:
     return None
 def process_file_and_save(
-    file_obj: Any, chunk_size: int, chunk_overlap: int,
     strip_headers: bool, hf_token: str, repo_name: str
 ) -> str:
-    """Orchestrates OCR, chunking, and saving to Hugging Face."""
-    if not file_obj:
-        return "Error: No file uploaded."
     if not repo_name or '/' not in repo_name:
         return "Error: Invalid repository name (use 'username/dataset-name')."
@@ -311,25 +311,44 @@ def process_file_and_save(
         3. Run `huggingface-cli login` in your terminal"""
     try:
-        source_filename = getattr(file_obj, 'orig_name', os.path.basename(file_obj.name))
-        logger.info(f"--- Starting processing for file: {source_filename} ---")
-        processed_markdown, raw_markdown, img_map = perform_ocr_file(file_obj)
-        if not processed_markdown or processed_markdown.startswith("Error:"):
-            return processed_markdown
-        chunks = chunk_markdown(processed_markdown, chunk_size, chunk_overlap, strip_headers)
-        if not chunks:
-            return "Error: Failed to chunk the document."
-        data = {
-            "chunk_id": [f"{source_filename}_chunk_{i}" for i in range(len(chunks))],
-            "text": [chunk.page_content or "" for chunk in chunks],
-            "metadata": [chunk.metadata for chunk in chunks],
-            "source_filename": [source_filename] * len(chunks),
         }
-        dataset = Dataset.from_dict(data)
         api = HfApi(token=effective_hf_token)
         try:
@@ -346,9 +365,12 @@ def process_file_and_save(
             logger.info(f"Created repository '{repo_name}'.")
         dataset.push_to_hub(repo_name, token=effective_hf_token,
-                          commit_message=f"Add OCR data from {source_filename}")
         repo_url = f"https://huggingface.co/datasets/{repo_name}"
-        return f"Success! Dataset with {len(chunks)} chunks saved to: {repo_url}"
     except huggingface_hub.utils.HfHubHTTPError as hf_http_err:
         status = getattr(hf_http_err.response, 'status_code', 'Unknown')
@@ -359,7 +381,7 @@ def process_file_and_save(
         return f"Error: Hugging Face Hub Error (Status {status}): {hf_http_err}"
     except Exception as e:
         logger.error(f"Unexpected error: {e}", exc_info=True)
-        return f"Unexpected error: {str(e)}"
 # --- Gradio Interface ---
 with gr.Blocks(title="Mistral OCR & Dataset Creator",
@@ -367,21 +389,22 @@ with gr.Blocks(title="Mistral OCR & Dataset Creator",
     gr.Markdown("# Mistral OCR, Markdown Chunking, and Hugging Face Dataset Creator")
     gr.Markdown(
         """
-        Upload a PDF or image file. The application will:
-        1. Extract text and images using Mistral OCR
         2. Embed images as base64 data URIs in markdown
         3. Chunk markdown by headers and optionally character count
         4. Store embedded images in chunk metadata
-        5. Create/update a Hugging Face Dataset
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
-                label="Upload PDF or Image File",
                 file_types=['.pdf', '.png', '.jpg', '.jpeg', '.webp', '.bmp'],
-                type="filepath"
             )
             gr.Markdown("## Chunking Options")
             chunk_size = gr.Slider(minimum=0, maximum=8000, value=1000, step=100,

             processed_markdowns.append(current_processed_markdown)
+        logger.info(f"Processed {len(processed_markdowns)} pages with {len(image_data_map)} images.")
         return "\n\n".join(processed_markdowns), "\n\n".join(raw_markdowns), image_data_map
     except Exception as e:
     return None
 def process_file_and_save(
+    file_objs: List[Any], chunk_size: int, chunk_overlap: int,
     strip_headers: bool, hf_token: str, repo_name: str
 ) -> str:
+    """Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
+    if not file_objs:
+        return "Error: No files uploaded."
     if not repo_name or '/' not in repo_name:
         return "Error: Invalid repository name (use 'username/dataset-name')."
         3. Run `huggingface-cli login` in your terminal"""
     try:
+        all_data = {
+            "chunk_id": [],
+            "text": [],
+            "metadata": [],
+            "source_filename": []
         }
+        total_chunks = 0
+        files_processed = 0
+        error_messages = []
+        for file_idx, file_obj in enumerate(file_objs, 1):
+            source_filename = getattr(file_obj, 'orig_name', os.path.basename(file_obj.name))
+            logger.info(f"--- Processing file {file_idx}/{len(file_objs)}: {source_filename} ---")
+            processed_markdown, raw_markdown, img_map = perform_ocr_file(file_obj)
+            if processed_markdown.startswith("Error:"):
+                error_messages.append(f"File '{source_filename}': {processed_markdown}")
+                logger.error(f"Failed to process file {source_filename}: {processed_markdown}")
+                continue
+            chunks = chunk_markdown(processed_markdown, chunk_size, chunk_overlap, strip_headers)
+            if not chunks:
+                error_messages.append(f"File '{source_filename}': Failed to chunk the document.")
+                logger.error(f"Failed to chunk file {source_filename}")
+                continue
+            all_data["chunk_id"].extend([f"{source_filename}_chunk_{i}" for i in range(len(chunks))])
+            all_data["text"].extend([chunk.page_content or "" for chunk in chunks])
+            all_data["metadata"].extend([chunk.metadata for chunk in chunks])
+            all_data["source_filename"].extend([source_filename] * len(chunks))
+            total_chunks += len(chunks)
+            files_processed += 1
+            logger.info(f"File {source_filename}: Added {len(chunks)} chunks. Total chunks: {total_chunks}")
+        if not all_data["chunk_id"]:
+            return "Error: No valid data processed from any files.\n" + "\n".join(error_messages)
+        dataset = Dataset.from_dict(all_data)
         api = HfApi(token=effective_hf_token)
         try:
             logger.info(f"Created repository '{repo_name}'.")
         dataset.push_to_hub(repo_name, token=effective_hf_token,
+                          commit_message=f"Add OCR data from {files_processed} files")
         repo_url = f"https://huggingface.co/datasets/{repo_name}"
+        result = f"Success! Dataset with {total_chunks} chunks from {files_processed}/{len(file_objs)} files saved to: {repo_url}"
+        if error_messages:
+            result += "\n\nErrors encountered:\n" + "\n".join(error_messages)
+        return result
     except huggingface_hub.utils.HfHubHTTPError as hf_http_err:
         status = getattr(hf_http_err.response, 'status_code', 'Unknown')
         return f"Error: Hugging Face Hub Error (Status {status}): {hf_http_err}"
     except Exception as e:
         logger.error(f"Unexpected error: {e}", exc_info=True)
+        return f"Unexpected error: {str(e)}\n" + "\n".join(error_messages)
 # --- Gradio Interface ---
 with gr.Blocks(title="Mistral OCR & Dataset Creator",
     gr.Markdown("# Mistral OCR, Markdown Chunking, and Hugging Face Dataset Creator")
     gr.Markdown(
         """
+        Upload one or more PDF or image files. The application will:
+        1. Extract text and images using Mistral OCR for each file
         2. Embed images as base64 data URIs in markdown
         3. Chunk markdown by headers and optionally character count
         4. Store embedded images in chunk metadata
+        5. Create/update a Hugging Face Dataset with all processed data
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
+                label="Upload PDF or Image Files",
                 file_types=['.pdf', '.png', '.jpg', '.jpeg', '.webp', '.bmp'],
+                type="filepath",
+                file_count="multiple"  # Allow multiple file uploads
             )
             gr.Markdown("## Chunking Options")
             chunk_size = gr.Slider(minimum=0, maximum=8000, value=1000, step=100,