Spaces:

Svngoku
/

PDF2Dataset

Running

App Files Files Community

PDF2Dataset / app.py

Svngoku

Update app.py

f13386c verified 3 months ago

raw

history blame

33.7 kB

	import gradio as gr
	from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
	from langchain.schema import Document
	from typing import List, Dict, Any, Tuple
	import logging
	import re
	import base64
	import mimetypes # Added
	from datasets import Dataset
	from huggingface_hub import HfApi
	import huggingface_hub # Added for token checking and errors
	import os
	from mistralai import Mistral # Assuming this is the correct import for the client

	# Logging: timestamped, level-tagged records at INFO and above.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# --- Mistral OCR Setup ---
	# Credential lookup order: the MISTRAL_API_KEY environment variable first,
	# then whatever token huggingface_hub.get_token() returns.
	api_key = os.environ.get("MISTRAL_API_KEY")
	if not api_key:
	    logger.warning("MISTRAL_API_KEY environment variable not set. Attempting to use Hugging Face token.")
	    try:
	        api_key = huggingface_hub.get_token()
	        if api_key:
	            logger.info("Using Hugging Face token as MISTRAL_API_KEY.")
	        else:
	            # Nothing is raised here; the missing key is reported below and
	            # again when OCR or the Hub push is attempted.
	            logger.warning("Could not retrieve token from Hugging Face login.")
	    except Exception as token_err:
	        # Continue without a key; client creation is skipped below.
	        logger.warning(f"Could not check Hugging Face login for token: {token_err}")

	# Build the Mistral client. `client` stays None when no key was found, which
	# lets the UI load while OCR calls report the problem.
	client = None
	if not api_key:
	    logger.error("Mistral API key is not available. OCR functionality will fail.")
	else:
	    try:
	        client = Mistral(api_key=api_key)
	        logger.info("Mistral client initialized successfully.")
	    except Exception as init_err:
	        logger.error(f"Failed to initialize Mistral client: {init_err}", exc_info=True)
	        # A key was present but unusable: abort startup with a clear message.
	        raise RuntimeError(f"Failed to initialize Mistral client. Check API key and mistralai installation. Error: {init_err}")


	# --- Helper Functions ---

	def encode_image_bytes(image_bytes: bytes) -> str:
	    """Return the standard base64 text form of raw image bytes."""
	    encoded = base64.b64encode(image_bytes)
	    return str(encoded, 'utf-8')

	# Markdown image reference ![alt](target): group 1 is "![alt](", group 2 is
	# the target, group 3 is the closing ")".
	_IMAGE_REF_RE = re.compile(r"(!\[.*?\]\()(.*?)(\))")


	def _embed_page_images(markdown: str, image_data_map: Dict[str, str], page_idx: int) -> str:
	    """Return `markdown` with every known image ID replaced by its base64 payload."""

	    def _swap(match):
	        img_id = match.group(2)
	        payload = image_data_map.get(img_id)
	        if payload:
	            # Concatenate explicitly instead of using a "\1...\2" template: a
	            # payload starting with a digit would be read as part of the group
	            # number, and backslashes would be interpreted as escapes.
	            return match.group(1) + payload + match.group(3)
	        # Ordinary URLs are legitimate targets that simply are not OCR images.
	        if not img_id.startswith(('http:', 'https:', 'data:')):
	            logger.warning(f"Page {page_idx}: Image ID '{img_id}' found in markdown but not in collected image data.")
	        return match.group(0)

	    return _IMAGE_REF_RE.sub(_swap, markdown)


	def get_combined_markdown(ocr_response: Any) -> Tuple[str, str, Dict[str, str]]:
	    """
	    Combines markdown from OCR pages, replacing image IDs with base64 data.

	    Image payloads are first collected from every page, then each page's
	    markdown is scanned once for `![alt](target)` references. A target that
	    matches a collected image ID is replaced by that image's `image_base64`
	    value; every other reference is left untouched. Pages are joined with a
	    blank line.

	    Args:
	        ocr_response: The response object from the Mistral OCR API. It is
	            expected to expose `pages`, each with `markdown` and optionally
	            `images` (objects with `id` and `image_base64`).
	            NOTE(review): `image_base64` is assumed to already be a full
	            data URI - confirm against the Mistral API.

	    Returns:
	        A tuple containing:
	        - combined_markdown_with_images: Markdown with image references
	          replaced by their base64 payloads.
	        - combined_raw_markdown: Original markdown without replacement.
	        - image_data_map: Mapping of image ID to its base64 payload.
	        All three are empty when the response has no pages.

	    Raises:
	        ValueError: If the response structure triggers an AttributeError.
	    """
	    pages = getattr(ocr_response, 'pages', None)
	    if not pages:
	        logger.warning("OCR response has no 'pages' attribute or pages list is empty.")
	        return "", "", {}

	    processed_markdowns = []
	    raw_markdowns = []
	    image_data_map = {}  # image_id -> base64 payload

	    try:
	        # Pass 1: gather images from all pages so a reference can be resolved
	        # regardless of which page carries the image object.
	        for page_idx, page in enumerate(pages):
	            for img in getattr(page, 'images', None) or []:
	                if hasattr(img, 'id') and getattr(img, 'image_base64', None):
	                    image_data_map[img.id] = img.image_base64
	                else:
	                    logger.warning(f"Page {page_idx}: Image object lacks 'id' or valid 'image_base64'. Image: {img}")

	        # Pass 2: rewrite the markdown of each page.
	        for page_idx, page in enumerate(pages):
	            if not hasattr(page, 'markdown'):
	                logger.warning(f"Page {page_idx} in OCR response lacks 'markdown' attribute. Skipping.")
	                continue

	            raw_markdown = page.markdown or ""
	            raw_markdowns.append(raw_markdown)
	            processed_markdowns.append(_embed_page_images(raw_markdown, image_data_map, page_idx))

	        return "\n\n".join(processed_markdowns), "\n\n".join(raw_markdowns), image_data_map

	    except AttributeError as ae:
	        logger.error(f"Attribute error accessing OCR response structure: {ae}", exc_info=True)
	        raise ValueError(f"Unexpected OCR response structure. Check Mistral API changes. Error: {ae}") from ae
	    except Exception as e:
	        logger.error(f"Error processing OCR response markdown: {e}", exc_info=True)
	        raise

	# Image extensions that perform_ocr_file sends inline as base64 data URIs.
	_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.webp', '.bmp')


	def _ocr_pdf(file_path: str, file_name: str) -> Any:
	    """Upload a PDF to Mistral, OCR it through a signed URL, and always delete the upload."""
	    uploaded_file_id = None
	    try:
	        with open(file_path, "rb") as f:
	            logger.info(f"Uploading PDF {file_name} to Mistral...")
	            # The Mistral SDK documents the upload payload as a mapping with
	            # `file_name` and `content`.
	            uploaded_pdf = client.files.upload(
	                file={"file_name": file_name, "content": f},
	                purpose="ocr",
	            )
	        uploaded_file_id = uploaded_pdf.id
	        logger.info(f"PDF uploaded successfully. File ID: {uploaded_file_id}")

	        logger.info(f"Getting signed URL for file ID: {uploaded_file_id}")
	        signed_url_response = client.files.get_signed_url(file_id=uploaded_file_id)
	        logger.info(f"Got signed URL: {signed_url_response.url[:50]}...")

	        logger.info("Sending PDF URL to Mistral OCR (model: mistral-ocr-latest)...")
	        ocr_response = client.ocr.process(
	            model="mistral-ocr-latest",
	            document={
	                "type": "document_url",
	                "document_url": signed_url_response.url,
	            },
	            include_image_base64=True,
	        )
	        logger.info("OCR processing complete for PDF.")
	        return ocr_response
	    finally:
	        # Best-effort cleanup, also when OCR fails after the upload succeeded.
	        if uploaded_file_id:
	            try:
	                logger.info(f"Deleting temporary Mistral file: {uploaded_file_id}")
	                client.files.delete(file_id=uploaded_file_id)
	            except Exception as delete_err:
	                logger.warning(f"Failed to delete temporary Mistral file {uploaded_file_id}: {delete_err}")


	def _ocr_image(image_bytes: bytes, file_path: str, file_name: str) -> Any:
	    """Send image bytes to Mistral OCR inline as a base64 data URI."""
	    base64_encoded_image = encode_image_bytes(image_bytes)

	    mime_type, _ = mimetypes.guess_type(file_path)
	    if not mime_type or not mime_type.startswith('image'):
	        logger.warning(f"Could not determine MIME type for {file_name} using extension. Defaulting to image/jpeg.")
	        mime_type = 'image/jpeg'  # Fallback

	    data_uri = f"data:{mime_type};base64,{base64_encoded_image}"
	    logger.info(f"Sending image {file_name} ({mime_type}) as data URI to Mistral OCR (model: mistral-ocr-latest)...")

	    ocr_response = client.ocr.process(
	        model="mistral-ocr-latest",
	        document={
	            "type": "image_url",
	            "image_url": data_uri,
	        },
	        include_image_base64=True,
	    )
	    logger.info(f"OCR processing complete for image {file_name}.")
	    return ocr_response


	def perform_ocr_file(file_obj: Any) -> Tuple[str, str, Dict[str, str]]:
	    """
	    Performs OCR on an uploaded file (PDF or image) using the Mistral API.

	    Failures are reported through the first element of the returned tuple
	    (a message starting with "Error" or "Unsupported file type:") rather
	    than by raising.

	    Args:
	        file_obj: The uploaded file from Gradio's gr.File component. Either a
	            path (str / os.PathLike) or an object exposing the path as
	            `.name` (and optionally the original filename as `.orig_name`).

	    Returns:
	        A tuple containing:
	        - processed_markdown: Markdown string with base64 images, or an
	          error message.
	        - raw_markdown: Original markdown string ("" on error).
	        - image_data_map: Mapping of image IDs to base64 payloads ({} on
	          error).
	    """
	    if not client:
	        return "Error: Mistral client not initialized. Check API key setup.", "", {}
	    if not file_obj:
	        return "Error: No file provided to OCR function.", "", {}

	    # Resolve path and display name before entering the try block: the
	    # handlers below format both, so they must be bound on every path.
	    if isinstance(file_obj, (str, os.PathLike)):
	        file_path = os.fspath(file_obj)
	    else:
	        file_path = getattr(file_obj, 'name', None)
	    if not file_path:
	        return "Error: Could not determine the path of the uploaded file.", "", {}
	    file_name = getattr(file_obj, 'orig_name', None) or os.path.basename(file_path)

	    try:
	        logger.info(f"Performing OCR on file: {file_name} (temp path: {file_path})")

	        # The handler is chosen from the extension of the display name.
	        file_ext = os.path.splitext(file_name)[1].lower()

	        if file_ext == '.pdf':
	            ocr_response = _ocr_pdf(file_path, file_name)
	        elif file_ext in _IMAGE_EXTENSIONS:
	            try:
	                with open(file_path, "rb") as f:
	                    image_bytes = f.read()

	                if not image_bytes:
	                    return f"Error: Uploaded image file '{file_name}' is empty.", "", {}

	                ocr_response = _ocr_image(image_bytes, file_path, file_name)
	            except Exception as img_ocr_err:
	                logger.error(f"Error during image OCR for {file_name}: {img_ocr_err}", exc_info=True)
	                return f"Error during OCR for image '{file_name}': {img_ocr_err}", "", {}
	        else:
	            unsupported_msg = f"Unsupported file type: '{file_name}'. Please provide a PDF or an image (png, jpg, jpeg, webp, bmp)."
	            logger.warning(unsupported_msg)
	            return unsupported_msg, "", {}

	        # Common path for PDF and image responses.
	        if not ocr_response:
	            logger.error(f"OCR processing for '{file_name}' did not return a valid response.")
	            return f"Error: OCR processing failed for '{file_name}'. No response received.", "", {}

	        logger.info("Processing OCR response to combine markdown and images...")
	        processed_md, raw_md, img_map = get_combined_markdown(ocr_response)
	        logger.info("Markdown and image data extraction complete.")
	        return processed_md, raw_md, img_map

	    except FileNotFoundError:
	        logger.error(f"Temporary file not found: {file_path}", exc_info=True)
	        return f"Error: Could not read the uploaded file '{file_name}'. Ensure it uploaded correctly.", "", {}
	    except Exception as e:
	        logger.error(f"Unexpected error during OCR processing file {file_name}: {e}", exc_info=True)
	        return f"Error during OCR processing for '{file_name}': {str(e)}", "", {}

	# Markdown image whose target is an inline base64 data URI; group 1 captures
	# the URI. Compiled once instead of on every chunk.
	_EMBEDDED_IMAGE_RE = re.compile(r"!\[.*?\]\((data:image/[a-zA-Z+]+;base64,[A-Za-z0-9+/=]+)\)")


	def chunk_markdown(
	    markdown_text_with_images: str,
	    chunk_size: int = 1000,
	    chunk_overlap: int = 200,
	    strip_headers: bool = True
	) -> List[Document]:
	    """
	    Chunks markdown text by headers, optionally re-splits long sections, and
	    records embedded images in each chunk's metadata.

	    Every returned chunk gets `metadata["images_base64"]`: the list of
	    complete `data:image/...;base64,...` URIs found inside that chunk's text
	    (possibly empty).

	    Args:
	        markdown_text_with_images: Markdown containing base64 data URIs.
	        chunk_size: Target chunk size in characters. Values <= 0 disable the
	            recursive character splitting (header splits only).
	        chunk_overlap: Overlap between consecutive chunks in characters.
	        strip_headers: Whether header lines are removed from chunk content.

	    Returns:
	        A list of Langchain Document objects. Empty when the input is empty or
	        whitespace-only, or when header splitting yields nothing.

	    Raises:
	        Exception: Any error raised while chunking is logged and re-raised.
	    """
	    if not markdown_text_with_images or not markdown_text_with_images.strip():
	        logger.warning("chunk_markdown received empty or whitespace-only input string.")
	        return []
	    try:
	        # ("#", "Header 1") ... ("######", "Header 6")
	        headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 7)]
	        markdown_splitter = MarkdownHeaderTextSplitter(
	            headers_to_split_on=headers_to_split_on,
	            strip_headers=strip_headers,
	            return_each_line=False  # Process blocks
	        )

	        logger.info("Splitting markdown by headers...")
	        header_chunks = markdown_splitter.split_text(markdown_text_with_images)
	        logger.info(f"Split into {len(header_chunks)} chunks based on headers.")

	        if not header_chunks:
	            # Header splitting is the intended first stage, so an empty result
	            # is reported to the caller as "no chunks".
	            logger.warning("MarkdownHeaderTextSplitter returned zero chunks.")
	            return []

	        final_chunks = []
	        if chunk_size > 0:
	            text_splitter = RecursiveCharacterTextSplitter(
	                chunk_size=chunk_size,
	                chunk_overlap=chunk_overlap,
	                length_function=len,
	                # Raw strings keep the exact same values without invalid
	                # escape sequences.
	                # NOTE(review): is_separator_regex is not enabled here, so the
	                # lookbehind entries are presumably treated as literal text
	                # and never match ordinary prose - confirm intent.
	                separators=["\n\n", "\n", r"(?<=\. )", r"(?<=\? )", "(?<=! )", ", ", "; ", " ", ""],
	                keep_separator=False,
	                add_start_index=True  # Start index relative to the parent (header) chunk
	            )
	            logger.info(f"Applying recursive character splitting (size={chunk_size}, overlap={chunk_overlap})...")
	            for i, header_chunk in enumerate(header_chunks):
	                content = header_chunk.page_content
	                if not content:
	                    logger.debug(f"Header chunk {i} was empty, skipping.")
	                    continue
	                if len(content) <= chunk_size:
	                    final_chunks.append(header_chunk)
	                    logger.debug(f"Header chunk {i} (length {len(content)}) kept as is.")
	                    continue

	                logger.debug(f"Header chunk {i} (length {len(content)}) needs recursive splitting.")
	                try:
	                    # split_documents carries the parent chunk's metadata over.
	                    sub_chunks = text_splitter.split_documents([header_chunk])
	                except Exception as split_err:
	                    # Deliberate best-effort: a section that cannot be split
	                    # is dropped instead of failing the whole document.
	                    logger.error(f"Error splitting header chunk {i}: {split_err}", exc_info=True)
	                    logger.warning(f"Skipping header chunk {i} due to splitting error.")
	                    continue
	                final_chunks.extend(sub_chunks)
	                logger.debug(f" -> Split into {len(sub_chunks)} sub-chunks.")
	            logger.info(f"Recursive character splitting finished. Processed {len(final_chunks)} chunks.")
	        else:
	            logger.info("chunk_size is 0, using only non-empty header-based chunks.")
	            final_chunks = [chunk for chunk in header_chunks if chunk.page_content]

	        # Record the embedded image data URIs of each chunk in its metadata.
	        # Only URIs that are complete within the chunk's text can match.
	        logger.info("Extracting embedded image data URIs for final chunk metadata...")
	        for chunk in final_chunks:
	            if not hasattr(chunk, 'metadata'):
	                chunk.metadata = {}
	            chunk.metadata["images_base64"] = (
	                _EMBEDDED_IMAGE_RE.findall(chunk.page_content) if chunk.page_content else []
	            )

	        logger.info(f"Created {len(final_chunks)} final chunks after processing and filtering.")
	        return final_chunks

	    except Exception as e:
	        logger.error(f"Error during markdown chunking process: {str(e)}", exc_info=True)
	        raise  # Re-raise to be caught by the main processing function

	# --- Main Processing Function ---

	# Prefixes perform_ocr_file uses when it reports a failure through its first
	# return value instead of raising.
	_OCR_FAILURE_PREFIXES = ("Error:", "Error during", "Unsupported file type:")


	def _serializable_metadata(chunk_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
	    """Copy chunk metadata, stringifying values that are not plain dataset-friendly types."""
	    cleaned = {}
	    for k, v in metadata.items():
	        if isinstance(v, (str, int, float, bool, list, dict, type(None))):
	            cleaned[k] = v
	            continue
	        logger.warning(f"Chunk {chunk_id}: Metadata key '{k}' has non-standard type {type(v)}. Converting to string.")
	        try:
	            cleaned[k] = str(v)
	        except Exception as str_err:
	            logger.error(f"Chunk {chunk_id}: Failed to convert metadata key '{k}' to string: {str_err}")
	            cleaned[k] = f"ERROR_CONVERTING_{type(v).__name__}"
	    return cleaned


	def process_file_and_save(
	    file_obj: Any,  # Gradio File value
	    chunk_size: int,
	    chunk_overlap: int,
	    strip_headers: bool,
	    hf_token: str,
	    repo_name: str
	) -> str:
	    """
	    Orchestrates the OCR, chunking, and saving process to Hugging Face Hub.

	    Args:
	        file_obj: The uploaded file from Gradio: a path (str / os.PathLike) or
	            an object exposing the path as `.name` (optionally `.orig_name`).
	        chunk_size: Max chunk size for text splitting (chars). 0 disables
	            recursive splitting; negative values are treated as 0.
	        chunk_overlap: Overlap for text splitting (chars). Negative values are
	            treated as 0; values >= chunk_size are reduced.
	        strip_headers: Whether to remove markdown headers from chunk content.
	        hf_token: Hugging Face API token (write permission). When empty, the
	            token from the local Hugging Face login is used.
	        repo_name: Target dataset repository, e.g. 'username/my-ocr-dataset'.

	    Returns:
	        A string indicating success or failure, suitable for display in
	        Gradio. Errors are reported in the returned text, not raised.
	    """
	    # --- Input Validation ---
	    if not file_obj:
	        return "Error: No file uploaded. Please upload a PDF or image file."
	    if not repo_name or '/' not in repo_name:
	        return "Error: Invalid Hugging Face Repository Name. Use format 'username/dataset-name'."

	    # Normalise to int so `chunk_size // 2` and the splitters never see floats.
	    chunk_size = int(chunk_size)
	    chunk_overlap = int(chunk_overlap)
	    if chunk_size < 0:
	        logger.warning("Chunk size cannot be negative. Setting to 0 (header splits only).")
	        chunk_size = 0
	    if chunk_overlap < 0:
	        logger.warning("Chunk overlap cannot be negative. Setting to 0.")
	        chunk_overlap = 0
	    if chunk_size > 0 and chunk_overlap >= chunk_size:
	        logger.warning(f"Chunk overlap ({chunk_overlap}) >= chunk size ({chunk_size}). Adjusting overlap to {min(200, chunk_size // 2)}.")
	        chunk_overlap = min(200, chunk_size // 2)  # Set a reasonable overlap

	    # Handle Hugging Face Token
	    if not hf_token:
	        logger.info("No explicit HF token provided. Trying to use token from local Hugging Face login.")
	        try:
	            hf_token = huggingface_hub.get_token()
	            if not hf_token:
	                return "Error: Hugging Face Token is required. Please provide a token or log in using `huggingface-cli login`."
	            logger.info("Using HF token from local login for dataset operations.")
	        except Exception as e:
	            logger.error(f"Error checking HF login for token: {e}", exc_info=True)
	            return f"Error: Hugging Face Token is required. Could not verify HF login: {e}"

	    # Resolve the display name before the try block: the except/finally
	    # clauses format it, so it must be bound no matter where a failure occurs.
	    if isinstance(file_obj, (str, os.PathLike)):
	        upload_path = os.fspath(file_obj)
	    else:
	        upload_path = getattr(file_obj, 'name', None)
	    if not upload_path:
	        return "Error: Could not determine the path of the uploaded file."
	    source_filename = getattr(file_obj, 'orig_name', None) or os.path.basename(upload_path)

	    try:
	        logger.info(f"--- Starting processing for file: {source_filename} ---")

	        # --- Step 1: Perform OCR ---
	        logger.info("Step 1: Performing OCR...")
	        processed_markdown, _, _ = perform_ocr_file(file_obj)  # raw markdown and image map are not needed here

	        # Empty output gets an explicit message instead of echoing "" to the UI.
	        if not isinstance(processed_markdown, str) or not processed_markdown.strip():
	            logger.error("OCR processing returned empty or invalid markdown content.")
	            return "Error: OCR returned empty or invalid content."
	        # perform_ocr_file signals failure via message prefixes; catching all
	        # of them keeps error text from being chunked and pushed as data.
	        if processed_markdown.startswith(_OCR_FAILURE_PREFIXES):
	            logger.error(f"OCR failed or returned error/unsupported: {processed_markdown}")
	            return processed_markdown
	        logger.info("Step 1: OCR finished successfully.")

	        # --- Step 2: Chunk the markdown ---
	        logger.info("Step 2: Chunking the markdown...")
	        chunks = chunk_markdown(processed_markdown, chunk_size, chunk_overlap, strip_headers)

	        if not chunks:
	            logger.error("Chunking resulted in zero chunks. Check OCR output and chunking parameters.")
	            return "Error: Failed to chunk the document (possibly empty after OCR or no headers found)."
	        logger.info(f"Step 2: Chunking finished, produced {len(chunks)} chunks.")

	        # --- Step 3: Prepare dataset ---
	        logger.info("Step 3: Preparing data for Hugging Face dataset...")
	        data: Dict[str, List[Any]] = {
	            "chunk_id": [],
	            "text": [],
	            "metadata": [],
	            "source_filename": [],
	        }

	        for i, chunk in enumerate(chunks):
	            chunk_id = f"{source_filename}_chunk_{i}"
	            data["chunk_id"].append(chunk_id)
	            data["text"].append(chunk.page_content if chunk.page_content else "")  # Ensure text is string
	            chunk_metadata = getattr(chunk, 'metadata', None) or {}
	            data["metadata"].append(_serializable_metadata(chunk_id, chunk_metadata))
	            data["source_filename"].append(source_filename)

	        # --- Step 4: Create and push dataset to Hugging Face ---
	        logger.info(f"Step 4: Creating Hugging Face Dataset object for repo '{repo_name}'...")
	        try:
	            dataset = Dataset.from_dict(data)  # Feature types are inferred
	            logger.info(f"Dataset object created with {len(chunks)} rows.")
	        except Exception as ds_err:
	            logger.error(f"Failed to create Dataset object: {ds_err}", exc_info=True)
	            return f"Error: Failed to create dataset structure. Check logs. ({ds_err})"

	        logger.info(f"Connecting to Hugging Face Hub API to push to '{repo_name}'...")
	        try:
	            api = HfApi(token=hf_token)  # Pass token explicitly

	            # Create the repo if it doesn't exist (public by default).
	            try:
	                api.repo_info(repo_id=repo_name, repo_type="dataset")
	                logger.info(f"Repository '{repo_name}' already exists. Will overwrite content.")
	            except huggingface_hub.utils.RepositoryNotFoundError:
	                logger.info(f"Repository '{repo_name}' does not exist. Creating...")
	                api.create_repo(repo_id=repo_name, repo_type="dataset", private=False)
	                logger.info(f"Successfully created repository '{repo_name}'.")

	            logger.info(f"Pushing dataset to '{repo_name}'...")
	            commit_message = f"Add/update OCR data from file: {source_filename}"
	            # The token must be forwarded here as well; otherwise the upload
	            # ignores the token supplied in the UI.
	            dataset.push_to_hub(repo_name, commit_message=commit_message, token=hf_token)
	            repo_url = f"https://huggingface.co/datasets/{repo_name}"
	            logger.info(f"Dataset successfully pushed to {repo_url}")
	            return f"Success! Dataset with {len(chunks)} chunks saved to Hugging Face: {repo_url}"

	        except huggingface_hub.utils.HfHubHTTPError as hf_http_err:
	            logger.error(f"Hugging Face Hub HTTP Error: {hf_http_err}", exc_info=True)
	            # Guard against an error object without an attached response.
	            status_code = getattr(getattr(hf_http_err, 'response', None), 'status_code', 'unknown')
	            return f"Error: Hugging Face Hub Error pushing to '{repo_name}'. Status: {status_code}. Check token permissions, repo name, and network. Details: {hf_http_err}"
	        except Exception as push_err:
	            logger.error(f"Failed to push dataset to '{repo_name}': {push_err}", exc_info=True)
	            return f"Error: Failed to push dataset to Hugging Face repository '{repo_name}'. ({push_err})"

	    except Exception as e:
	        # Catch any unexpected errors during the overall process
	        logger.error(f"An unexpected error occurred processing '{source_filename}': {str(e)}", exc_info=True)
	        return f"An unexpected error occurred: {str(e)}"
	    finally:
	        logger.info(f"--- Finished processing for file: {source_filename} ---")


	# --- Gradio Interface ---
	# Layout: a two-column row (inputs and options on the left, status box on the
	# right), followed by the example rows and a footer note.
	_app_theme = gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")

	with gr.Blocks(theme=_app_theme, title="Mistral OCR & Dataset Creator") as demo:
	    gr.Markdown("# Mistral OCR, Markdown Chunking, and Hugging Face Dataset Creator")
	    gr.Markdown(
	        """
	Upload a PDF or image file (PNG, JPG, WEBP, BMP). The application will:
	1. Extract text and images using Mistral OCR.
	2. Embed images as base64 data URIs directly within the extracted markdown text.
	3. Chunk the resulting markdown based on headers and optionally recursively by character count.
	4. Store any embedded base64 images found within each chunk in the chunk's metadata (`metadata['images_base64']`).
	5. Create or update a Hugging Face Dataset with the processed chunks (`chunk_id`, `text`, `metadata`, `source_filename`).
	"""
	    )

	    with gr.Row():
	        # Left column: upload, chunking options, Hub destination, submit.
	        with gr.Column(scale=1):
	            file_input = gr.File(
	                type="filepath",  # Ensures we get a path usable by `open()`
	                file_types=['.pdf', '.png', '.jpg', '.jpeg', '.webp', '.bmp'],
	                label="Upload PDF or Image File",
	            )

	            gr.Markdown("## Chunking Options")
	            chunk_size = gr.Slider(
	                label="Max Chunk Size (Characters)",
	                info="Approximate target size. Set to 0 to disable recursive splitting (uses only header splits).",
	                minimum=0,
	                maximum=8000,
	                step=100,
	                value=1000,
	            )
	            chunk_overlap = gr.Slider(
	                label="Chunk Overlap (Characters)",
	                info="Number of characters to overlap between consecutive chunks (if recursive splitting is enabled).",
	                minimum=0,
	                maximum=1000,
	                step=50,
	                value=200,
	            )
	            strip_headers = gr.Checkbox(
	                value=True,
	                label="Strip Markdown Headers (#) from Chunk Content",
	                info="If checked, removes '#', '##' etc. from the start of the text in each chunk.",
	            )

	            gr.Markdown("## Hugging Face Output Options")
	            repo_name = gr.Textbox(
	                placeholder="your-username/your-dataset-name",
	                label="Target Hugging Face Dataset Repository",
	                info="The dataset will be pushed here (e.g., 'my-org/my-ocr-data'). Will be created if it doesn't exist.",
	            )
	            hf_token = gr.Textbox(
	                type="password",
	                placeholder="hf_...",
	                label="Hugging Face Token (write permission)",
	                info="Required to create/push the dataset. If blank, will try using token from local `huggingface-cli login`.",
	            )

	            submit_btn = gr.Button("Process File and Save to Hugging Face", variant="primary")

	        # Right column: read-only status output.
	        with gr.Column(scale=1):
	            output = gr.Textbox(
	                interactive=False,
	                lines=20,
	                label="Processing Log / Result Status",
	                placeholder="Processing steps and final result will appear here...",
	            )

	    # The button and the example rows feed the same components, in the
	    # parameter order of process_file_and_save.
	    _pipeline_inputs = [file_input, chunk_size, chunk_overlap, strip_headers, hf_token, repo_name]

	    submit_btn.click(
	        fn=process_file_and_save,
	        inputs=_pipeline_inputs,
	        outputs=output,
	    )

	    gr.Examples(
	        examples=[
	            [None, 1000, 200, True, "", "hf-username/my-first-ocr-dataset"],
	            [None, 2000, 400, True, "", "hf-username/large-chunk-ocr-data"],
	            [None, 0, 0, False, "", "hf-username/header-only-ocr-data"],  # Header-only splitting
	        ],
	        inputs=_pipeline_inputs,
	        outputs=output,
	        fn=process_file_and_save,
	        cache_examples=False,  # No caching: the function calls external APIs and reads files
	    )

	    gr.Markdown("--- \n Requires `MISTRAL_API_KEY` environment variable or being logged in via `huggingface-cli login`.")

	# --- Launch the Gradio App ---
	if __name__ == "__main__":
	    # Report a missing client on the console before the UI starts; the app is
	    # launched either way.
	    if not client:
	        if api_key:
	            # A key was present but no client exists.
	            print("\nCRITICAL: Mistral client failed to initialize. The application cannot perform OCR.")
	            print("Please check your MISTRAL_API_KEY and network connection.\n")
	        else:
	            print("\nWARNING: Mistral client not initialized because no API key was found.")
	            print("OCR functionality will fail. Please set MISTRAL_API_KEY or log in via `huggingface-cli login`.\n")

	    # GRADIO_SHARE=true (case-insensitive) requests a public share link;
	    # debug=True surfaces detailed errors in the console.
	    share_publicly = os.getenv('GRADIO_SHARE', 'False').lower() == 'true'
	    demo.launch(share=share_publicly, debug=True)