Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -118,7 +118,7 @@ def get_combined_markdown(ocr_response: Any) -> Tuple[str, str, Dict[str, str]]:
|
|
118 |
|
119 |
processed_markdowns.append(current_processed_markdown)
|
120 |
|
121 |
-
logger.info(f"Processed {len(processed_markdowns)} pages with {len(image_data_map)} images.
|
122 |
return "\n\n".join(processed_markdowns), "\n\n".join(raw_markdowns), image_data_map
|
123 |
|
124 |
except Exception as e:
|
@@ -286,12 +286,12 @@ def get_hf_token(explicit_token: str = None) -> str:
|
|
286 |
return None
|
287 |
|
288 |
def process_file_and_save(
|
289 |
-
|
290 |
strip_headers: bool, hf_token: str, repo_name: str
|
291 |
) -> str:
|
292 |
-
"""Orchestrates OCR, chunking, and saving to Hugging Face."""
|
293 |
-
if not
|
294 |
-
return "Error: No
|
295 |
if not repo_name or '/' not in repo_name:
|
296 |
return "Error: Invalid repository name (use 'username/dataset-name')."
|
297 |
|
@@ -311,25 +311,44 @@ def process_file_and_save(
|
|
311 |
3. Run `huggingface-cli login` in your terminal"""
|
312 |
|
313 |
try:
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
return processed_markdown
|
320 |
-
|
321 |
-
chunks = chunk_markdown(processed_markdown, chunk_size, chunk_overlap, strip_headers)
|
322 |
-
if not chunks:
|
323 |
-
return "Error: Failed to chunk the document."
|
324 |
-
|
325 |
-
data = {
|
326 |
-
"chunk_id": [f"{source_filename}_chunk_{i}" for i in range(len(chunks))],
|
327 |
-
"text": [chunk.page_content or "" for chunk in chunks],
|
328 |
-
"metadata": [chunk.metadata for chunk in chunks],
|
329 |
-
"source_filename": [source_filename] * len(chunks),
|
330 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
|
332 |
-
dataset = Dataset.from_dict(
|
333 |
api = HfApi(token=effective_hf_token)
|
334 |
|
335 |
try:
|
@@ -346,9 +365,12 @@ def process_file_and_save(
|
|
346 |
logger.info(f"Created repository '{repo_name}'.")
|
347 |
|
348 |
dataset.push_to_hub(repo_name, token=effective_hf_token,
|
349 |
-
commit_message=f"Add OCR data from {
|
350 |
repo_url = f"https://huggingface.co/datasets/{repo_name}"
|
351 |
-
|
|
|
|
|
|
|
352 |
|
353 |
except huggingface_hub.utils.HfHubHTTPError as hf_http_err:
|
354 |
status = getattr(hf_http_err.response, 'status_code', 'Unknown')
|
@@ -359,7 +381,7 @@ def process_file_and_save(
|
|
359 |
return f"Error: Hugging Face Hub Error (Status {status}): {hf_http_err}"
|
360 |
except Exception as e:
|
361 |
logger.error(f"Unexpected error: {e}", exc_info=True)
|
362 |
-
return f"Unexpected error: {str(e)}"
|
363 |
|
364 |
# --- Gradio Interface ---
|
365 |
with gr.Blocks(title="Mistral OCR & Dataset Creator",
|
@@ -367,21 +389,22 @@ with gr.Blocks(title="Mistral OCR & Dataset Creator",
|
|
367 |
gr.Markdown("# Mistral OCR, Markdown Chunking, and Hugging Face Dataset Creator")
|
368 |
gr.Markdown(
|
369 |
"""
|
370 |
-
Upload
|
371 |
-
1. Extract text and images using Mistral OCR
|
372 |
2. Embed images as base64 data URIs in markdown
|
373 |
3. Chunk markdown by headers and optionally character count
|
374 |
4. Store embedded images in chunk metadata
|
375 |
-
5. Create/update a Hugging Face Dataset
|
376 |
"""
|
377 |
)
|
378 |
|
379 |
with gr.Row():
|
380 |
with gr.Column(scale=1):
|
381 |
file_input = gr.File(
|
382 |
-
label="Upload PDF or Image
|
383 |
file_types=['.pdf', '.png', '.jpg', '.jpeg', '.webp', '.bmp'],
|
384 |
-
type="filepath"
|
|
|
385 |
)
|
386 |
gr.Markdown("## Chunking Options")
|
387 |
chunk_size = gr.Slider(minimum=0, maximum=8000, value=1000, step=100,
|
|
|
118 |
|
119 |
processed_markdowns.append(current_processed_markdown)
|
120 |
|
121 |
+
logger.info(f"Processed {len(processed_markdowns)} pages with {len(image_data_map)} images.")
|
122 |
return "\n\n".join(processed_markdowns), "\n\n".join(raw_markdowns), image_data_map
|
123 |
|
124 |
except Exception as e:
|
|
|
286 |
return None
|
287 |
|
288 |
def process_file_and_save(
|
289 |
+
file_objs: List[Any], chunk_size: int, chunk_overlap: int,
|
290 |
strip_headers: bool, hf_token: str, repo_name: str
|
291 |
) -> str:
|
292 |
+
"""Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
|
293 |
+
if not file_objs:
|
294 |
+
return "Error: No files uploaded."
|
295 |
if not repo_name or '/' not in repo_name:
|
296 |
return "Error: Invalid repository name (use 'username/dataset-name')."
|
297 |
|
|
|
311 |
3. Run `huggingface-cli login` in your terminal"""
|
312 |
|
313 |
try:
|
314 |
+
all_data = {
|
315 |
+
"chunk_id": [],
|
316 |
+
"text": [],
|
317 |
+
"metadata": [],
|
318 |
+
"source_filename": []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
}
|
320 |
+
total_chunks = 0
|
321 |
+
files_processed = 0
|
322 |
+
error_messages = []
|
323 |
+
|
324 |
+
for file_idx, file_obj in enumerate(file_objs, 1):
|
325 |
+
source_filename = getattr(file_obj, 'orig_name', os.path.basename(file_obj.name))
|
326 |
+
logger.info(f"--- Processing file {file_idx}/{len(file_objs)}: {source_filename} ---")
|
327 |
+
|
328 |
+
processed_markdown, raw_markdown, img_map = perform_ocr_file(file_obj)
|
329 |
+
if processed_markdown.startswith("Error:"):
|
330 |
+
error_messages.append(f"File '{source_filename}': {processed_markdown}")
|
331 |
+
logger.error(f"Failed to process file {source_filename}: {processed_markdown}")
|
332 |
+
continue
|
333 |
+
|
334 |
+
chunks = chunk_markdown(processed_markdown, chunk_size, chunk_overlap, strip_headers)
|
335 |
+
if not chunks:
|
336 |
+
error_messages.append(f"File '{source_filename}': Failed to chunk the document.")
|
337 |
+
logger.error(f"Failed to chunk file {source_filename}")
|
338 |
+
continue
|
339 |
+
|
340 |
+
all_data["chunk_id"].extend([f"{source_filename}_chunk_{i}" for i in range(len(chunks))])
|
341 |
+
all_data["text"].extend([chunk.page_content or "" for chunk in chunks])
|
342 |
+
all_data["metadata"].extend([chunk.metadata for chunk in chunks])
|
343 |
+
all_data["source_filename"].extend([source_filename] * len(chunks))
|
344 |
+
total_chunks += len(chunks)
|
345 |
+
files_processed += 1
|
346 |
+
logger.info(f"File {source_filename}: Added {len(chunks)} chunks. Total chunks: {total_chunks}")
|
347 |
+
|
348 |
+
if not all_data["chunk_id"]:
|
349 |
+
return "Error: No valid data processed from any files.\n" + "\n".join(error_messages)
|
350 |
|
351 |
+
dataset = Dataset.from_dict(all_data)
|
352 |
api = HfApi(token=effective_hf_token)
|
353 |
|
354 |
try:
|
|
|
365 |
logger.info(f"Created repository '{repo_name}'.")
|
366 |
|
367 |
dataset.push_to_hub(repo_name, token=effective_hf_token,
|
368 |
+
commit_message=f"Add OCR data from {files_processed} files")
|
369 |
repo_url = f"https://huggingface.co/datasets/{repo_name}"
|
370 |
+
result = f"Success! Dataset with {total_chunks} chunks from {files_processed}/{len(file_objs)} files saved to: {repo_url}"
|
371 |
+
if error_messages:
|
372 |
+
result += "\n\nErrors encountered:\n" + "\n".join(error_messages)
|
373 |
+
return result
|
374 |
|
375 |
except huggingface_hub.utils.HfHubHTTPError as hf_http_err:
|
376 |
status = getattr(hf_http_err.response, 'status_code', 'Unknown')
|
|
|
381 |
return f"Error: Hugging Face Hub Error (Status {status}): {hf_http_err}"
|
382 |
except Exception as e:
|
383 |
logger.error(f"Unexpected error: {e}", exc_info=True)
|
384 |
+
return f"Unexpected error: {str(e)}\n" + "\n".join(error_messages)
|
385 |
|
386 |
# --- Gradio Interface ---
|
387 |
with gr.Blocks(title="Mistral OCR & Dataset Creator",
|
|
|
389 |
gr.Markdown("# Mistral OCR, Markdown Chunking, and Hugging Face Dataset Creator")
|
390 |
gr.Markdown(
|
391 |
"""
|
392 |
+
Upload one or more PDF or image files. The application will:
|
393 |
+
1. Extract text and images using Mistral OCR for each file
|
394 |
2. Embed images as base64 data URIs in markdown
|
395 |
3. Chunk markdown by headers and optionally character count
|
396 |
4. Store embedded images in chunk metadata
|
397 |
+
5. Create/update a Hugging Face Dataset with all processed data
|
398 |
"""
|
399 |
)
|
400 |
|
401 |
with gr.Row():
|
402 |
with gr.Column(scale=1):
|
403 |
file_input = gr.File(
|
404 |
+
label="Upload PDF or Image Files",
|
405 |
file_types=['.pdf', '.png', '.jpg', '.jpeg', '.webp', '.bmp'],
|
406 |
+
type="filepath",
|
407 |
+
file_count="multiple" # Allow multiple file uploads
|
408 |
)
|
409 |
gr.Markdown("## Chunking Options")
|
410 |
chunk_size = gr.Slider(minimum=0, maximum=8000, value=1000, step=100,
|