Svngoku committed on
Commit 84f4f2c · verified · 1 Parent(s): 93be02b

Update app.py

Files changed (1)
  1. app.py +53 -30
app.py CHANGED
@@ -118,7 +118,7 @@ def get_combined_markdown(ocr_response: Any) -> Tuple[str, str, Dict[str, str]]:
 
             processed_markdowns.append(current_processed_markdown)
 
-        logger.info(f"Processed {len(processed_markdowns)} pages with {len(image_data_map)} images.")
+        logger.info(f"Processed {len(processed_markdowns)} pages with {len(image_data_map)} images.")
         return "\n\n".join(processed_markdowns), "\n\n".join(raw_markdowns), image_data_map
 
     except Exception as e:
@@ -286,12 +286,12 @@ def get_hf_token(explicit_token: str = None) -> str:
     return None
 
 def process_file_and_save(
-    file_obj: Any, chunk_size: int, chunk_overlap: int,
+    file_objs: List[Any], chunk_size: int, chunk_overlap: int,
     strip_headers: bool, hf_token: str, repo_name: str
 ) -> str:
-    """Orchestrates OCR, chunking, and saving to Hugging Face."""
-    if not file_obj:
-        return "Error: No file uploaded."
+    """Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
+    if not file_objs:
+        return "Error: No files uploaded."
     if not repo_name or '/' not in repo_name:
         return "Error: Invalid repository name (use 'username/dataset-name')."
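Note on the new signature: the handler now takes a list (`file_objs: List[Any]`), which assumes `List` is imported from `typing` elsewhere in app.py. A minimal sketch of how a list-based handler can resolve source filenames, assuming Gradio's `gr.File(type="filepath")` passes plain path strings while older tempfile-style objects expose `.orig_name`/`.name`; the helper names here are illustrative, not part of app.py:

import os
from typing import Any, List

def resolve_source_filename(file_obj: Any) -> str:
    # Hypothetical helper: cover both plain path strings (gr.File with
    # type="filepath") and tempfile-style objects with .orig_name/.name.
    if isinstance(file_obj, str):
        return os.path.basename(file_obj)
    return getattr(file_obj, "orig_name", None) or os.path.basename(file_obj.name)

def collect_source_filenames(file_objs: List[Any]) -> List[str]:
    # Mirrors the list-based signature: iterate instead of assuming one upload.
    return [resolve_source_filename(f) for f in file_objs]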
 
@@ -311,25 +311,44 @@ def process_file_and_save(
         3. Run `huggingface-cli login` in your terminal"""
 
     try:
-        source_filename = getattr(file_obj, 'orig_name', os.path.basename(file_obj.name))
-        logger.info(f"--- Starting processing for file: {source_filename} ---")
-
-        processed_markdown, raw_markdown, img_map = perform_ocr_file(file_obj)
-        if not processed_markdown or processed_markdown.startswith("Error:"):
-            return processed_markdown
-
-        chunks = chunk_markdown(processed_markdown, chunk_size, chunk_overlap, strip_headers)
-        if not chunks:
-            return "Error: Failed to chunk the document."
-
-        data = {
-            "chunk_id": [f"{source_filename}_chunk_{i}" for i in range(len(chunks))],
-            "text": [chunk.page_content or "" for chunk in chunks],
-            "metadata": [chunk.metadata for chunk in chunks],
-            "source_filename": [source_filename] * len(chunks),
+        all_data = {
+            "chunk_id": [],
+            "text": [],
+            "metadata": [],
+            "source_filename": []
         }
+        total_chunks = 0
+        files_processed = 0
+        error_messages = []
+
+        for file_idx, file_obj in enumerate(file_objs, 1):
+            source_filename = getattr(file_obj, 'orig_name', os.path.basename(file_obj.name))
+            logger.info(f"--- Processing file {file_idx}/{len(file_objs)}: {source_filename} ---")
+
+            processed_markdown, raw_markdown, img_map = perform_ocr_file(file_obj)
+            if processed_markdown.startswith("Error:"):
+                error_messages.append(f"File '{source_filename}': {processed_markdown}")
+                logger.error(f"Failed to process file {source_filename}: {processed_markdown}")
+                continue
+
+            chunks = chunk_markdown(processed_markdown, chunk_size, chunk_overlap, strip_headers)
+            if not chunks:
+                error_messages.append(f"File '{source_filename}': Failed to chunk the document.")
+                logger.error(f"Failed to chunk file {source_filename}")
+                continue
+
+            all_data["chunk_id"].extend([f"{source_filename}_chunk_{i}" for i in range(len(chunks))])
+            all_data["text"].extend([chunk.page_content or "" for chunk in chunks])
+            all_data["metadata"].extend([chunk.metadata for chunk in chunks])
+            all_data["source_filename"].extend([source_filename] * len(chunks))
+            total_chunks += len(chunks)
+            files_processed += 1
+            logger.info(f"File {source_filename}: Added {len(chunks)} chunks. Total chunks: {total_chunks}")
+
+        if not all_data["chunk_id"]:
+            return "Error: No valid data processed from any files.\n" + "\n".join(error_messages)
 
-        dataset = Dataset.from_dict(data)
+        dataset = Dataset.from_dict(all_data)
         api = HfApi(token=effective_hf_token)
 
         try:
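Note on the accumulation pattern: `Dataset.from_dict` builds the table column-wise, so the four lists in `all_data` must stay the same length; the per-file `extend` calls above keep them in lockstep. A self-contained sketch with invented rows:

from datasets import Dataset

# Invented example rows mirroring the all_data column layout built above.
all_data = {
    "chunk_id": ["report.pdf_chunk_0", "report.pdf_chunk_1"],
    "text": ["# Introduction ...", "## Methods ..."],
    "metadata": [{"Header 1": "Introduction"}, {"Header 1": "Methods"}],
    "source_filename": ["report.pdf", "report.pdf"],
}

dataset = Dataset.from_dict(all_data)
print(dataset.num_rows)        # 2
print(dataset[0]["chunk_id"])  # report.pdf_chunk_0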
@@ -346,9 +365,12 @@ def process_file_and_save(
             logger.info(f"Created repository '{repo_name}'.")
 
             dataset.push_to_hub(repo_name, token=effective_hf_token,
-                                commit_message=f"Add OCR data from {source_filename}")
+                                commit_message=f"Add OCR data from {files_processed} files")
             repo_url = f"https://huggingface.co/datasets/{repo_name}"
-            return f"Success! Dataset with {len(chunks)} chunks saved to: {repo_url}"
+            result = f"Success! Dataset with {total_chunks} chunks from {files_processed}/{len(file_objs)} files saved to: {repo_url}"
+            if error_messages:
+                result += "\n\nErrors encountered:\n" + "\n".join(error_messages)
+            return result
 
         except huggingface_hub.utils.HfHubHTTPError as hf_http_err:
             status = getattr(hf_http_err.response, 'status_code', 'Unknown')
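Note on consuming the result: `push_to_hub` on a `Dataset` writes a single `train` split by default, with chunks from every uploaded file in one table. A quick sketch of pulling it back down (the repo id and filename are placeholders):

from datasets import load_dataset

# Placeholder repo id; use the 'username/dataset-name' entered in the UI.
ds = load_dataset("username/dataset-name", split="train")

# Filter the chunks belonging to one uploaded file via the source_filename column.
report_chunks = ds.filter(lambda row: row["source_filename"] == "report.pdf")
print(len(ds), len(report_chunks))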
@@ -359,7 +381,7 @@ def process_file_and_save(
             return f"Error: Hugging Face Hub Error (Status {status}): {hf_http_err}"
     except Exception as e:
         logger.error(f"Unexpected error: {e}", exc_info=True)
-        return f"Unexpected error: {str(e)}"
+        return f"Unexpected error: {str(e)}\n" + "\n".join(error_messages)
 
 # --- Gradio Interface ---
 with gr.Blocks(title="Mistral OCR & Dataset Creator",
@@ -367,21 +389,22 @@ with gr.Blocks(title="Mistral OCR & Dataset Creator",
     gr.Markdown("# Mistral OCR, Markdown Chunking, and Hugging Face Dataset Creator")
     gr.Markdown(
         """
-        Upload a PDF or image file. The application will:
-        1. Extract text and images using Mistral OCR
+        Upload one or more PDF or image files. The application will:
+        1. Extract text and images using Mistral OCR for each file
         2. Embed images as base64 data URIs in markdown
         3. Chunk markdown by headers and optionally character count
         4. Store embedded images in chunk metadata
-        5. Create/update a Hugging Face Dataset
+        5. Create/update a Hugging Face Dataset with all processed data
         """
     )
 
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
-                label="Upload PDF or Image File",
+                label="Upload PDF or Image Files",
                 file_types=['.pdf', '.png', '.jpg', '.jpeg', '.webp', '.bmp'],
-                type="filepath"
+                type="filepath",
+                file_count="multiple"  # Allow multiple file uploads
             )
             gr.Markdown("## Chunking Options")
             chunk_size = gr.Slider(minimum=0, maximum=8000, value=1000, step=100,
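Note on the UI change: assuming Gradio 4.x semantics, `file_count="multiple"` combined with `type="filepath"` hands the click handler a list of local file paths, which is what the reworked `process_file_and_save` iterates over. A stripped-down, self-contained sketch (component and handler names are illustrative, not taken from app.py):

import gradio as gr
from typing import List, Optional

def handle_files(paths: Optional[List[str]]) -> str:
    # With file_count="multiple" and type="filepath", the callback receives
    # a list of local file paths (or None when nothing was uploaded).
    if not paths:
        return "Error: No files uploaded."
    return f"Received {len(paths)} file(s):\n" + "\n".join(paths)

with gr.Blocks() as demo:
    files_in = gr.File(label="Upload PDF or Image Files",
                       file_types=[".pdf", ".png", ".jpg"],
                       type="filepath", file_count="multiple")
    result_box = gr.Textbox(label="Result")
    gr.Button("Process").click(handle_files, inputs=files_in, outputs=result_box)

demo.launch()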
 