Spaces:

ByteDance
/

Dolphin

Running on Zero

App Files Files Community

Dolphin / inference_hugg.py

xfey

[init] update application file

dfb1341 8 days ago

raw

history blame contribute delete

10.1 kB

	"""
	Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
	SPDX-License-Identifier: MIT
	"""

	import argparse
	import glob
	import os

	import cv2
	import torch
	from PIL import Image
	from transformers import AutoProcessor, VisionEncoderDecoderModel

	from utils.utils import *


	class DOLPHIN:
	    """Thin wrapper around the Dolphin vision encoder-decoder checkpoint.

	    Loads the processor/model pair once and exposes ``chat`` for prompted
	    generation on a single image or a batch of images.
	    """

	    def __init__(self, model_id_or_path):
	        """Initialize the Hugging Face model

	        Args:
	            model_id_or_path: Path to local model or Hugging Face model ID
	        """
	        # Load model from local path or Hugging Face hub
	        self.processor = AutoProcessor.from_pretrained(model_id_or_path)
	        self.model = VisionEncoderDecoderModel.from_pretrained(model_id_or_path)
	        self.model.eval()

	        # Set device and precision. Half precision is used on CUDA only:
	        # many CPU kernels are missing or very slow in fp16, so CPU runs
	        # stay in fp32.
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        self.model.to(self.device)
	        if self.device == "cuda":
	            self.dtype = torch.float16
	            self.model = self.model.half()
	        else:
	            self.dtype = torch.float32
	            self.model = self.model.float()

	        # set tokenizer
	        self.tokenizer = self.processor.tokenizer

	    def chat(self, prompt, image):
	        """Process an image or batch of images with the given prompt(s)

	        Args:
	            prompt: Text prompt or list of prompts to guide the model
	            image: PIL Image or list of PIL Images to process

	        Returns:
	            Generated text for a single image, or a list of texts (one per
	            image) when ``image`` is a list. An empty list of images yields
	            an empty list.

	        Raises:
	            ValueError: If ``image`` is a list and ``prompt`` is a list of a
	                different length.
	        """
	        # Check if we're dealing with a batch
	        is_batch = isinstance(image, list)

	        if is_batch:
	            # Batch of images; a single prompt is shared by every image
	            images = image
	            prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
	            if len(prompts) != len(images):
	                raise ValueError(
	                    f"Number of prompts ({len(prompts)}) does not match "
	                    f"number of images ({len(images)})"
	                )
	            if not images:
	                # Nothing to run; do not hand an empty batch to the processor
	                return []
	        else:
	            # Single image, wrap it in a list for consistent processing
	            images = [image]
	            prompts = [prompt]

	        # Prepare image (cast to the same dtype the model weights use)
	        batch_inputs = self.processor(images, return_tensors="pt", padding=True)
	        batch_pixel_values = batch_inputs.pixel_values.to(device=self.device, dtype=self.dtype)

	        # Prepare prompt
	        prompts = [f"<s>{p} <Answer/>" for p in prompts]
	        batch_prompt_inputs = self.tokenizer(
	            prompts,
	            add_special_tokens=False,
	            return_tensors="pt",
	        )

	        batch_prompt_ids = batch_prompt_inputs.input_ids.to(self.device)
	        batch_attention_mask = batch_prompt_inputs.attention_mask.to(self.device)

	        # Generate text (greedy decoding, prompt fed as decoder prefix)
	        outputs = self.model.generate(
	            pixel_values=batch_pixel_values,
	            decoder_input_ids=batch_prompt_ids,
	            decoder_attention_mask=batch_attention_mask,
	            min_length=1,
	            max_length=4096,
	            pad_token_id=self.tokenizer.pad_token_id,
	            eos_token_id=self.tokenizer.eos_token_id,
	            use_cache=True,
	            bad_words_ids=[[self.tokenizer.unk_token_id]],
	            return_dict_in_generate=True,
	            do_sample=False,
	            num_beams=1,
	            repetition_penalty=1.1,
	        )

	        # Process output; special tokens are kept so the prompt text can be
	        # matched and removed below
	        sequences = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)

	        # Clean prompt text and padding/end markers from output
	        results = [
	            sequence.replace(full_prompt, "").replace("<pad>", "").replace("</s>", "").strip()
	            for sequence, full_prompt in zip(sequences, prompts)
	        ]

	        # Return a single result for single image input
	        if not is_batch:
	            return results[0]
	        return results


	def process_page(image_path, model, save_dir, max_batch_size=None):
	    """Parse document images with two stages

	    Args:
	        image_path: Path of the page image to parse.
	        model: Object exposing ``chat(prompt, image)``; it is used for the
	            layout pass and forwarded to ``process_elements``.
	        save_dir: Directory forwarded to ``save_outputs``.
	        max_batch_size: Forwarded to ``process_elements``.

	    Returns:
	        Tuple ``(json_path, recognition_results)``: the value returned by
	        ``save_outputs`` and the element list from ``process_elements``.
	    """
	    # Stage 1: Page-level layout and reading order parsing
	    # convert() yields a fully loaded copy, so the file handle can be
	    # released immediately instead of lingering until garbage collection.
	    with Image.open(image_path) as page_file:
	        pil_image = page_file.convert("RGB")
	    layout_output = model.chat("Parse the reading order of this document.", pil_image)

	    # Stage 2: Element-level content parsing
	    padded_image, dims = prepare_image(pil_image)
	    recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size)

	    # Save outputs
	    json_path = save_outputs(recognition_results, image_path, save_dir)

	    return json_path, recognition_results


	def process_elements(layout_results, padded_image, dims, model, max_batch_size=None):
	    """Recognize every layout element of a page, batching by element type.

	    Args:
	        layout_results: Raw layout string; parsed with ``parse_layout_string``
	            into ``(bbox, label)`` pairs.
	        padded_image: Page image that is sliced as ``[y1:y2, x1:x2]`` and
	            converted with ``cv2.COLOR_BGR2RGB`` (presumably a BGR numpy
	            array from ``prepare_image`` - confirm against utils).
	        dims: Passed through to ``process_coordinates``.
	        model: Object exposing ``chat``; forwarded to ``process_element_batch``.
	        max_batch_size: Forwarded to ``process_element_batch``.

	    Returns:
	        List of dicts with ``label``, ``bbox``, ``text`` and
	        ``reading_order``, sorted by ``reading_order``.
	    """
	    parsed_layout = parse_layout_string(layout_results)

	    figures = []  # "fig" regions: recorded with empty text, never sent to the model
	    tables = []  # "tab" regions
	    texts = []  # everything else
	    last_box = None
	    order = 0

	    # Walk the layout once, cropping each region and queueing it by type
	    for bbox, label in parsed_layout:
	        try:
	            x1, y1, x2, y2, ox1, oy1, ox2, oy2, last_box = process_coordinates(
	                bbox, padded_image, dims, last_box
	            )

	            region = padded_image[y1:y2, x1:x2]
	            if region.size > 0:
	                original_bbox = [ox1, oy1, ox2, oy2]
	                if label == "fig":
	                    figures.append(
	                        {
	                            "label": label,
	                            "bbox": original_bbox,
	                            "text": "",
	                            "reading_order": order,
	                        }
	                    )
	                else:
	                    rgb_region = cv2.cvtColor(region, cv2.COLOR_BGR2RGB)
	                    queued = {
	                        "crop": Image.fromarray(rgb_region),
	                        "label": label,
	                        "bbox": original_bbox,
	                        "reading_order": order,
	                    }
	                    target = tables if label == "tab" else texts
	                    target.append(queued)

	            # The position counter advances for each entry handled without error
	            order += 1

	        except Exception as e:
	            # Best effort: report the failing region and move on to the next one
	            print(f"Error processing bbox with label {label}: {str(e)}")
	            continue

	    # Figures first, then text, then tables; the final sort restores page order
	    combined = list(figures)
	    work_queues = (
	        (texts, "Read text in the image."),
	        (tables, "Parse the table in the image."),
	    )
	    for queue, prompt in work_queues:
	        if queue:
	            combined.extend(process_element_batch(queue, model, prompt, max_batch_size))

	    combined.sort(key=lambda item: item.get("reading_order", 0))

	    return combined


	def process_element_batch(elements, model, prompt, max_batch_size=None):
	    """Process elements of the same type in batches

	    Args:
	        elements: List of dicts carrying ``crop``, ``label``, ``bbox`` and
	            ``reading_order``.
	        model: Object exposing ``chat(prompts, crops)`` that returns one
	            string per crop.
	        prompt: Prompt applied to every element.
	        max_batch_size: Upper bound on crops per ``chat`` call; ``None`` or a
	            non-positive value means all elements go in a single call.

	    Returns:
	        List of dicts with ``label``, ``bbox``, stripped ``text`` and
	        ``reading_order``, in the same order as ``elements``. Empty input
	        yields an empty list.
	    """
	    results = []

	    total = len(elements)
	    if total == 0:
	        # A zero batch size would make range() raise on its step argument
	        return results

	    # Determine batch size
	    batch_size = total
	    if max_batch_size is not None and max_batch_size > 0:
	        batch_size = min(batch_size, max_batch_size)

	    # Process in batches
	    for start in range(0, total, batch_size):
	        batch_elements = elements[start:start + batch_size]
	        crops_list = [elem["crop"] for elem in batch_elements]

	        # Use the same prompt for all elements in the batch
	        prompts_list = [prompt] * len(crops_list)

	        # Batch inference
	        batch_results = model.chat(prompts_list, crops_list)

	        # Pair each generated text with the element it came from
	        results.extend(
	            {
	                "label": elem["label"],
	                "bbox": elem["bbox"],
	                "text": text.strip(),
	                "reading_order": elem["reading_order"],
	            }
	            for elem, text in zip(batch_elements, batch_results)
	        )

	    return results


	def main():
	    """Command-line entry point: parse one image or every image in a directory.

	    Reads ``--input_path``, ``--save_dir``, ``--max_batch_size`` and
	    ``--model_path`` from the command line, loads the model once, and runs
	    ``process_page`` on each image. A failure on one image is reported and
	    the remaining images are still processed.

	    Raises:
	        FileNotFoundError: If ``--input_path`` is neither a directory nor an
	            existing file.
	    """
	    parser = argparse.ArgumentParser(description="Document processing tool using DOLPHIN model")
	    parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image or directory of images")
	    parser.add_argument(
	        "--save_dir",
	        type=str,
	        default=None,
	        help="Directory to save parsing results (default: same as input directory)",
	    )
	    parser.add_argument(
	        "--max_batch_size",
	        type=int,
	        default=16,
	        help="Maximum number of document elements to parse in a single batch (default: 16)",
	    )
	    parser.add_argument(
	        "--model_path",
	        type=str,
	        default="ByteDance/Dolphin",
	        help="Path to local model or Hugging Face model ID (default: ByteDance/Dolphin)",
	    )
	    args = parser.parse_args()

	    # Collect Document Images first so a bad path fails before the model loads
	    if os.path.isdir(args.input_path):
	        # A set avoids listing a file twice where glob matching ignores case
	        # (e.g. Windows), since both "*.jpg" and "*.JPG" would match it.
	        found = set()
	        for ext in [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]:
	            found.update(glob.glob(os.path.join(args.input_path, f"*{ext}")))
	        image_files = sorted(found)
	        default_save_dir = args.input_path
	    else:
	        if not os.path.exists(args.input_path):
	            raise FileNotFoundError(f"Input path {args.input_path} does not exist")
	        image_files = [args.input_path]
	        # dirname() is "" for a bare filename; fall back to the current directory
	        default_save_dir = os.path.dirname(args.input_path) or "."

	    save_dir = args.save_dir or default_save_dir
	    setup_output_dirs(save_dir)

	    # Load Model
	    model = DOLPHIN(args.model_path)

	    total_samples = len(image_files)
	    print(f"\nTotal samples to process: {total_samples}")

	    # Process All Document Images
	    for image_path in image_files:
	        print(f"\nProcessing {image_path}")
	        try:
	            process_page(
	                image_path=image_path,
	                model=model,
	                save_dir=save_dir,
	                max_batch_size=args.max_batch_size,
	            )

	            print(f"Processing completed. Results saved to {save_dir}")

	        except Exception as e:
	            # Best effort per image: report and keep going with the rest
	            print(f"Error processing {image_path}: {str(e)}")
	            continue


	# Run the command-line entry point only when executed as a script,
	# not when this module is imported.
	if __name__ == "__main__":
	    main()