"""Model-loading and request-handling functions for a Donut image-to-text model.

NOTE(review): the ``model_fn`` / ``transform_fn`` names look like the
SageMaker inference-handler convention -- confirm against the deployment.
"""

import base64
import io
import json
import re

import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Text tokenized (without special tokens) and fed to the decoder as its
# starting input.
# NOTE(review): this is empty in the file as found. Donut checkpoints are
# usually driven by a task-specific start token -- confirm the intended prompt
# for the deployed checkpoint and set it here.
TASK_PROMPT = ""


def model_fn(model_dir):
    """Load the Donut processor and model from ``model_dir``.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    evaluation mode once here, so request handling does not have to repeat it.

    Args:
        model_dir: Location passed to ``from_pretrained`` for both the
            processor and the model.

    Returns:
        A ``(model, processor, device)`` tuple.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    processor = DonutProcessor.from_pretrained(model_dir)
    model = VisionEncoderDecoderModel.from_pretrained(model_dir)
    model.to(device)
    model.eval()
    return model, processor, device


def transform_fn(model, request_body, input_content_type, output_content_type):
    """Run the model on one base64-encoded image and return the decoded text.

    Args:
        model: The ``(model, processor, device)`` tuple produced by
            ``model_fn``.
        request_body: JSON document whose ``'inputs'`` entry holds the
            base64-encoded image.
        input_content_type: Content type of ``request_body``; only
            ``'application/json'`` is accepted.
        output_content_type: Returned unchanged alongside the response body.

    Returns:
        A ``(body, output_content_type)`` tuple where ``body`` is the JSON
        string ``{"result": <decoded text>}``.

    Raises:
        ValueError: If ``input_content_type`` is not ``'application/json'``.
    """
    net, processor, device = model

    if input_content_type != 'application/json':
        raise ValueError(f"Unsupported content type: {input_content_type}")

    payload = json.loads(request_body)
    image_bytes = base64.b64decode(payload['inputs'])
    # convert() returns an independent image, so the decoder handle opened by
    # Image.open can be released as soon as the conversion is done.
    with Image.open(io.BytesIO(image_bytes)) as raw_image:
        image = raw_image.convert("RGB")

    # Preprocess the image
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

    tokenizer = processor.tokenizer
    generate_kwargs = {
        'max_length': net.config.decoder.max_position_embeddings,
        'pad_token_id': tokenizer.pad_token_id,
        'eos_token_id': tokenizer.eos_token_id,
        'early_stopping': True,
        # Forbid the unknown token so it never appears in the output.
        'bad_words_ids': [[tokenizer.unk_token_id]],
        'return_dict_in_generate': True,
    }
    if TASK_PROMPT:
        generate_kwargs['decoder_input_ids'] = tokenizer(
            TASK_PROMPT, add_special_tokens=False, return_tensors="pt"
        ).input_ids.to(device)
    # An empty prompt tokenizes to zero ids; passing that zero-length tensor as
    # decoder_input_ids gives generate() nothing to start from. In that case
    # the argument is omitted so generate() is expected to fall back to the
    # decoder start token configured on the model.

    # Run inference
    with torch.no_grad():
        generated = net.generate(pixel_values, **generate_kwargs)

    # Decode the output
    decoded_text = processor.batch_decode(generated.sequences)[0]
    decoded_text = decoded_text.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")
    # Drop only the first <...> tag (the leading task/start token).
    decoded_text = re.sub(r"<.*?>", "", decoded_text, count=1).strip()

    # Prepare the response
    prediction = {'result': decoded_text}
    return json.dumps(prediction), output_content_type