Llama-3.1-8B-DALv0.1 / venv /lib /python3.12 /site-packages /transformers /agents /document_question_answering.py

Upload folder using huggingface_hub

d1ceb73 verified 11 months ago

3.62 kB

	#!/usr/bin/env python
	# coding=utf-8

	# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import re

	import numpy as np
	import torch

	from ..models.auto import AutoProcessor
	from ..models.vision_encoder_decoder import VisionEncoderDecoderModel
	from ..utils import is_vision_available
	from .tools import PipelineTool


	# PIL is an optional dependency: only import it when the vision extras are
	# installed. DocumentQuestionAnsweringTool.__init__ raises if they are not.
	if is_vision_available():
	    from PIL import Image


	class DocumentQuestionAnsweringTool(PipelineTool):
	    """Tool that answers a free-text question about a document image.

	    The tool wires an ``AutoProcessor`` and a ``VisionEncoderDecoderModel``
	    (by default the ``naver-clova-ix/donut-base-finetuned-docvqa`` checkpoint)
	    into the three pipeline stages defined below: ``encode`` -> ``forward`` ->
	    ``decode``.
	    """

	    default_checkpoint = "naver-clova-ix/donut-base-finetuned-docvqa"
	    description = "This is a tool that answers a question about an document (pdf). It returns a text that contains the answer to the question."
	    name = "document_qa"
	    pre_processor_class = AutoProcessor
	    model_class = VisionEncoderDecoderModel

	    # Schema of the arguments accepted by the tool and of its result.
	    inputs = {
	        "document": {
	            "type": "image",
	            "description": "The image containing the information. Can be a PIL Image or a string path to the image.",
	        },
	        "question": {"type": "text", "description": "The question in English"},
	    }
	    output_type = "text"

	    def __init__(self, *args, **kwargs):
	        """Create the tool, forwarding every argument to ``PipelineTool``.

	        Raises:
	            ValueError: If Pillow is not installed.
	        """
	        if not is_vision_available():
	            raise ValueError("Pillow must be installed to use the DocumentQuestionAnsweringTool.")

	        # Forward positional AND keyword arguments unchanged to the base class.
	        super().__init__(*args, **kwargs)

	    def encode(self, document: "Image", question: str):
	        """Build the model inputs for one (document, question) pair.

	        Args:
	            document: The document image, or a string path to an image file.
	            question: The question to ask about the document.

	        Returns:
	            A dict with ``"decoder_input_ids"`` (the tokenized task prompt) and
	            ``"pixel_values"`` (the processed image), both produced with
	            ``return_tensors="pt"``.
	        """
	        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
	        # str.replace (not str.format) so braces inside the question are left alone.
	        prompt = task_prompt.replace("{user_input}", question)
	        decoder_input_ids = self.pre_processor.tokenizer(
	            prompt, add_special_tokens=False, return_tensors="pt"
	        ).input_ids

	        if isinstance(document, str):
	            # Open the file inside a context manager so the handle is released
	            # as soon as the pixel data has been copied out by convert().
	            with Image.open(document) as opened:
	                rgb_image = opened.convert("RGB")
	            # Move the channel axis first before handing the data to torch.
	            img_array = np.array(rgb_image).transpose(2, 0, 1)
	            document = torch.tensor(img_array)

	        pixel_values = self.pre_processor(document, return_tensors="pt").pixel_values

	        return {"decoder_input_ids": decoder_input_ids, "pixel_values": pixel_values}

	    def forward(self, inputs):
	        """Run generation on the encoded inputs and return the generated sequences.

	        Args:
	            inputs: The dict returned by ``encode``.

	        Returns:
	            The ``sequences`` attribute of the ``generate`` output.
	        """
	        tokenizer = self.pre_processor.tokenizer
	        return self.model.generate(
	            inputs["pixel_values"].to(self.device),
	            decoder_input_ids=inputs["decoder_input_ids"].to(self.device),
	            max_length=self.model.decoder.config.max_position_embeddings,
	            early_stopping=True,
	            pad_token_id=tokenizer.pad_token_id,
	            eos_token_id=tokenizer.eos_token_id,
	            use_cache=True,
	            num_beams=1,
	            # Never let the model emit the unknown token.
	            bad_words_ids=[[tokenizer.unk_token_id]],
	            return_dict_in_generate=True,
	        ).sequences

	    def decode(self, outputs):
	        """Turn generated token ids into the answer string.

	        Args:
	            outputs: The sequences returned by ``forward``; only the first one
	                is decoded.

	        Returns:
	            The value stored under ``"answer"`` in the structure produced by
	            the processor's ``token2json``.
	        """
	        tokenizer = self.pre_processor.tokenizer
	        sequence = self.pre_processor.batch_decode(outputs)[0]
	        # Strip end-of-sequence and padding markers before parsing.
	        sequence = sequence.replace(tokenizer.eos_token, "")
	        sequence = sequence.replace(tokenizer.pad_token, "")
	        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
	        sequence = self.pre_processor.token2json(sequence)

	        return sequence["answer"]