"""Gradio demo: extractive document question answering with a LayoutLMv2 checkpoint.

The user supplies a document image and a natural-language question; the model
predicts the start/end token positions of the answer span inside the OCR'd
document text, and the decoded span is returned as the answer.
"""
import os

# Runtime dependency installation (common Hugging Face Spaces pattern).
# NOTE(review): installing packages with os.system at import time is fragile
# and slow on every restart; prefer requirements.txt / packages.txt where the
# hosting platform supports them.
os.system('pip install torch torchvision')
os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
os.system('sudo apt-get install tesseract-ocr')
os.system('pip install -q pytesseract')

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForDocumentQuestionAnswering, pipeline
from PIL import Image

model_checkpoint = "pacman2223/test-mod"

# Copy the bundled sample images to the filenames referenced in `examples` below.
for src_path, dst_path in (("./sample_cv.png", "cv.png"), ("./sample_hack.png", "hack.png")):
    Image.open(src_path).save(dst_path)

# Load the processor and model ONCE at startup. The original reloaded both
# from the hub inside demo_process on every request, which is very slow.
processor = AutoProcessor.from_pretrained(model_checkpoint)
model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint)


def demo_process(img, question):
    """Answer `question` about the document image `img`.

    Args:
        img: document image (PIL image / array as provided by the Gradio
            "image" input component).
        question: natural-language question about the document.

    Returns:
        The decoded answer span (str) predicted by the extractive QA model.
    """
    with torch.no_grad():
        encoding = processor(img, question, return_tensors="pt")
        outputs = model(**encoding)

    # Highest-scoring start/end token positions delimit the answer span.
    predicted_start_idx = outputs.start_logits.argmax(-1).item()
    predicted_end_idx = outputs.end_logits.argmax(-1).item()

    # Decode the span once (the original computed this decode twice, the
    # first time discarding the result).
    predicted_answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
    return processor.tokenizer.decode(predicted_answer_tokens)


demo = gr.Interface(
    fn=demo_process,
    inputs=["image", "text"],
    outputs="json",
    title="BIP demonstration for `layoutlmv2` task",  # was a placeholder-free f-string
    description="""This model is trained with 1200 receipt images of DocVQA dataset.
""",
    examples=[["cv.png", "What are the relevant courses?"], ["hack.png", "When does the hackathon end?"]],
    cache_examples=False,
)
demo.launch()