# Spaces: Paused
import os | |
import tempfile | |
import gradio as gr | |
import numpy as np | |
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor | |
from qwen_vl_utils import process_vision_info | |
import torch | |
from ast import literal_eval | |
from PIL import Image | |
import logging | |
logging.basicConfig(level=logging.INFO)

# Single checkpoint identifier shared by the model and its processor.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

# Load the model on the available device(s).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)

# Load the matching processor (chat template, tokenisation, image handling).
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Prompt templates sent to the model together with the payslip image.
#
# `demo` parses the model's reply with `ast.literal_eval`, so the format shown
# to the model must itself be a valid Python literal; otherwise the model is
# being asked to reproduce something that cannot be parsed back.
# NOTE(review): the date key is spelled three ways across the two prompts
# ('Year-to-Date', 'Year-To_Date', 'Year-To-Date'). Left untouched because the
# spelling printed on the payslips cannot be determined from this file.

# Benefits / quota / payment section.
# Fixed: the '401k eru:' key was missing its closing quote and the
# 'quota summary' dict was missing the comma that separates it from
# 'payment method'.
other_benifits = '''Extract the following information in the given format:
{'other_benefits_and_information': {
'401k eru:': {'This Period':'', 'Year-to-Date':''}},
'quota summary':
{
'sick:': '',
'vacation:': '',
},
'payment method': 'eg. Direct payment',
'Amount': 'eg. 12.99'
}
'''

# Federal and California tax deduction section (already a valid literal).
tax_deductions = '''Extract the following information in the given format:
{
'tax_deductions': {
'federal:': {
'withholding tax:': {'Amount':'', 'Year-To_Date':""},
'ee social security tax:': {'Amount':'', 'Year-To_Date':""},
'ee medicare tax:': {'Amount':'', 'Year-To_Date':""}},
'california:': {
'withholding tax:': {'Amount':'', 'Year-To_Date':""},
'ee disability tax:': {'Amount':'', 'Year-To-Date':""}}},
}
'''
def demo(image_path, prompt):
    """Run one image + prompt through the vision-language model.

    Args:
        image_path: Path of the image file, passed to the model as the
            ``image`` entry of the chat message.
        prompt: Instruction text sent alongside the image.

    Returns:
        The Python object produced by ``literal_eval`` on the last line of the
        decoded generation, or the full decoded text (``str``) when that line
        is not a valid Python literal.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image_path,  # Use the file path here
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]
    logging.info("Step 1: Preparing inference")

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    logging.info("2")
    image_inputs, video_inputs = process_vision_info(messages)
    logging.info("3")
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    logging.info("4")
    # The model is loaded with device_map="auto", so follow wherever it was
    # actually placed instead of assuming a CUDA device exists.
    inputs = inputs.to(model.device)
    logging.info("5")

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=1500)
    logging.info("6")
    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    logging.info("7")
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # The message needs a placeholder: logging applies `msg % args`, so a
    # bare "8" with an extra argument produces a logging format error.
    logging.info("8: %s", output_text)

    # Try to turn the last line of the answer into a Python object.
    raw_answer = output_text[0]
    last_line = raw_answer.split("\n")[-1]
    try:
        return literal_eval(last_line)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid literal: return the raw model output instead.
        return raw_answer
def process_document(image):
    """Extract benefit and tax information from an uploaded payslip image.

    Args:
        image: The uploaded image as a NumPy array (converted with
            ``Image.fromarray``).

    Returns:
        A dict with two entries: ``"tax_deductions"`` holds the answer to the
        ``tax_deductions`` prompt and ``"other_benifits"`` holds the answer to
        the ``other_benifits`` prompt. Each value is whatever ``demo`` returns.
    """
    # Reserve a temporary .jpg path, then close the handle before writing to
    # the path so the file is not opened twice at the same time.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp_file:
        image_path = tmp_file.name

    try:
        # Convert the NumPy array to a PIL image and save it to the temp path.
        Image.fromarray(image).save(image_path)
        logging.info("the path made for image: %s", image_path)

        # Process the image with the model, once per prompt.
        benefits_result = demo(image_path, other_benifits)
        logging.info("kjf")
        tax_result = demo(image_path, tax_deductions)
    finally:
        # Always remove the temporary file, even when inference fails.
        os.remove(image_path)

    # Each answer is stored under the key matching the prompt that produced
    # it (the two were previously swapped).
    return {
        "tax_deductions": tax_result,
        "other_benifits": benefits_result,
    }
# Create Gradio interface.
# It is bound to `interface`, not `demo`: rebinding the module-level name
# `demo` here would replace the inference function `demo(image_path, prompt)`
# that `process_document` looks up at call time, so every request would call
# the Interface object instead of the model.
interface = gr.Interface(
    fn=process_document,
    inputs="image",  # Gradio will handle the image input
    outputs="json",
    title="PaySlip_Demo_Model",
    examples=[["Slip_1.jpg"], ["Slip_2.jpg"]],
    cache_examples=False,
)
interface.launch()