"""Gradio demo that extracts payslip fields from an image with Qwen2-VL.

The uploaded image is sent to the model twice, once per prompt template
(tax deductions, other benefits), and each answer is parsed into a Python
object when possible.
"""

import json
import os
import re
import tempfile
from ast import literal_eval

import gradio as gr
import numpy as np
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
MAX_NEW_TOKENS = 1500

# Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)

# Load the processor
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Prompt templates sent verbatim to the model.
# NOTE(review): the template text has an unbalanced quote in "'401k eru:" and
# mixes "Year-To_Date" / "Year-To-Date" / "Year-to-Date". It is left untouched
# because it is the literal prompt the model sees; confirm before normalising.
other_benifits = (
    """Extract the following information in the given format: """
    """{'other_benefits_and_information': { """
    """'401k eru: {'This Period':'', 'Year-to-Date':''}}, """
    """'quota summary': { 'sick:': '', 'vacation:': '', } """
    """'payment method': 'eg. Direct payment', """
    """'Amount': 'eg. 12.99' """
    """} """
)

tax_deductions = (
    """Extract the following information in the given format: """
    """{ 'tax_deductions': { 'federal:': { """
    """'withholding tax:': {'Amount':'', 'Year-To_Date':""}, """
    """'ee social security tax:': {'Amount':'', 'Year-To_Date':""}, """
    """'ee medicare tax:': {'Amount':'', 'Year-To_Date':""}}, """
    """'california:': { """
    """'withholding tax:': {'Amount':'', 'Year-To_Date':""}, """
    """'ee disability tax:': {'Amount':'', 'Year-To-Date':""}}}, """
    """} """
)

# Matches the first Markdown code fence (optionally tagged, e.g. ```json).
# The closing fence is optional so a truncated generation can still be parsed.
_FENCE_RE = re.compile(r"```[A-Za-z]*[ \t]*\r?\n(.*?)(?:```|\Z)", re.DOTALL)

# Errors that literal_eval / json.loads raise on malformed or oversized input.
_PARSE_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)


def _parse_model_output(raw_text):
    """Turn the model's answer into a Python object, or return it unchanged.

    The content of the first code fence is used when one is present,
    otherwise the whole text. It is parsed as a Python literal first and as
    JSON second (JSON covers ``null`` / ``true`` / ``false``).

    Args:
        raw_text: Decoded model output.

    Returns:
        The parsed object, or ``raw_text`` itself when neither parser accepts it.
    """
    fenced = _FENCE_RE.search(raw_text)
    payload = (fenced.group(1) if fenced else raw_text).strip()
    for parse in (literal_eval, json.loads):
        try:
            return parse(payload)
        except _PARSE_ERRORS:
            continue
    # Best effort: hand the raw text back so the UI still shows something.
    return raw_text


def _generate_answer(image_path, prompt):
    """Run one image + text prompt through the model and return the decoded reply."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow wherever device_map="auto" placed the model instead of assuming CUDA.
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    # Drop the echoed prompt tokens so only the newly generated part is decoded.
    new_token_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        new_token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]


def process_document(image):
    """Extract tax-deduction and benefit information from a payslip image.

    Args:
        image: Image array as delivered by the Gradio ``"image"`` input.

    Returns:
        A ``(tax_info, benefits_info)`` tuple. Each element is the parsed
        model answer, or the raw answer text when it could not be parsed.

    Raises:
        ValueError: If no image was supplied.
    """
    if image is None:
        raise ValueError("No image was provided.")

    # Only reserve a path here; the handle is closed before Pillow writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp_file:
        image_path = tmp_file.name

    try:
        Image.fromarray(image).save(image_path)
        # rstrip() drops the templates' trailing whitespace before sending.
        tax_info = _parse_model_output(
            _generate_answer(image_path, tax_deductions.rstrip())
        )
        benefits_info = _parse_model_output(
            _generate_answer(image_path, other_benifits.rstrip())
        )
    finally:
        # Remove the temporary image even when saving or inference fails.
        os.remove(image_path)

    return tax_info, benefits_info


# gr.outputs is absent from Gradio 4+; use the top-level component when it
# exists and fall back to the legacy namespace otherwise.
_json_output = getattr(gr, "JSON", None) or gr.outputs.JSON

# Create Gradio interface
demo = gr.Interface(
    fn=process_document,
    inputs="image",  # Gradio will handle the image input
    outputs=[
        _json_output(label="Tax Deductions Information"),  # First output box with heading
        _json_output(label="Other Benefits and Information"),  # Second output box with heading
    ],
    title="PaySlip_Demo_Model",
    examples=[["Slip_1.jpg"], ["Slip_2.jpg"]],
    cache_examples=False,
)

demo.launch()