File size: 1,889 Bytes
8c1a99e
a4a68e2
8c1a99e
 
 
 
a4a68e2
 
 
8c1a99e
 
 
 
 
 
 
 
 
 
 
 
 
 
bc5da26
 
 
8c1a99e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Importing necessary libraries
import os
import subprocess
import sys

import spaces
from transformers import AutoProcessor, AutoModelForCausalLM


# Install flash-attn at startup, before the model is loaded.
# The child environment is built by extending the current one: passing a dict
# with only the flag would drop PATH, HOME, proxy and virtualenv variables.
# Running pip through sys.executable targets the interpreter running this
# script rather than whichever `pip` a shell would resolve.
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells flash-attn's
# setup to skip compiling its CUDA extension - confirm against flash-attn docs.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,  # best-effort, as before: a failed install does not abort startup
)

# Load model and processor from Hugging Face.
# trust_remote_code=True lets transformers execute the model repository's own
# Python code; the model is moved to the GPU and switched to eval mode once,
# at import time.
model_id = "microsoft/Florence-2-large-ft"
model = (
    AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to("cuda").eval()
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)


@spaces.GPU(duration=120)
def run_example(task_prompt, image, text_input=None):
    """
    Run a single inference pass of the module-level model on one image.

    Args:
        - task_prompt (str): Task prompt; also forwarded as ``task`` to the
          processor's post-processing step.
        - image (PIL.Image.Image): Image to process. Its ``width`` and ``height``
          are passed to post-processing as ``image_size``.
        - text_input (str, optional): Extra text concatenated directly after
          the task prompt (no separator is inserted). Defaults to None.

    Returns:
        Whatever ``processor.post_process_generation`` produces for the task.
        NOTE(review): the Florence-2 model card shows a dict keyed by the task
        prompt - confirm.
    """
    # The task prompt alone is the full prompt unless extra text was supplied
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Encode text and image together and move the tensors to the GPU
    model_inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")

    # Deterministic beam search (3 beams, sampling disabled), capped at 1024 new tokens
    output_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text; only the first sequence of
    # the batch is used
    decoded_batch = processor.batch_decode(output_ids, skip_special_tokens=False)
    raw_text = decoded_batch[0]

    # Hand the raw text to the processor's task-specific post-processing
    return processor.post_process_generation(
        raw_text, task=task_prompt, image_size=(image.width, image.height)
    )