|
--- |
|
library_name: transformers |
|
--- |
|
|
|
--- |
|
|
|
## How to Use the *ferret-gemma* Model |
|
|
|
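Before running the examples below, download `builder.py` and `conversation.py` from this repository and save them in your working directory so that they can be imported. |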
|
|
|
### Basic Text Generation |
|
```python |
|
import torch |
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
|
# Pick the device once so the model and its inputs are always placed together.
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the model and tokenizer
model_name = "jadechoghari/ferret-gemma"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to(device)

# give input text
input_text = "The United States of America is a country situated on earth"

# tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)

# Forward the attention mask produced by the tokenizer together with the ids;
# padding was requested above, so generate() should be told which positions
# are real tokens instead of having to guess.
output = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,
    num_return_sequences=1,
)

# decode and print the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
|
``` |
|
|
|
### Image and Text Generation |
|
```python |
|
import torch |
|
from PIL import Image |
|
from conversation import conv_templates |
|
from builder import load_pretrained_model # custom model loader |
|
|
|
# load model and tokenizer, then preprocess an image |
|
def infer_single_prompt(image_path, prompt, model_path):
    """Generate a text response for one image and one prompt.

    Args:
        image_path: Path of the image file to open.
        prompt: Text inserted into the first-role message of the conversation.
        model_path: Checkpoint location handed to ``load_pretrained_model``.

    Returns:
        The decoded generation, with special tokens skipped and surrounding
        whitespace stripped.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "ferret_gemma")

    # Request size (336, 336), take the first entry of 'pixel_values',
    # add a leading dimension again and cast to half precision.
    processed = image_processor.preprocess(rgb_image, return_tensors='pt', size=(336, 336))
    first_image = processed['pixel_values'][0]
    image_tensor = first_image.unsqueeze(0).half()

    # prepare prompt
    # NOTE(review): LLaVA-style templates usually also append an empty
    # second-role message before get_prompt() - confirm against conversation.py.
    conversation = conv_templates["ferret_gemma_instruct"].copy()
    conversation.append_message(conversation.roles[0], f"Image and prompt: {prompt}")
    encoded = tokenizer(conversation.get_prompt(), return_tensors='pt')
    input_ids = encoded['input_ids'].cuda()

    image_tensor = image_tensor.cuda()

    # generate text output
    with torch.inference_mode():
        generated = model.generate(input_ids, images=image_tensor, max_new_tokens=1024)

    # decode the output
    return tokenizer.decode(generated[0], skip_special_tokens=True).strip()
|
|
|
# Usage: describe a local image with the published checkpoint.
result = infer_single_prompt(
    "image.jpg",
    "Describe the contents of the image.",
    "jadechoghari/ferret-gemma",
)
print(result)
|
``` |
|
|
|
### Text, Image, and Bounding Box |
|
```python |
|
import torch |
|
from PIL import Image |
|
from functools import partial |
|
from builder import load_pretrained_model |
|
|
|
# generates a bounding box mask |
|
def generate_mask_for_feature(coor, img_w, img_h):
    """Build a mask that is 1 inside a box and 0 everywhere else.

    Args:
        coor: Indexable box corners; items 0 and 2 bound the first axis and
            items 1 and 3 bound the second axis, both ends inclusive.
        img_w: Size of the mask's first axis.
        img_h: Size of the mask's second axis.

    Returns:
        A tensor of shape ``(img_w, img_h)`` created by ``torch.zeros`` with
        the boxed area set to 1.
    """
    box_mask = torch.zeros(img_w, img_h)
    # The +1 makes the far corner part of the box.
    first_axis = slice(coor[0], coor[2] + 1)
    second_axis = slice(coor[1], coor[3] + 1)
    box_mask[first_axis, second_axis] = 1
    return box_mask
|
|
|
def infer_with_bounding_box(image_path, prompt, model_path, region):
    """Generate a text response for an image, a prompt and one box region.

    Args:
        image_path: Path of the image file to open.
        prompt: Text appended to the "Image and prompt: " prefix.
        model_path: Checkpoint location handed to ``load_pretrained_model``.
        region: Four box corners forwarded to ``generate_mask_for_feature``
            together with the opened image's ``size`` (width, height), so
            items 0/2 index the width axis and items 1/3 the height axis.

    Returns:
        The decoded generation, with special tokens skipped and surrounding
        whitespace stripped.
    """
    img = Image.open(image_path).convert('RGB')
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "ferret_gemma")

    processed = image_processor.preprocess(img, return_tensors='pt', size=(336, 336))
    image_tensor = processed['pixel_values'][0].unsqueeze(0).half().cuda()

    input_ids = tokenizer(f"Image and prompt: {prompt}", return_tensors='pt')['input_ids'].cuda()

    # create region mask
    # NOTE(review): the mask is built at the original image size while the
    # image tensor is requested at (336, 336) - confirm the model rescales it.
    mask = generate_mask_for_feature(region, *img.size).unsqueeze(0).half().cuda()

    # generate output with region mask
    with torch.inference_mode():
        # generate() is not given the mask directly here, so it is bound onto
        # forward() for the duration of this call.
        model.orig_forward = model.forward
        model.forward = partial(model.orig_forward, region_masks=[[mask]])
        try:
            output_ids = model.generate(input_ids, images=image_tensor, max_new_tokens=1024)
        finally:
            # Always undo the patch so the mask does not stay attached to the
            # model after this call, even when generation raises.
            model.forward = model.orig_forward

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text.strip()
|
|
|
# Usage: describe the area inside the box (50, 50, 200, 200) of a local image.
result = infer_with_bounding_box(
    "image.jpg",
    "Describe the contents of the box.",
    "jadechoghari/ferret-gemma",
    (50, 50, 200, 200),
)
print(result)
|
``` |