---
library_name: transformers
---

## How to Use the *ferret-gemma* Model

Please download `builder.py` and `conversation.py` and save them next to your script; the multimodal examples below import them.
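
A minimal way to fetch them, assuming both files are hosted in this model repository (an assumption; adjust the source if they live elsewhere):

```python
from huggingface_hub import hf_hub_download

# assumption: builder.py and conversation.py are hosted in the jadechoghari/ferret-gemma repo;
# download them into the current directory so the imports below resolve
for fname in ("builder.py", "conversation.py"):
    hf_hub_download(repo_id="jadechoghari/ferret-gemma", filename=fname, local_dir=".")
```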

### Basic Text Generation
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# load the model and tokenizer
model_name = "jadechoghari/ferret-gemma"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# move the model to GPU when available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# tokenize the input text
input_text = "The United States of America is a country situated on earth"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# generate; passing the full encoding keeps the attention mask
output = model.generate(**inputs, max_new_tokens=50, num_return_sequences=1)

# decode and print the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
```
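
If the tokenizer ships a chat template (typical for Gemma-based checkpoints, though that is an assumption about this repo), the same model can also be prompted as a chat turn, reusing the `tokenizer`, `model`, and `device` from the block above:

```python
# a minimal sketch, assuming the tokenizer provides a Gemma-style chat template
messages = [{"role": "user", "content": "Describe the United States in one sentence."}]
chat_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(device)
output = model.generate(chat_ids, max_new_tokens=50)
# decode only the newly generated tokens
print(tokenizer.decode(output[0][chat_ids.shape[-1]:], skip_special_tokens=True))
```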

### Image and Text Generation
```python
import torch
from PIL import Image
from conversation import conv_templates
from builder import load_pretrained_model  # custom model loader

# load model and tokenizer, preprocess an image, and answer a single prompt
def infer_single_prompt(image_path, prompt, model_path):
    img = Image.open(image_path).convert('RGB')
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "ferret_gemma")
    image_tensor = image_processor.preprocess(img, return_tensors='pt', size=(336, 336))['pixel_values'][0].unsqueeze(0).half()

    # prepare the prompt with the conversation template
    conv = conv_templates["ferret_gemma_instruct"].copy()
    conv.append_message(conv.roles[0], f"Image and prompt: {prompt}")
    conv.append_message(conv.roles[1], None)  # leave the assistant turn open for generation
    input_ids = tokenizer(conv.get_prompt(), return_tensors='pt')['input_ids'].cuda()

    image_tensor = image_tensor.cuda()

    # generate text output
    with torch.inference_mode():
        output_ids = model.generate(input_ids, images=image_tensor, max_new_tokens=1024)

    # decode the output
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text.strip()

# Usage
result = infer_single_prompt("image.jpg", "Describe the contents of the image.", "jadechoghari/ferret-gemma")
print(result)
```

### Text, Image, and Bounding Box
```python
import torch
from PIL import Image
from functools import partial
from builder import load_pretrained_model

# build a binary mask covering a bounding box given as (x1, y1, x2, y2)
def generate_mask_for_feature(coor, img_w, img_h):
    coor_mask = torch.zeros((img_w, img_h))
    coor_mask[coor[0]:coor[2] + 1, coor[1]:coor[3] + 1] = 1
    return coor_mask

def infer_with_bounding_box(image_path, prompt, model_path, region):
    img = Image.open(image_path).convert('RGB')
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "ferret_gemma")
    image_tensor = image_processor.preprocess(img, return_tensors='pt', size=(336, 336))['pixel_values'][0].unsqueeze(0).half().cuda()

    input_ids = tokenizer(f"Image and prompt: {prompt}", return_tensors='pt')['input_ids'].cuda()

    # create the region mask in original-image pixel coordinates
    mask = generate_mask_for_feature(region, *img.size).unsqueeze(0).half().cuda()

    # patch forward so the region mask is passed through during generation
    with torch.inference_mode():
        model.orig_forward = model.forward
        model.forward = partial(model.orig_forward, region_masks=[[mask]])
        output_ids = model.generate(input_ids, images=image_tensor, max_new_tokens=1024)
        model.forward = model.orig_forward  # restore the original forward

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text.strip()

# Usage
result = infer_with_bounding_box("image.jpg", "Describe the contents of the box.", "jadechoghari/ferret-gemma", (50, 50, 200, 200))
print(result)
```
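
The `region` argument is a pixel-space `(x1, y1, x2, y2)` box on the original image (note that `img.size` is `(width, height)`). If your boxes are normalized to `[0, 1]`, a small hypothetical helper like the one below can convert them first:

```python
# hypothetical helper (not part of this repo): convert a normalized (0-1) box
# to the pixel coordinates expected by infer_with_bounding_box
def to_pixel_box(norm_box, img_w, img_h):
    x1, y1, x2, y2 = norm_box
    return (int(x1 * img_w), int(y1 * img_h), int(x2 * img_w), int(y2 * img_h))

# e.g. the central region of a 640x480 image
region = to_pixel_box((0.25, 0.25, 0.75, 0.75), 640, 480)
```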