iamrobotbear committed
Commit 0320ac7 · 1 Parent(s): ca6ee07

trying to mirror what is here

https://huggingface.co/spaces/nielsr/comparing-captioning-models/blob/main/app.py

Files changed (1)
  1. app.py +6 -4
app.py CHANGED
@@ -3,14 +3,15 @@ import torch
 from PIL import Image
 from lavis.models import load_model_and_preprocess
 from lavis.processors import load_processor
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoFeatureExtractor
 
 # Load model and preprocessors for Image-Text Matching (LAVIS)
 device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
 model_itm, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain", device=device, is_eval=True)
 
-# Load tokenizer and model for Image Captioning (TextCaps)
+# Load tokenizer, feature extractor, and model for Image Captioning (TextCaps)
 tokenizer_caption = AutoTokenizer.from_pretrained("microsoft/git-large-r-textcaps")
+feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/git-large-r-textcaps")
 model_caption = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps").to(device)
 
 # List of statements for Image-Text Matching
@@ -26,7 +27,9 @@ statements = [
 
 # Function to generate image captions using TextCaps
 def generate_image_captions(image):
-    inputs = tokenizer_caption(image, return_tensors="pt", padding=True, truncation=True).to(device)
+    # Preprocess image using feature extractor
+    inputs = feature_extractor(images=image, return_tensors="pt", padding=True, truncation=True).to(device)
+    # Generate captions
    outputs = model_caption.generate(**inputs)
    caption = tokenizer_caption.decode(outputs[0], skip_special_tokens=True)
    return caption
@@ -64,4 +67,3 @@ output = gr.outputs.Textbox(label="Results")
 
 iface = gr.Interface(fn=process_images_and_statements, inputs=image_input, outputs=output, title="Image Captioning and Image-Text Matching")
 iface.launch()
-
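
For context, GIT checkpoints in transformers are usually driven through AutoProcessor, which bundles the image processor and the tokenizer in one object, and the model generates a caption from pixel_values alone. The sketch below shows that common pattern for microsoft/git-large-r-textcaps; it is an illustration of the captioning path this commit is moving toward, not the exact code of either space, and the example image path is hypothetical.

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"

# AutoProcessor wraps both the image processor and the tokenizer for GIT checkpoints
processor = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps").to(device)

def generate_image_captions(image: Image.Image) -> str:
    # GIT only needs pixel_values to caption; no text prompt or padding/truncation options
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Example usage (hypothetical local file):
# caption = generate_image_captions(Image.open("example.jpg"))

Note that padding and truncation in the diff above are tokenizer options; when the input is an image, the feature extractor likely does not need them, so the processor-based path avoids that mismatch.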