iamrobotbear committed · Commit ca6ee07 · 1 Parent(s): ecdbe90

returning without vilt

Files changed (1)
  1. app.py +7 -9
app.py CHANGED
@@ -3,14 +3,13 @@ import torch
 from PIL import Image
 from lavis.models import load_model_and_preprocess
 from lavis.processors import load_processor
-from transformers import ViTFeatureExtractor, AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Load model and preprocessors for Image-Text Matching (LAVIS)
 device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
 model_itm, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain", device=device, is_eval=True)
 
-# Load feature extractor, tokenizer, and model for Image Captioning (TextCaps)
-feature_extractor = ViTFeatureExtractor.from_pretrained("microsoft/git-large-r-textcaps")
+# Load tokenizer and model for Image Captioning (TextCaps)
 tokenizer_caption = AutoTokenizer.from_pretrained("microsoft/git-large-r-textcaps")
 model_caption = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps").to(device)
 
@@ -27,9 +26,7 @@ statements = [
 
 # Function to generate image captions using TextCaps
 def generate_image_captions(image):
-    # Process the image using feature_extractor
-    inputs = feature_extractor(images=image, return_tensors="pt", padding=True, truncation=True).to(device)
-    # Generate captions
+    inputs = tokenizer_caption(image, return_tensors="pt", padding=True, truncation=True).to(device)
     outputs = model_caption.generate(**inputs)
     caption = tokenizer_caption.decode(outputs[0], skip_special_tokens=True)
     return caption
@@ -63,7 +60,8 @@ def process_images_and_statements(image):
 
 # Gradio interface
 image_input = gr.inputs.Image()
-output = gr.outputs.Textbox(label="Output")
+output = gr.outputs.Textbox(label="Results")
 
-iface = gr.Interface(fn=process_images_and_statements, inputs=image_input, outputs=output)
-iface.launch()
+iface = gr.Interface(fn=process_images_and_statements, inputs=image_input, outputs=output, title="Image Captioning and Image-Text Matching")
+iface.launch()
+
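Note on the captioning hunk: a text tokenizer cannot consume a PIL image, so the new tokenizer_caption(image, ...) call will fail at runtime. The usual transformers pattern for GIT checkpoints runs the image through AutoProcessor, which bundles the image processor and the tokenizer (this is also why the dropped ViTFeatureExtractor import is not needed). A minimal sketch, assuming the same microsoft/git-large-r-textcaps checkpoint and a PIL image input:

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"

# AutoProcessor wraps the GIT image processor and tokenizer in one object.
processor = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
model_caption = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps").to(device)

def generate_image_captions(image: Image.Image) -> str:
    # Convert the raw image into the pixel_values tensor the vision encoder expects.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
    # Autoregressively decode a caption; max_length=50 is an illustrative choice.
    generated_ids = model_caption.generate(pixel_values=pixel_values, max_length=50)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]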
 
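For context on the unchanged matching path: model_itm loaded in the first hunk is the LAVIS BLIP-2 image-text matching model, and LAVIS scores an image against a statement roughly as sketched below. compute_itm_score is a hypothetical helper for illustration (the file's process_images_and_statements presumably wraps a call like this); it assumes the vis_processors/text_processors pair loaded above, and match_head="itm" returning no-match/match logits:

import torch

def compute_itm_score(raw_image, statement):
    # Preprocess with the LAVIS processors loaded alongside the model.
    img = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    txt = text_processors["eval"](statement)
    # match_head="itm" yields a pair of logits: [no-match, match].
    itm_output = model_itm({"image": img, "text_input": txt}, match_head="itm")
    itm_scores = torch.nn.functional.softmax(itm_output, dim=1)
    # Probability that the statement matches the image.
    return itm_scores[:, 1].item()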
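Finally, the gr.inputs / gr.outputs namespaces used in the last hunk were deprecated in Gradio 3 and removed in Gradio 4, so on a current Gradio the same interface would be built from top-level components. A sketch under that assumption:

import gradio as gr

iface = gr.Interface(
    fn=process_images_and_statements,
    inputs=gr.Image(type="pil"),          # replaces gr.inputs.Image()
    outputs=gr.Textbox(label="Results"),  # replaces gr.outputs.Textbox(label="Results")
    title="Image Captioning and Image-Text Matching",
)
iface.launch()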