iamrobotbear committed
Commit afa4c81 · 1 Parent(s): e1bf7e0

Update app.py

Files changed (1): app.py (+8 −8)

app.py CHANGED
@@ -1,19 +1,18 @@
 import gradio as gr
 import torch
-from pathlib import Path
 from PIL import Image
 import pandas as pd
 from lavis.models import load_model_and_preprocess
 from lavis.processors import load_processor
-from transformers import CLIPTokenizerFast, CLIPModel # Import CLIPTokenizerFast
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # Import AutoTokenizer and AutoModelForSeq2SeqLM
 
 # Load model and preprocessors for Image-Text Matching (LAVIS)
 device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
 model_itm, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain", device=device, is_eval=True)
 
-# Load model and processor for Image Captioning (TextCaps)
-model_caption = CLIPModel.from_pretrained("microsoft/git-large-r-textcaps")
-processor_caption = CLIPTokenizerFast.from_pretrained("microsoft/git-large-r-textcaps", from_slow=True) # Convert tokenizer
+# Load model and tokenizer for Image Captioning (TextCaps)
+model_caption = AutoModelForSeq2SeqLM.from_pretrained("microsoft/git-large-r-textcaps")
+tokenizer_caption = AutoTokenizer.from_pretrained("microsoft/git-large-r-textcaps")
 
 # List of statements for Image-Text Matching
 statements = [
@@ -46,9 +45,9 @@ def compute_itm_scores(image):
 # Function to generate image captions using TextCaps
 def generate_image_captions(image):
     pil_image = Image.fromarray(image.astype('uint8'), 'RGB')
-    inputs = processor_caption(pil_image, return_tensors="pt", padding=True, truncation=True)
+    inputs = tokenizer_caption(pil_image, return_tensors="pt", padding=True, truncation=True)
     outputs = model_caption.generate(**inputs)
-    caption = processor_caption.decode(outputs[0])
+    caption = tokenizer_caption.decode(outputs[0])
     return caption
 
 # Main function to perform image captioning and image-text matching
@@ -63,8 +62,9 @@ def process_images_and_statements(image):
     output = "Image Captions:\n" + captions + "\n\nITM Scores:\n" + itm_scores
     return output
 
+# Gradio interface
 image_input = gr.inputs.Image()
 output = gr.outputs.Textbox(label="Results")
 
 iface = gr.Interface(fn=process_images_and_statements, inputs=image_input, outputs=output, title="Image Captioning and Image-Text Matching")
-iface.launch()
+iface.launch()
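
A note on the captioning half of this diff: the swap from CLIPModel/CLIPTokenizerFast to AutoTokenizer/AutoModelForSeq2SeqLM likely still does not run, because microsoft/git-large-r-textcaps is a GIT checkpoint that transformers loads as a causal LM, and a tokenizer cannot consume a PIL image. A minimal sketch of the usual GIT captioning pattern, reusing this file's variable names (an illustration, not part of the commit):

from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# GIT checkpoints ship a processor that handles the image preprocessing;
# AutoModelForCausalLM resolves to GitForCausalLM for this architecture.
processor_caption = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
model_caption = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps")

def generate_image_captions(image):
    # Gradio hands the handler a numpy array; convert it back to a PIL image
    pil_image = Image.fromarray(image.astype('uint8'), 'RGB')
    # The processor, not a tokenizer, turns the image into pixel values
    pixel_values = processor_caption(images=pil_image, return_tensors="pt").pixel_values
    generated_ids = model_caption.generate(pixel_values=pixel_values, max_length=50)
    return processor_caption.batch_decode(generated_ids, skip_special_tokens=True)[0]

Decoding with skip_special_tokens=True also avoids the stray [CLS]/[SEP] markers that the plain decode(outputs[0]) in the diff would leave in the caption.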
 
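The image-text-matching side is untouched by this commit, and the body of compute_itm_scores falls outside the hunks shown above. For context, a hypothetical reconstruction following the pattern in the LAVIS BLIP-2 ITM examples; only model_itm, vis_processors, text_processors, statements, and device are taken from the file itself:

import torch
import pandas as pd
from PIL import Image

# Hypothetical reconstruction: the real body is not shown in this diff.
def compute_itm_scores(image):
    pil_image = Image.fromarray(image.astype('uint8'), 'RGB')
    img = vis_processors["eval"](pil_image).unsqueeze(0).to(device)
    rows = []
    for statement in statements:
        txt = text_processors["eval"](statement)
        # match_head="itm" returns logits; softmax column 1 is the match probability
        itm_output = model_itm({"image": img, "text_input": txt}, match_head="itm")
        itm_score = torch.nn.functional.softmax(itm_output, dim=1)[:, 1].item()
        rows.append({"Statement": statement, "Score": f"{itm_score:.3%}"})
    # process_images_and_statements concatenates the result into a string, so render one
    return pd.DataFrame(rows).to_string(index=False)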
 
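Finally, gr.inputs.Image() and gr.outputs.Textbox(...) use Gradio's legacy input/output namespaces, which were deprecated in Gradio 3 and removed in later releases. If the Space pins a current Gradio, the interface would be built from the top-level components instead; a sketch, assuming Gradio 3.x or newer:

import gradio as gr

iface = gr.Interface(
    fn=process_images_and_statements,
    inputs=gr.Image(),                     # defaults to type="numpy", matching the handlers above
    outputs=gr.Textbox(label="Results"),
    title="Image Captioning and Image-Text Matching",
)
iface.launch()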