Makhinur committed on
Commit 1679fe8 · verified · 1 Parent(s): 54fce6f

Update app.py

Files changed (1):
  app.py +16 -20
app.py CHANGED
@@ -1,36 +1,32 @@
 from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, PreTrainedTokenizerFast
+import gradio as gr
 
+# Load the model and preprocessing tools
 model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 tokenizer = PreTrainedTokenizerFast.from_pretrained("distilgpt2")
 
 def vit2distilgpt2(img):
+    # Preprocess the image
     pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values
-    encoder_outputs = model.generate(pixel_values.to('cpu'), num_beams=5, num_return_sequences=3)
-    generated_sentences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)
-
-    return generated_sentences
-
-import gradio as gr
+
+    # Generate a single caption
+    encoder_outputs = model.generate(pixel_values.to('cpu'), num_beams=5, num_return_sequences=1)
+    generated_sentence = tokenizer.decode(encoder_outputs[0], skip_special_tokens=True)
 
-inputs = [
-    gr.inputs.Image(type="pil", label="Original Images")
-]
+    return generated_sentence
 
-outputs = [
-    gr.outputs.Textbox(label="Caption 1"),
-
-]
+# Gradio interface setup
+inputs = gr.inputs.Image(type="pil", label="Original Image")
+outputs = gr.outputs.Textbox(label="Caption")
 
 title = "Image Captioning using ViT + GPT2"
-description = "ViT and GPT2 are used to generate Image Caption for the uploaded image. COCO DataSet is used for Training"
-
+description = "ViT and GPT2 are used to generate an image caption for the uploaded image. COCO dataset is used for training."
 
 gr.Interface(
-    vit2distilgpt2,
-    inputs,
-    outputs,
+    fn=vit2distilgpt2,
+    inputs=inputs,
+    outputs=outputs,
     title=title,
     description=description,
-
-).launch(debug=True, enable_queue=True)
+).launch(debug=True, enable_queue=True)
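
With this change, vit2distilgpt2 returns a single caption string instead of a list of three beam-search candidates. A minimal sketch of a local smoke test, assuming the definitions in app.py are already in scope (importing app.py directly would also trigger the launch call) and where "sample.jpg" is a hypothetical placeholder path:

from PIL import Image

# Hypothetical local test; "sample.jpg" is a placeholder path, not part of the repo.
img = Image.open("sample.jpg").convert("RGB")
caption = vit2distilgpt2(img)  # now a single decoded string
print(caption)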