ipvikas committed
Commit 44766f9 · 1 Parent(s): f5bd615

Update app.py

Files changed (1)
  1. app.py +2 -19
app.py CHANGED
@@ -3,13 +3,11 @@ import gradio as gr
 from PIL import Image
 import requests
 
-
 from transformers import ViTFeatureExtractor
 feature_extractor = ViTFeatureExtractor()
 # or, to load one that corresponds to a checkpoint on the hub:
 feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
 
-
 from transformers import VisionEncoderDecoderModel
 # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
 model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
@@ -28,15 +26,6 @@ tokenizer = AutoTokenizer.from_pretrained(repo_name)
 model = VisionEncoderDecoderModel.from_pretrained(repo_name)
 
 def get_quote(image):
-
- #image = Image.open(image_1).raw
- #image = Image.open(image_1)
-
- #url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- #with Image.open(requests.get(url, stream=True).raw) as image:
-
- #image.save("cats.png")
-
 
 ##############
 pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
@@ -47,18 +36,12 @@ def get_quote(image):
 # decode into text
 preds = tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)
 preds = [pred.strip() for pred in preds]
- #print(preds)
-
 return preds
 
-
 #1: Text to Speech
- #import gradio as gr
- title = "Image to text generation"
+ title = "Get a sentence with items, present in the image"
 
- demo = gr.Interface(fn=get_quote, inputs=gr.inputs.Image(type="pil"), outputs=['text'],title = title, description = "Import an image file and get text from it" ,cache_examples=False, enable_queue=True).launch()
- #inputs = "image"
- #inputs=gr.inputs.Image(type="pil")
+ demo = gr.Interface(fn=get_quote, inputs=gr.inputs.Image(type="pil"), outputs=['text'],title = title, description = "Upload an image file and get text from it" ,cache_examples=False, enable_queue=True).launch()
 if __name__ == "__main__":
 
 demo.launch(debug=True, cache_examples=True)
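A note on the first hunk: the from_encoder_decoder_pretrained( call is cut off at the hunk boundary, so its arguments are not part of this diff. For context, the pairing shown in the transformers documentation looks like the sketch below; the checkpoint names are assumptions, not a reconstruction of this file's elided arguments.

from transformers import VisionEncoderDecoderModel

# Docs-style example with assumed checkpoints; the file's actual arguments
# are elided by the hunk boundary above. The ViT encoder and BERT decoder
# weights are pretrained, but the cross-attention layers connecting them
# are randomly initialized, so the combined model needs fine-tuning before
# it generates useful text.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224", "bert-base-uncased"
)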
 
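Two smaller observations on code this diff leaves in place. The first feature_extractor = ViTFeatureExtractor() is immediately overwritten by the from_pretrained(...) line below it, so it is dead code, and recent transformers releases deprecate ViTFeatureExtractor in favor of ViTImageProcessor. A minimal sketch of the equivalent preprocessing setup, assuming a transformers version that ships ViTImageProcessor:

from transformers import ViTImageProcessor

# Successor to the deprecated ViTFeatureExtractor: loads the resizing and
# normalization settings that match the ViT checkpoint.
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")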
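Finally, two behavioral quirks survive the commit: tokenizer.batch_decode(generated_ids[0], ...) decodes the token ids of the first sequence one id at a time rather than decoding whole sequences (batch_decode(generated_ids, ...) is the idiomatic call), and demo is assigned the return value of .launch(), so the demo.launch(...) under the __main__ guard does not relaunch the interface (cache_examples is also a gr.Interface argument, not a launch() one). A minimal sketch of the same app with those fixed, assuming Gradio 4+ (where gr.inputs.Image became gr.Image and enable_queue was removed) and a placeholder for repo_name, which is defined in lines outside the diff hunks:

import gradio as gr
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# Placeholder: the real repo_name is set outside the diff hunks.
repo_name = "your-username/your-captioning-checkpoint"

image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
tokenizer = AutoTokenizer.from_pretrained(repo_name)
model = VisionEncoderDecoderModel.from_pretrained(repo_name)

def get_quote(image):
    pixel_values = image_processor(image, return_tensors="pt").pixel_values
    # The generate call sits in lines the hunks skip; this is the standard form.
    generated_ids = model.generate(pixel_values)
    # Decode whole sequences, not the individual token ids of the first one.
    preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return preds[0].strip()

demo = gr.Interface(
    fn=get_quote,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Get a sentence with items, present in the image",
    description="Upload an image file and get text from it",
)

if __name__ == "__main__":
    demo.launch(debug=True)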