Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -3,13 +3,11 @@ import gradio as gr
|
|
3 |
from PIL import Image
|
4 |
import requests
|
5 |
|
6 |
-
|
7 |
from transformers import ViTFeatureExtractor
|
8 |
feature_extractor = ViTFeatureExtractor()
|
9 |
# or, to load one that corresponds to a checkpoint on the hub:
|
10 |
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
|
11 |
|
12 |
-
|
13 |
from transformers import VisionEncoderDecoderModel
|
14 |
# initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
|
15 |
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
|
@@ -28,15 +26,6 @@ tokenizer = AutoTokenizer.from_pretrained(repo_name)
|
|
28 |
model = VisionEncoderDecoderModel.from_pretrained(repo_name)
|
29 |
|
30 |
def get_quote(image):
|
31 |
-
|
32 |
-
#image = Image.open(image_1).raw
|
33 |
-
#image = Image.open(image_1)
|
34 |
-
|
35 |
-
#url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
36 |
-
#with Image.open(requests.get(url, stream=True).raw) as image:
|
37 |
-
|
38 |
-
#image.save("cats.png")
|
39 |
-
|
40 |
|
41 |
##############
|
42 |
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
|
@@ -47,18 +36,12 @@ def get_quote(image):
|
|
47 |
# decode into text
|
48 |
preds = tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)
|
49 |
preds = [pred.strip() for pred in preds]
|
50 |
-
#print(preds)
|
51 |
-
|
52 |
return preds
|
53 |
|
54 |
-
|
55 |
#1: Text to Speech
|
56 |
-
|
57 |
-
title = "Image to text generation"
|
58 |
|
59 |
-
demo = gr.Interface(fn=get_quote, inputs=gr.inputs.Image(type="pil"), outputs=['text'],title = title, description = "
|
60 |
-
#inputs = "image"
|
61 |
-
#inputs=gr.inputs.Image(type="pil")
|
62 |
if __name__ == "__main__":
|
63 |
|
64 |
demo.launch(debug=True, cache_examples=True)
|
|
|
3 |
from PIL import Image
|
4 |
import requests
|
5 |
|
|
|
6 |
from transformers import ViTFeatureExtractor
|
7 |
feature_extractor = ViTFeatureExtractor()
|
8 |
# or, to load one that corresponds to a checkpoint on the hub:
|
9 |
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
|
10 |
|
|
|
11 |
from transformers import VisionEncoderDecoderModel
|
12 |
# initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
|
13 |
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
|
|
|
26 |
model = VisionEncoderDecoderModel.from_pretrained(repo_name)
|
27 |
|
28 |
def get_quote(image):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
##############
|
31 |
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
|
|
|
36 |
# decode into text
|
37 |
preds = tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)
|
38 |
preds = [pred.strip() for pred in preds]
|
|
|
|
|
39 |
return preds
|
40 |
|
|
|
41 |
#1: Text to Speech
|
42 |
+
title = "Get a sentence with items, present in the image"
|
|
|
43 |
|
44 |
+
# Build the interface only; it is started by the `if __name__ == "__main__"`
# guard further down. Chaining `.launch()` onto the constructor would rebind
# `demo` to launch()'s return value instead of the Interface, so the guarded
# `demo.launch(...)` call could not work, and the app would start as a side
# effect of merely importing this module.
demo = gr.Interface(
    fn=get_quote,
    inputs=gr.inputs.Image(type="pil"),
    outputs=['text'],
    title=title,
    description="Upload an image file and get text from it",
    cache_examples=False,
    enable_queue=True,
)
|
|
|
|
|
45 |
if __name__ == "__main__":
|
46 |
|
47 |
demo.launch(debug=True, cache_examples=True)
|