Irpan committed
Commit · 290c136 · 1 Parent(s): b063473
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import ViltProcessor, ViltForQuestionAnswering
+from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline
 import torch
 
 import httpcore
@@ -11,9 +11,6 @@ import re
 
 torch.hub.download_url_to_file('https://media.istockphoto.com/id/1174602891/photo/two-monkeys-mom-and-cub-eat-bananas.jpg?s=612x612&w=0&k=20&c=r7VXi9d1wHhyq3iAk9D2Z3yTZiOJMlLNtjdVRBEjG7g=', 'monkeys.jpg')
 
-processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
-model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
-
 # List of acceptable languages
 acceptable_languages = set(L.split()[0] for L in LANGCODES)
 acceptable_languages.add("mandarin")
@@ -59,14 +56,16 @@ def remove_language_phrase(sentence):
     return cleaned_sentence
 
 def vqa(image, text):
-    encoding = processor(image, text, return_tensors="pt")
+    vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+    vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+    encoding = vqa_processor(image, text, return_tensors="pt")
     # forward pass
     with torch.no_grad():
-        outputs = model(**encoding)
+        outputs = vqa_model(**encoding)
 
     logits = outputs.logits
     idx = logits.argmax(-1).item()
-    predicted_answer = model.config.id2label[idx]
+    predicted_answer = vqa_model.config.id2label[idx]
 
     return predicted_answer
 
@@ -78,9 +77,9 @@ def main(image, text):
     return vqa_answer
 
 
-image = gr.
-question = gr.
-answer = gr.
+image = gr.Image(type="pil")
+question = gr.Textbox(label="Question")
+answer = gr.Textbox(label="Predicted answer")
 examples = [["monkeys.jpg", "How many monkeys are there, in French?"]]
 
 title = "Cross-lingual VQA"
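
For reference, the vqa() logic introduced by this commit can be exercised outside the app. The sketch below assumes Pillow is installed and that monkeys.jpg has already been downloaded the way app.py does; the question string is illustrative, while the model name and call sequence come straight from the diff.

# Standalone sketch of the new vqa() path (a check, not part of app.py).
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

image = Image.open("monkeys.jpg")  # fetched earlier by torch.hub.download_url_to_file
encoding = vqa_processor(image, "How many monkeys are there?", return_tensors="pt")
with torch.no_grad():  # inference only, no gradients needed
    outputs = vqa_model(**encoding)
idx = outputs.logits.argmax(-1).item()  # index of the highest-scoring answer class
print(vqa_model.config.id2label[idx])   # map the class index back to an answer string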
|