Irpan committed
Commit 190650c · 1 Parent(s): 205a1e3
Files changed (1)
  1. app.py +30 -8
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline
+from transformers import ViltProcessor, ViltForQuestionAnswering, AutoModelForSeq2SeqLM, AutoTokenizer
 import torch
 
 import httpcore
@@ -9,8 +9,6 @@ from googletrans import Translator
 from googletrans import LANGCODES
 import re
 
-torch.hub.download_url_to_file('https://media.istockphoto.com/id/1174602891/photo/two-monkeys-mom-and-cub-eat-bananas.jpg?s=612x612&w=0&k=20&c=r7VXi9d1wHhyq3iAk9D2Z3yTZiOJMlLNtjdVRBEjG7g=', 'monkeys.jpg')
-
 # List of acceptable languages
 acceptable_languages = set(L.split()[0] for L in LANGCODES)
 acceptable_languages.add("mandarin")
@@ -56,10 +54,7 @@ def remove_language_phrase(sentence):
     return cleaned_sentence
 
 def vqa(image, text):
-    vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
-    vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
     encoding = vqa_processor(image, text, return_tensors="pt")
-    # forward pass
     with torch.no_grad():
         outputs = vqa_model(**encoding)
 
@@ -69,12 +64,39 @@ def vqa(image, text):
 
     return predicted_answer
 
+
+def llm(cleaned_sentence, vqa_answer):
+    # Prepare the input prompt
+    prompt = (
+        f"A question: {cleaned_sentence}\n"
+        f"An answer: {vqa_answer}.\n"
+        f"Based on these, answer the question with a complete sentence without extra information."
+    )
+    inputs = flan_tokenizer(prompt, return_tensors="pt")
+    outputs = flan_model.generate(**inputs, max_length=50)
+    response = flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    print("T5 prompt: " + prompt)
+    print("T5 response: " + response)
+    return response
+
+
+torch.hub.download_url_to_file('https://media.istockphoto.com/id/1174602891/photo/two-monkeys-mom-and-cub-eat-bananas.jpg?s=612x612&w=0&k=20&c=r7VXi9d1wHhyq3iAk9D2Z3yTZiOJMlLNtjdVRBEjG7g=', 'monkeys.jpg')
+
+vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+
+flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
+flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
+
+
+
+
 def main(image, text):
     en_question, question_src_lang = google_translate(text, dest='en')
     dest_lang = find_dest_language(en_question, question_src_lang)
     cleaned_sentence = remove_language_phrase(en_question)
     vqa_answer = vqa(image, cleaned_sentence)
-    final_answer, _ = google_translate(vqa_answer, dest=dest_lang)
+    llm_answer = llm(cleaned_sentence, vqa_answer)
+    final_answer, _ = google_translate(llm_answer, dest=dest_lang)
     return final_answer
 
 
@@ -84,7 +106,7 @@ answer = gr.Textbox(label="Predicted answer")
 examples = [["monkeys.jpg", "How many monkeys are there, in French?"]]
 
 title = "Cross-lingual VQA"
-description = "ViLT (Vision and Language Transformer), fine-tuned on VQAv2 "
+description = "Visual Question Answering (VQA) across languages"
 
 interface = gr.Interface(fn=main,
                          inputs=[image, question],
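
For reference, below is a minimal, self-contained sketch of the two-stage pipeline this commit introduces: ViLT predicts a short VQA answer, and FLAN-T5 then rewrites it as a complete sentence. This is not the committed code. The logits-to-id2label decoding follows the standard usage of dandelin/vilt-b32-finetuned-vqa (the body of vqa() between the hunks is not visible in this diff), the helper names are invented here, and the example inputs and outputs are illustrative.

import torch
from PIL import Image
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    ViltForQuestionAnswering,
    ViltProcessor,
)

# Load the same checkpoints the commit now loads once at module level.
vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

def short_vqa_answer(image, question):
    # ViLT classifies over a fixed answer vocabulary; the label of the top
    # logit is the short answer (standard usage for this checkpoint).
    encoding = vqa_processor(image, question, return_tensors="pt")
    with torch.no_grad():
        logits = vqa_model(**encoding).logits
    return vqa_model.config.id2label[logits.argmax(-1).item()]

def sentence_answer(question, short_answer):
    # Same prompt shape as the new llm() helper added in this commit.
    prompt = (
        f"A question: {question}\n"
        f"An answer: {short_answer}.\n"
        "Based on these, answer the question with a complete sentence without extra information."
    )
    inputs = flan_tokenizer(prompt, return_tensors="pt")
    outputs = flan_model.generate(**inputs, max_length=50)
    return flan_tokenizer.decode(outputs[0], skip_special_tokens=True)

image = Image.open("monkeys.jpg")  # downloaded at startup by app.py
short = short_vqa_answer(image, "How many monkeys are there?")   # e.g. "2"
print(sentence_answer("How many monkeys are there?", short))     # e.g. "There are 2 monkeys."

In the committed main(), the resulting sentence is then translated back to the requested language via google_translate(llm_answer, dest=dest_lang), a helper defined elsewhere in app.py and not shown in this diff.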