Rehan3024 committed
Commit 4772a9b · verified · 1 Parent(s): c876057

Create app.py

Files changed (1)
  1. app.py +92 -0
app.py ADDED
@@ -0,0 +1,92 @@
+ from PIL import Image
+ from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering, MarianMTModel, MarianTokenizer
+ import gradio as gr
+ import torch
+ import warnings
+
+ warnings.filterwarnings("ignore")
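+ # (The filter above suppresses all Python warnings, e.g. transformers' model-loading notices.)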
+
+ # Load BLIP models
+ captioning_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ captioning_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+ vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+ vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+
+ # Checkpoint names for the translation model of each supported target language
+ translation_models = {
+     "Spanish": "Helsinki-NLP/opus-mt-en-es",
+     "German": "Helsinki-NLP/opus-mt-en-de",
+     "Chinese": "Helsinki-NLP/opus-mt-en-zh",
+     "Japanese": "Helsinki-NLP/opus-mt-en-ja",
+     "Russian": "Helsinki-NLP/opus-mt-en-ru",
+     "Arabic": "Helsinki-NLP/opus-mt-en-ar",
+     "Hindi": "Helsinki-NLP/opus-mt-en-hi",
+     "Urdu": "Helsinki-NLP/opus-mt-en-ur"
+ }
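+ # (All of these checkpoints translate from English, matching the English captions BLIP produces.)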
+
+ # Load translation models and tokenizers
+ loaded_translation_models = {}
+ loaded_translation_tokenizers = {}
+
+ for lang, model_name in translation_models.items():
+     try:
+         loaded_translation_models[lang] = MarianMTModel.from_pretrained(model_name)
+         loaded_translation_tokenizers[lang] = MarianTokenizer.from_pretrained(model_name)
+     except Exception as e:
+         print(f"Error loading model for {lang}: {e}")
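+ # (A language whose checkpoint fails to load is left out of both dicts;
+ # translate_text then reports it as unavailable rather than crashing.)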
+
+ # Captioning function
+ def caption(image):
+     image = image.convert("RGB")
+     inputs = captioning_processor(image, return_tensors="pt")
+     out = captioning_model.generate(**inputs)
+     return captioning_processor.decode(out[0], skip_special_tokens=True)
+
+ # Visual Question Answering function
+ def qna(image, question):
+     image = image.convert("RGB")
+     inputs = vqa_processor(image, question, return_tensors="pt")
+     out = vqa_model.generate(**inputs)
+     return vqa_processor.decode(out[0], skip_special_tokens=True)
+
+ # Translation function
+ def translate_text(text, target_lang="Spanish"):
+     model = loaded_translation_models.get(target_lang)
+     tokenizer = loaded_translation_tokenizers.get(target_lang)
+     if model is None or tokenizer is None:
+         return f"Translation model for {target_lang} is not available."
+     inputs = tokenizer(text, return_tensors="pt")
+     translated = model.generate(**inputs)
+     return tokenizer.decode(translated[0], skip_special_tokens=True)
+
+ # Combined Captioning and Translation function
+ def caption_and_translate(image, target_lang="Spanish"):
+     caption_text = caption(image)
+     translated_caption = translate_text(caption_text, target_lang)
+     return caption_text, translated_caption
+
+ # Create Gradio interfaces
+ interface1 = gr.Interface(fn=caption,
+                           inputs=gr.components.Image(type="pil"),
+                           outputs=gr.components.Textbox(label="Generated Caption by BLIP"),
+                           description="BLIP Image Captioning")
+
+ interface2 = gr.Interface(fn=qna,
+                           inputs=[gr.components.Image(type="pil"), gr.components.Textbox(label="Question")],
+                           outputs=gr.components.Textbox(label="Answer generated by BLIP"),
+                           description="BLIP Visual Question Answering of Images")
+
+ interface3 = gr.Interface(fn=caption_and_translate,
+                           inputs=[gr.components.Image(type="pil"), gr.components.Dropdown(label="Target Language", choices=list(translation_models.keys()))],
+                           outputs=[gr.components.Textbox(label="Generated Caption"),
+                                    gr.components.Textbox(label="Translated Caption")],
+                           description="Image Captioning and Translation")
+
+ title = "Automated Image Captioning and Visual QnA Engine"
+
+ final_interface = gr.TabbedInterface([interface1, interface2, interface3],
+                                      ["Captioning", "Visual QnA", "Captioning and Translation"],
+                                      title=title, theme=gr.themes.Soft())
+
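+ # (inbrowser=True opens a browser tab on local runs; it has no effect on a hosted Space.)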
+ final_interface.launch(inbrowser=True)