from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering, MarianMTModel, MarianTokenizer
import gradio as gr
import torch
import warnings

warnings.filterwarnings("ignore")

# Load BLIP models for image captioning and visual question answering
captioning_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
captioning_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Map of target languages to MarianMT checkpoint names for English-to-X translation
translation_models = {
    "Spanish": "Helsinki-NLP/opus-mt-en-es",
    "German": "Helsinki-NLP/opus-mt-en-de",
    "Chinese": "Helsinki-NLP/opus-mt-en-zh",
    "Japanese": "Helsinki-NLP/opus-mt-en-ja",
    "Russian": "Helsinki-NLP/opus-mt-en-ru",
    "Arabic": "Helsinki-NLP/opus-mt-en-ar",
    "Hindi": "Helsinki-NLP/opus-mt-en-hi",
    "Urdu": "Helsinki-NLP/opus-mt-en-ur"
}

# Load translation models and tokenizers; languages that fail to load are skipped
loaded_translation_models = {}
loaded_translation_tokenizers = {}
for lang, model_name in translation_models.items():
    try:
        loaded_translation_models[lang] = MarianMTModel.from_pretrained(model_name)
        loaded_translation_tokenizers[lang] = MarianTokenizer.from_pretrained(model_name)
    except Exception as e:
        print(f"Error loading model for {lang}: {e}")

# Captioning function
def caption(image):
    image = image.convert("RGB")
    inputs = captioning_processor(image, return_tensors="pt")
    out = captioning_model.generate(**inputs)
    return captioning_processor.decode(out[0], skip_special_tokens=True)

# Visual Question Answering function
def qna(image, question):
    image = image.convert("RGB")
    inputs = vqa_processor(image, question, return_tensors="pt")
    out = vqa_model.generate(**inputs)
    return vqa_processor.decode(out[0], skip_special_tokens=True)

# Translation function
def translate_text(text, target_lang="Spanish"):
    model = loaded_translation_models.get(target_lang)
    tokenizer = loaded_translation_tokenizers.get(target_lang)
    if model is None or tokenizer is None:
        return f"Translation model for {target_lang} is not available."
    inputs = tokenizer(text, return_tensors="pt")
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

# Combined Captioning and Translation function
def caption_and_translate(image, target_lang="Spanish"):
    caption_text = caption(image)
    translated_caption = translate_text(caption_text, target_lang)
    return caption_text, translated_caption

# Create Gradio interfaces
interface1 = gr.Interface(fn=caption,
                          inputs=gr.components.Image(type="pil"),
                          outputs=gr.components.Textbox(label="Generated Caption by BLIP"),
                          description="BLIP Image Captioning")
interface2 = gr.Interface(fn=qna,
                          inputs=[gr.components.Image(type="pil"), gr.components.Textbox(label="Question")],
                          outputs=gr.components.Textbox(label="Answer generated by BLIP"),
                          description="BLIP Visual Question Answering of Images")
interface3 = gr.Interface(fn=caption_and_translate,
                          inputs=[gr.components.Image(type="pil"),
                                  gr.components.Dropdown(label="Target Language",
                                                         choices=["Spanish", "German", "Chinese", "Japanese", "Russian", "Arabic", "Hindi", "Urdu"])],
                          outputs=[gr.components.Textbox(label="Generated Caption"),
                                   gr.components.Textbox(label="Translated Caption")],
                          description="Image Captioning and Translation")

title = "Automated Image Captioning and Visual QnA Engine"
final_interface = gr.TabbedInterface([interface1, interface2, interface3],
                                     ["Captioning", "Visual QnA", "Captioning and Translation"],
                                     title=title, theme=gr.themes.Soft())
final_interface.launch(inbrowser=True)