from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering, MarianMTModel, MarianTokenizer
import gradio as gr
import torch
import warnings
warnings.filterwarnings("ignore")
# Load the BLIP checkpoints: one for image captioning, one for visual question answering
captioning_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
captioning_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
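# torch is imported above but the models run on CPU by default. A minimal, optional
# sketch for GPU placement (assumes a CUDA device is available):
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   captioning_model.to(device)
#   vqa_model.to(device)
# The input tensors built in each function below would then also need .to(device).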
# Map each target language to its Helsinki-NLP MarianMT model name
translation_models = {
    "Spanish": 'Helsinki-NLP/opus-mt-en-es',
    "German": 'Helsinki-NLP/opus-mt-en-de',
    "Chinese": 'Helsinki-NLP/opus-mt-en-zh',
    "Japanese": 'Helsinki-NLP/opus-mt-en-jap',  # the Hub publishes the en->ja pair as "en-jap"
    "Russian": 'Helsinki-NLP/opus-mt-en-ru',
    "Arabic": 'Helsinki-NLP/opus-mt-en-ar',
    "Hindi": 'Helsinki-NLP/opus-mt-en-hi',
    "Urdu": 'Helsinki-NLP/opus-mt-en-ur'
}
# Load translation models and tokenizers
loaded_translation_models = {}
loaded_translation_tokenizers = {}
for lang, model_name in translation_models.items():
    try:
        loaded_translation_models[lang] = MarianMTModel.from_pretrained(model_name)
        loaded_translation_tokenizers[lang] = MarianTokenizer.from_pretrained(model_name)
        print(f"Successfully loaded translation model for {lang}")
    except Exception as e:
        print(f"Error loading model for {lang}: {e}")
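# Languages whose model fails to download are simply skipped here;
# translate_text() reports them as unavailable at request time.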
# Captioning function
def caption(image):
    image = image.convert("RGB")
    inputs = captioning_processor(image, return_tensors="pt")
    out = captioning_model.generate(**inputs)
    return captioning_processor.decode(out[0], skip_special_tokens=True)
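# Illustrative usage (hypothetical file path and output):
#   caption(Image.open("photo.jpg"))  # -> e.g. "a dog sitting on a couch"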
# Visual Question Answering function
def qna(image, question):
    image = image.convert("RGB")
    inputs = vqa_processor(image, question, return_tensors="pt")
    out = vqa_model.generate(**inputs)
    return vqa_processor.decode(out[0], skip_special_tokens=True)
# Translation function
def translate_text(text, target_lang="Spanish"):
    model = loaded_translation_models.get(target_lang)
    tokenizer = loaded_translation_tokenizers.get(target_lang)
    if model is None or tokenizer is None:
        return f"Translation model for {target_lang} is not available."
    inputs = tokenizer(text, return_tensors="pt")
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)
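# For long inputs, passing truncation=True to the tokenizer call above keeps the
# source within Marian's maximum sequence length (typically 512 tokens).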
# Combined Captioning and Translation function
def caption_and_translate(image, target_lang="Spanish"):
    caption_text = caption(image)
    print(f"Generated caption: {caption_text}")
    translated_caption = translate_text(caption_text, target_lang)
    print(f"Translated caption: {translated_caption}")
    return caption_text, translated_caption
# Create Gradio interfaces
interface1 = gr.Interface(fn=caption,
                          inputs=gr.components.Image(type="pil"),
                          outputs=gr.components.Textbox(label="Generated Caption by BLIP"),
                          description="BLIP Image Captioning")
interface2 = gr.Interface(fn=qna,
                          inputs=[gr.components.Image(type="pil"), gr.components.Textbox(label="Question")],
                          outputs=gr.components.Textbox(label="Answer generated by BLIP"),
                          description="BLIP Visual Question Answering of Images")
interface3 = gr.Interface(fn=caption_and_translate,
                          inputs=[gr.components.Image(type="pil"),
                                  gr.components.Dropdown(label="Target Language",
                                                         choices=list(translation_models.keys()))],
                          outputs=[gr.components.Textbox(label="Generated Caption"),
                                   gr.components.Textbox(label="Translated Caption")],
                          description="Image Captioning and Translation")
title = "Automated Image Captioning and Visual QnA Engine"
final_interface = gr.TabbedInterface([interface1, interface2, interface3],
                                     ["Captioning", "Visual QnA", "Captioning and Translation"],
                                     title=title, theme=gr.themes.Soft())
final_interface.launch(inbrowser=True)
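# inbrowser=True opens a browser tab automatically for local runs; it has no
# effect in hosted environments such as Hugging Face Spaces.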