import gradio as gr
import torch
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM
)
from modelscope.pipelines import pipeline as ms_pipeline
from PIL import Image

def load_models():
    # Load the vision-question-answering model
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

    # Audio transcription model
    audio_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")

    # Text generation model (free GPT-2 version)
    text_generator = pipeline("text-generation", model="gpt2")

    return blip_processor, blip_model, audio_transcriber, text_generator

def analyze_image(image, blip_processor, blip_model):
    # Questions used to probe the image
    questions = [
        "What is in the picture?",
        "What are the main colors?",
        "What is the setting or background?",
        "What is happening in the image?",
    ]

    # Ask BLIP each question and collect the answers
    responses = {}
    for question in questions:
        inputs = blip_processor(images=image, text=question, return_tensors="pt")
        outputs = blip_model.generate(**inputs)
        answer = blip_processor.decode(outputs[0], skip_special_tokens=True)
        responses[question] = answer

    # Stitch the answers into a single description
    description = f"This image shows {responses['What is in the picture?']}. "
    description += f"The main colors are {responses['What are the main colors?']}. "
    description += f"The setting is {responses['What is the setting or background?']}. "
    description += f"In the scene, {responses['What is happening in the image?']}."

    return description

def process_inputs(image, audio, text, models):
    blip_processor, blip_model, audio_transcriber, text_generator = models
    final_prompt = ""

    # Analyze the image if one was provided
    if image is not None:
        image_description = analyze_image(image, blip_processor, blip_model)
        final_prompt += f"Visual description: {image_description}\n"

    # Transcribe the audio if provided
    if audio is not None:
        audio_text = audio_transcriber(audio)["text"]
        final_prompt += f"Audio content: {audio_text}\n"

    # Append the free-form text if provided
    if text:
        final_prompt += f"Additional context: {text}\n"

    # Guard against an empty prompt when no input was given
    if not final_prompt:
        return None, "Please provide an image, an audio file, or some text."

    # Generate an enhanced prompt with GPT-2
    prompt_enhancement = text_generator(
        final_prompt,
        max_length=200,
        num_return_sequences=1
    )[0]["generated_text"]

    # Create the video with ModelScope
    video_pipeline = ms_pipeline(
        'text-to-video-synthesis',
        model='damo/text-to-video-synthesis'
    )

    video_pipeline({
        'text': prompt_enhancement,
        'output_video_path': 'output.mp4'
    })

    return 'output.mp4', prompt_enhancement
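
# A possible optimization, sketched here as an assumption rather than part of
# the original Space: the ModelScope pipeline is re-created on every call,
# which reloads the text-to-video weights each time. It could instead be built
# once at module level (VIDEO_PIPELINE is an arbitrary name) and reused inside
# process_inputs, e.g.:
#
#   VIDEO_PIPELINE = ms_pipeline(
#       'text-to-video-synthesis',
#       model='damo/text-to-video-synthesis'
#   )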

# Gradio interface
def create_interface():
    models = load_models()

    interface = gr.Interface(
        fn=lambda img, audio, txt: process_inputs(img, audio, txt, models),
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Audio(type="filepath", label="Upload Audio"),
            gr.Textbox(label="Enter Additional Text")
        ],
        outputs=[
            gr.Video(label="Generated Video"),
            gr.Textbox(label="Generated Prompt")
        ],
        title="Multimodal Content to Video Generator",
        description="Upload an image, audio, or text (or any combination) to generate a video."
    )
    return interface

# Launch the application
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
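
# The Space's requirements.txt is not shown here; a minimal sketch based on the
# imports above (exact packages and pins are assumptions, and the ModelScope
# text-to-video pipeline may pull in further dependencies):
#
#   gradio
#   torch
#   transformers
#   modelscope
#   Pillow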