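"""Multimodal Content to Video Generator (SY23 Space, app.py).

Gradio app that turns an image, an audio clip, and/or free text into a video:
BLIP answers questions about the image, Whisper transcribes the audio, GPT-2
expands the combined description into a prompt, and ModelScope's
text-to-video-synthesis pipeline renders the video.
"""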
import gradio as gr
import torch
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM
)
from modelscope.pipelines import pipeline as ms_pipeline
from PIL import Image


def load_models():
    """Load all models used by the app."""
    # BLIP visual question answering model
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
    # Audio transcription model
    audio_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
    # Text generation model (the free GPT-2 version)
    text_generator = pipeline("text-generation", model="gpt2")
    return blip_processor, blip_model, audio_transcriber, text_generator


def analyze_image(image, blip_processor, blip_model):
    """Describe an image by asking BLIP a fixed set of VQA questions."""
    # Questions used to analyse the image
    questions = [
        "What is in the picture?",
        "What are the main colors?",
        "What is the setting or background?",
        "What is happening in the image?",
    ]
    responses = {}
    for question in questions:
        inputs = blip_processor(images=image, text=question, return_tensors="pt")
        outputs = blip_model.generate(**inputs)
        answer = blip_processor.decode(outputs[0], skip_special_tokens=True)
        responses[question] = answer

    # Assemble the answers into a single description
    description = f"This image shows {responses['What is in the picture?']}. "
    description += f"The main colors are {responses['What are the main colors?']}. "
    description += f"The setting is {responses['What is the setting or background?']}. "
    description += f"In the scene, {responses['What is happening in the image?']}."
    return description


def process_inputs(image, audio, text, models):
    """Build a prompt from the provided inputs and generate a video from it."""
    blip_processor, blip_model, audio_transcriber, text_generator = models
    final_prompt = ""

    # Analyse the image, if one was provided
    if image is not None:
        image_description = analyze_image(image, blip_processor, blip_model)
        final_prompt += f"Visual description: {image_description}\n"

    # Transcribe the audio, if provided
    if audio is not None:
        audio_text = audio_transcriber(audio)["text"]
        final_prompt += f"Audio content: {audio_text}\n"

    # Append the extra text, if provided
    if text:
        final_prompt += f"Additional context: {text}\n"

    # Generate an enhanced prompt with GPT-2
    prompt_enhancement = text_generator(
        final_prompt,
        max_length=200,
        num_return_sequences=1
    )[0]["generated_text"]

    # Create the video with ModelScope
    # (the pipeline is instantiated on every request; it could be created once
    # in load_models() to avoid reloading the model each time)
    video_pipeline = ms_pipeline(
        'text-to-video-synthesis',
        model='damo/text-to-video-synthesis'
    )
    result = video_pipeline({
        'text': prompt_enhancement,
        'output_video_path': 'output.mp4'
    })
    return 'output.mp4', prompt_enhancement


# Gradio interface
def create_interface():
    models = load_models()
    interface = gr.Interface(
        fn=lambda img, audio, txt: process_inputs(img, audio, txt, models),
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Audio(type="filepath", label="Upload Audio"),
            gr.Textbox(label="Enter Additional Text")
        ],
        outputs=[
            gr.Video(label="Generated Video"),
            gr.Textbox(label="Generated Prompt")
        ],
        title="Multimodal Content to Video Generator",
        description="Upload an image, audio, or text (or any combination) to generate a video."
    )
    return interface


# Launch the application
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()