import gradio as gr
import torch
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM
)
from modelscope.pipelines import pipeline as ms_pipeline
from PIL import Image

def load_models():
    # Load the vision-question-answering model
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

    # Audio transcription model
    audio_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")

    # Text generation model (free GPT-2 version)
    text_generator = pipeline("text-generation", model="gpt2")

    return blip_processor, blip_model, audio_transcriber, text_generator

def analyze_image(image, blip_processor, blip_model):
    # Questions used to probe the image
    questions = [
        "What is in the picture?",
        "What are the main colors?",
        "What is the setting or background?",
        "What is happening in the image?",
    ]

    # Ask BLIP each question and collect the answers
    responses = {}
    for question in questions:
        inputs = blip_processor(images=image, text=question, return_tensors="pt")
        outputs = blip_model.generate(**inputs)
        answer = blip_processor.decode(outputs[0], skip_special_tokens=True)
        responses[question] = answer

    # Stitch the answers into a single description
    description = f"This image shows {responses['What is in the picture?']}. "
    description += f"The main colors are {responses['What are the main colors?']}. "
    description += f"The setting is {responses['What is the setting or background?']}. "
    description += f"In the scene, {responses['What is happening in the image?']}."

    return description

def process_inputs(image, audio, text, models):
    blip_processor, blip_model, audio_transcriber, text_generator = models
    final_prompt = ""

    # Analyze the image if one was provided
    if image is not None:
        image_description = analyze_image(image, blip_processor, blip_model)
        final_prompt += f"Visual description: {image_description}\n"

    # Transcribe the audio if provided
    if audio is not None:
        audio_text = audio_transcriber(audio)["text"]
        final_prompt += f"Audio content: {audio_text}\n"

    # Append the free-form text if provided
    if text:
        final_prompt += f"Additional context: {text}\n"

    # Guard against an empty prompt when no input was given
    if not final_prompt:
        return None, "Please provide an image, an audio file, or some text."

    # Generate an enhanced prompt with GPT-2
    prompt_enhancement = text_generator(
        final_prompt,
        max_length=200,
        num_return_sequences=1
    )[0]["generated_text"]

    # Create the video with ModelScope
    video_pipeline = ms_pipeline(
        'text-to-video-synthesis',
        model='damo/text-to-video-synthesis'
    )

    video_pipeline({
        'text': prompt_enhancement,
        'output_video_path': 'output.mp4'
    })

    return 'output.mp4', prompt_enhancement
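
# A possible optimization, sketched here as an assumption rather than part of
# the original Space: the ModelScope pipeline is re-created on every call,
# which reloads the text-to-video weights each time. It could instead be built
# once at module level (VIDEO_PIPELINE is an arbitrary name) and reused inside
# process_inputs, e.g.:
#
#   VIDEO_PIPELINE = ms_pipeline(
#       'text-to-video-synthesis',
#       model='damo/text-to-video-synthesis'
#   )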

# Gradio interface
def create_interface():
    models = load_models()

    interface = gr.Interface(
        fn=lambda img, audio, txt: process_inputs(img, audio, txt, models),
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Audio(type="filepath", label="Upload Audio"),
            gr.Textbox(label="Enter Additional Text")
        ],
        outputs=[
            gr.Video(label="Generated Video"),
            gr.Textbox(label="Generated Prompt")
        ],
        title="Multimodal Content to Video Generator",
        description="Upload an image, audio, or text (or any combination) to generate a video."
    )
    return interface

# Launch the application
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
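
# The Space's requirements.txt is not shown here; a minimal sketch based on the
# imports above (exact packages and pins are assumptions, and the ModelScope
# text-to-video pipeline may pull in further dependencies):
#
#   gradio
#   torch
#   transformers
#   modelscope
#   Pillow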