import gradio as gr
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    pipeline,
)
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MultimodalProcessor:
    def __init__(self):
        self.load_models()

    def load_models(self):
        """Load the models, with error handling."""
        try:
            logger.info("Loading models...")
            # BLIP for visual question answering
            self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
            self.blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
            # Whisper for speech recognition
            self.audio_transcriber = pipeline(
                "automatic-speech-recognition", model="openai/whisper-small"
            )
            # GPT-2 for text generation
            self.text_generator = pipeline("text-generation", model="gpt2")
            logger.info("Models loaded successfully")
        except Exception as e:
            logger.error(f"Error while loading models: {str(e)}")
            raise
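
    # Note: everything above runs on CPU by default. On a CUDA machine one
    # could (untested sketch, not part of the original app) move BLIP to the
    # GPU and point the pipelines at device 0, e.g.:
    #   self.blip_model.to("cuda")
    #   pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0)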

    def analyze_image(self, image):
        """Analyze an image and return a description."""
        try:
            if image is None:
                return ""
            questions = [
                "What is in the picture?",
                "What are the main colors?",
                "What is the setting or background?",
                "What is happening in the image?",
            ]
            # Ask BLIP each question in turn and collect the answers
            responses = {}
            for question in questions:
                inputs = self.blip_processor(images=image, text=question, return_tensors="pt")
                outputs = self.blip_model.generate(**inputs)
                answer = self.blip_processor.decode(outputs[0], skip_special_tokens=True)
                responses[question] = answer
            # Stitch the four answers into a single description
            description = (
                f"This image shows {responses['What is in the picture?']}. "
                f"The main colors are {responses['What are the main colors?']}. "
                f"The setting is {responses['What is the setting or background?']}. "
                f"In the scene, {responses['What is happening in the image?']}"
            )
            return description
        except Exception as e:
            logger.error(f"Error during image analysis: {str(e)}")
            return "Error during image analysis."

    def transcribe_audio(self, audio_path):
        """Transcribe an audio file."""
        try:
            if audio_path is None:
                return ""
            return self.audio_transcriber(audio_path)["text"]
        except Exception as e:
            logger.error(f"Error during audio transcription: {str(e)}")
            return "Error during audio transcription."

    def generate_text(self, prompt):
        """Generate text from a prompt."""
        try:
            if not prompt:
                return ""
            response = self.text_generator(
                prompt, max_length=200, num_return_sequences=1
            )[0]["generated_text"]
            return response
        except Exception as e:
            logger.error(f"Error during text generation: {str(e)}")
            return "Error during text generation."

    def process_inputs(self, image, audio, text):
        """Process the multimodal inputs."""
        try:
            # Image analysis
            image_description = self.analyze_image(image) if image is not None else ""
            # Audio transcription
            audio_text = self.transcribe_audio(audio) if audio is not None else ""
            # Combine the inputs into one prompt
            combined_input = ""
            if image_description:
                combined_input += f"Visual description: {image_description}\n"
            if audio_text:
                combined_input += f"Audio content: {audio_text}\n"
            if text:
                combined_input += f"Additional context: {text}\n"
            # Generate the final output
            if combined_input:
                final_prompt = self.generate_text(combined_input)
            else:
                final_prompt = "No input provided."
            return final_prompt
        except Exception as e:
            logger.error(f"Error while processing inputs: {str(e)}")
            return "An error occurred while processing the inputs."

def create_interface():
    """Create the Gradio interface."""
    processor = MultimodalProcessor()
    interface = gr.Interface(
        fn=processor.process_inputs,
        inputs=[
            gr.Image(type="pil", label="Upload an image"),
            gr.Audio(type="filepath", label="Upload an audio file"),
            gr.Textbox(label="Enter additional text"),
        ],
        outputs=[
            gr.Textbox(label="Generated description"),
        ],
        title="Multimodal Content Analyzer",
        description="""
        This application analyzes your multimodal content:
        - Images: generates a detailed description
        - Audio: transcribes the content
        - Text: enriches the description
        The output combines all of this information into one coherent description.
        """,
    )
    return interface

if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
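
# Example of driving the processor without the UI (hypothetical usage; the
# file names are placeholders):
#   from PIL import Image
#   processor = MultimodalProcessor()
#   result = processor.process_inputs(
#       Image.open("photo.jpg"), "recording.wav", "Taken at the beach"
#   )
#   print(result)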