DGutierrez81 committed on
Commit
59e1ddd
·
verified ·
1 Parent(s): 3605d59

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -0
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ from PIL import Image
4
+ import numpy as np
5
+ from datasets import load_dataset
6
+ import soundfile as sf
7
+ import torch
8
+
9
# Captioning pipeline: turns an input picture into a short text description.
image_to_text = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-large",
)

# Text-to-speech pipeline used to read the generated caption aloud.
synthesiser = pipeline(task="text-to-speech", model="microsoft/speecht5_tts")

# Row of the x-vector dataset whose embedding is used as the speaker voice.
_SPEAKER_INDEX = 7306

embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
# unsqueeze(0) adds a leading dimension of size 1 in front of the x-vector.
_speaker_xvector = embeddings_dataset[_SPEAKER_INDEX]["xvector"]
speaker_embedding = torch.tensor(_speaker_xvector).unsqueeze(0)
14
+
15
def predict_step(image):
    """Caption an image and synthesise the caption as speech.

    Args:
        image: The picture to describe. A NumPy array is converted to a PIL
            image first; any other value is handed to the captioning
            pipeline unchanged.

    Returns:
        A ``(audio_path, caption)`` tuple: the path of a WAV file holding
        the spoken caption, and the caption text.
    """
    # Imported locally so this function carries its own dependency and the
    # file's top-level import block does not need to change.
    import tempfile

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    result = image_to_text(image)
    texto = result[0]["generated_text"]

    speech = synthesiser(
        texto,
        forward_params={"speaker_embeddings": speaker_embedding},
    )

    # Write each result to its own file. A single fixed "speech.wav" is
    # shared by every request, so two overlapping calls could overwrite each
    # other's audio before it is served. delete=False keeps the file on disk
    # after the handle is closed so it can be read back by path.
    # NOTE(review): these temporary files are not removed here; confirm the
    # host cleans its temp directory or add cleanup if disk usage matters.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        audio_path = wav_file.name

    sf.write(audio_path, speech["audio"], samplerate=speech["sampling_rate"])
    return audio_path, texto
26
+
27
# Web UI: one image input, with the spoken caption (audio) and the caption
# text (textbox) as outputs, both produced by predict_step.
demo = gr.Interface(
    predict_step,
    "image",
    ["audio", "textbox"],
    title="Descripción de Imágenes",
    description="Cargue una imagen y obtenga una descripción generada por IA.",
)

# Start serving the interface.
demo.launch()