DGutierrez81 committed on
Commit c6eb91a · verified · 1 Parent(s): e120916

Update app.py

Files changed (1)
  1. app.py +89 -18
app.py CHANGED
@@ -1,35 +1,106 @@
  import gradio as gr
- from transformers import pipeline
  from PIL import Image
- import numpy as np
  from datasets import load_dataset
- import soundfile as sf
  import torch

  image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
- synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")

  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
  speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

- def predict_step(image):
-     if isinstance(image, np.ndarray):
-         image = Image.fromarray(image)

-
      result = image_to_text(image)

-     texto = result[0]['generated_text']
-     speech = synthesiser(texto, forward_params={"speaker_embeddings": speaker_embedding})
      sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
-     return "speech.wav", texto
-
- demo = gr.Interface(
-     fn=predict_step,
-     inputs="image",
-     outputs=["audio", "textbox"],
-     title="Descripción de Imágenes",
-     description="Cargue una imagen y obtenga una descripción generada por IA."
  )

  demo.launch()
 
  import gradio as gr
+ import requests
  from PIL import Image
+ from io import BytesIO
+ from transformers import pipeline
  from datasets import load_dataset
  import torch
+ import soundfile as sf
+
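+ # BLIP image captioning and SpeechT5 text-to-speech; the CMU Arctic x-vector supplies the speaker voice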
  image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+ synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")

  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
  speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

+ # Query TheCocktailDB once at startup for margarita cocktails to populate the radio choices
+ url = "https://www.thecocktaildb.com/api/json/v1/1/search.php?s=margarita"
+ response = requests.get(url)
+ lista = []
+
+ if response.status_code == 200:
+     datos = response.json()
+     drinks = datos.get("drinks", [])
+     for drink in drinks:
+         lista.append(drink['strDrink'])
+ else:
+     print(f"Error: {response.status_code}")
+
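+ # Handler for the radio control: look up the chosen cocktail, caption its photo, and voice both texts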
+ def change_textbox(choice):
+     cocktail = requests.get(f"https://www.thecocktaildb.com/api/json/v1/1/search.php?s={choice}")
+     data = cocktail.json()
+     dataCocktail = data.get("drinks", [])
+
+     for i in dataCocktail:
+         if i['strDrink'].lower() == choice.lower():
+             name = i['strDrink']
+             instructions = i['strInstructions']
+             image_url = i['strDrinkThumb']
+             break
+
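+     # Show the instructions in a Textbox and download the drink's thumbnail image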
+     textInstructions = gr.Textbox(instructions)
+
+     img_response = requests.get(image_url)
+     image = Image.open(BytesIO(img_response.content)).convert("RGB")

+     # Caption the cocktail photo with the BLIP pipeline
      result = image_to_text(image)
+     descripcion = result[0]['generated_text']

+     # Voice the recipe instructions, then the generated caption
+     speech = synthesiser(instructions, forward_params={"speaker_embeddings": speaker_embedding})
      sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
+     audio_path = "speech.wav"
+
+     speech2 = synthesiser(descripcion, forward_params={"speaker_embeddings": speaker_embedding})
+     sf.write("speech2.wav", speech2["audio"], samplerate=speech2["sampling_rate"])
+     audio_path2 = "speech2.wav"
+
+     return name, image, textInstructions, audio_path, descripcion, audio_path2
+
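+ # Build the interface: custom CSS, a title, the cocktail picker, and the output components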
+ with gr.Blocks() as demo:
+     gr.HTML(
+         """
+         <style>
+         /* Change the background of the whole page */
+         body {
+             background-color: #000000; /* black background */
+             color: #ffffff; /* white text */
+             font-family: Arial, sans-serif; /* change the global font */
+             margin: 0;
+             padding: 0;
+             text-align: center; /* center all text on the page */
+         }
+         /* Change the background of the main container */
+         .gradio-container {
+             background-color: #000000; /* black background */
+             padding: 20px;
+             border-radius: 10px;
+             display: flex;
+             flex-direction: column;
+             align-items: center; /* center the elements inside the container */
+             justify-content: center;
+         }
+         /* Center the content inside the blocks */
+         .gradio-container .gradio-radio {
+             display: inline-block;
+             margin: 10px;
+             text-align: center;
+         }
+         </style>
+         """
+     )
+
+     gr.Markdown(
+         """<h1 style="text-align: center; color: #ffffff;">Cocktails Descriptions</h1>"""
      )
+
+     radio = gr.Radio(lista, label="Choose your cocktail:")
+     text = gr.Textbox(lines=2, interactive=False, show_copy_button=True, label="Cocktail Name")
+     imagen = gr.Image(label="Cocktail Image")
+     text2 = gr.Textbox(lines=2, interactive=False, show_copy_button=True, label="Instructions")
+     audio = gr.Audio(label="Cocktail Instructions Audio")
+     text3 = gr.Textbox(lines=2, interactive=False, show_copy_button=True, label="Image description")
+     audio2 = gr.Audio(label="Audio image description")
+
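+     # Refresh every output whenever a different cocktail is selected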
+     radio.change(fn=change_textbox, inputs=radio, outputs=[text, imagen, text2, audio, text3, audio2])

  demo.launch()