Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,15 +2,16 @@ import os
|
|
2 |
import shutil
|
3 |
import zipfile
|
4 |
from pathlib import Path
|
|
|
5 |
|
6 |
import gradio as gr
|
7 |
import torch
|
8 |
from pydub import AudioSegment
|
9 |
from transformers import pipeline
|
10 |
|
11 |
-
#
|
12 |
-
#
|
13 |
-
#
|
14 |
|
15 |
MODEL_NAME = "openai/whisper-large-v3"
|
16 |
device = 0 if torch.cuda.is_available() else "cpu"
|
@@ -25,221 +26,221 @@ pipe = pipeline(
|
|
25 |
TEMP_DIR = "./temp_audio"
|
26 |
os.makedirs(TEMP_DIR, exist_ok=True)
|
27 |
|
28 |
-
#
|
29 |
-
#
|
|
|
30 |
def init_metadata_state():
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
return []
|
32 |
|
33 |
-
#
|
34 |
-
#
|
35 |
-
#
|
36 |
-
|
37 |
def transcribe_audio(audio_path):
|
38 |
"""
|
39 |
-
|
40 |
-
|
|
|
|
|
41 |
"""
|
42 |
if not audio_path:
|
43 |
-
return "Aucun fichier audio fourni
|
44 |
|
45 |
-
#
|
46 |
result = pipe(audio_path, return_timestamps="word")
|
47 |
text = result["text"]
|
48 |
-
chunks = result["chunks"] #
|
49 |
|
|
|
50 |
raw_transcription = " ".join([c["text"] for c in chunks])
|
51 |
|
52 |
-
#
|
53 |
-
|
54 |
|
|
|
55 |
|
|
|
|
|
|
|
56 |
def validate_segments(audio_path, table_data, metadata_state):
|
57 |
"""
|
58 |
-
|
59 |
-
|
60 |
-
-
|
61 |
-
- Retourne
|
62 |
-
1) Une liste de chemins (extraits audio) pour les players
|
63 |
-
2) La liste des nouvelles métadonnées (mise à jour).
|
64 |
"""
|
65 |
if not audio_path:
|
66 |
-
return [
|
67 |
|
68 |
-
#
|
69 |
if os.path.exists(TEMP_DIR):
|
70 |
shutil.rmtree(TEMP_DIR)
|
71 |
os.makedirs(TEMP_DIR, exist_ok=True)
|
72 |
|
73 |
original_audio = AudioSegment.from_file(audio_path)
|
|
|
74 |
segment_paths = []
|
75 |
updated_metadata = []
|
76 |
|
77 |
for i, row in enumerate(table_data):
|
78 |
-
# row = [
|
79 |
if len(row) < 4:
|
80 |
-
continue
|
|
|
81 |
|
82 |
-
|
83 |
-
|
84 |
continue
|
85 |
-
|
86 |
-
# Si l'utilisateur n'a pas mis d'ID, en créer un
|
87 |
if not seg_id:
|
88 |
seg_id = f"seg_{i+1:02d}"
|
89 |
|
90 |
-
# Découpe
|
91 |
start_ms = int(float(start_time) * 1000)
|
92 |
end_ms = int(float(end_time) * 1000)
|
93 |
-
|
|
|
|
|
94 |
|
95 |
-
|
96 |
-
|
97 |
-
segment_filename = f"{stem_name}_{seg_id}.wav"
|
98 |
-
segment_filepath = os.path.join(TEMP_DIR, segment_filename)
|
99 |
-
extract.export(segment_filepath, format="wav")
|
100 |
|
101 |
-
|
|
|
|
|
102 |
|
103 |
-
# Stocker
|
|
|
104 |
updated_metadata.append({
|
105 |
"audio_file": segment_filename,
|
106 |
-
"text":
|
107 |
"start_time": start_time,
|
108 |
"end_time": end_time,
|
109 |
-
"id": seg_id
|
110 |
})
|
111 |
|
|
|
112 |
return segment_paths, updated_metadata
|
113 |
|
114 |
-
|
|
|
|
|
115 |
def generate_zip(metadata_state):
|
116 |
"""
|
117 |
-
|
118 |
-
Retourne le chemin
|
119 |
"""
|
120 |
if not metadata_state:
|
121 |
return None
|
122 |
|
123 |
-
# Supprimer un ancien zip si présent
|
124 |
zip_path = os.path.join(TEMP_DIR, "dataset.zip")
|
125 |
if os.path.exists(zip_path):
|
126 |
os.remove(zip_path)
|
127 |
|
128 |
-
# Créer metadata.csv
|
129 |
metadata_csv_path = os.path.join(TEMP_DIR, "metadata.csv")
|
130 |
with open(metadata_csv_path, "w", encoding="utf-8") as f:
|
131 |
f.write("audio_file|text|speaker_name|API\n")
|
132 |
for seg in metadata_state:
|
133 |
-
# Ajuste speaker_name ou API selon ton besoin
|
134 |
line = f"{seg['audio_file']}|{seg['text']}|projectname|/API_PHONETIC/\n"
|
135 |
f.write(line)
|
136 |
|
137 |
-
#
|
138 |
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
139 |
# Ajouter chaque extrait
|
140 |
for seg in metadata_state:
|
141 |
-
|
142 |
-
if os.path.exists(
|
143 |
-
zf.write(
|
144 |
-
|
145 |
-
# Ajouter le metadata.csv
|
146 |
zf.write(metadata_csv_path, "metadata.csv")
|
147 |
|
148 |
return zip_path
|
149 |
|
150 |
-
|
151 |
-
|
|
|
|
|
152 |
"""
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
"""
|
157 |
-
|
158 |
-
|
159 |
-
for i in
|
160 |
-
if i <
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
# ------------------------
|
168 |
-
# CONSTRUCTION UI GRADIO
|
169 |
-
# ------------------------
|
170 |
with gr.Blocks(css="style.css") as demo:
|
|
|
171 |
|
172 |
-
|
173 |
metadata_state = gr.State(init_metadata_state())
|
174 |
|
175 |
-
# Étape 1 :
|
176 |
with gr.Box():
|
177 |
-
gr.Markdown("### 1. Téléversez
|
178 |
audio_input = gr.Audio(source="upload", type="filepath", label="Fichier audio")
|
179 |
|
180 |
-
#
|
181 |
raw_transcription = gr.Textbox(
|
182 |
-
label="Transcription
|
183 |
-
placeholder="Le texte
|
184 |
interactive=False
|
185 |
)
|
186 |
|
187 |
-
#
|
188 |
-
gr.Markdown("### 2. Définissez
|
189 |
-
gr.Markdown("
|
190 |
-
1) Texte (phrase ou portion copiée depuis la transcription)
|
191 |
-
2) Début (en secondes)
|
192 |
-
3) Fin (en secondes)
|
193 |
-
4) ID segment (optionnel)""")
|
194 |
-
|
195 |
table = gr.Dataframe(
|
196 |
headers=["Texte", "Début (s)", "Fin (s)", "ID"],
|
197 |
datatype=["str", "number", "number", "str"],
|
198 |
-
row_count=20,
|
199 |
col_count=4
|
200 |
)
|
201 |
-
|
202 |
validate_button = gr.Button("Valider et générer les extraits")
|
203 |
|
204 |
-
#
|
205 |
-
|
206 |
-
players = []
|
207 |
for i in range(20):
|
208 |
-
|
|
|
209 |
|
210 |
-
#
|
211 |
-
for i in range(0, 20, 4):
|
212 |
-
with gr.Row():
|
213 |
-
for j in range(i, i+4):
|
214 |
-
players[j]
|
215 |
-
|
216 |
-
# Étape 8 : Génération ZIP
|
217 |
generate_button = gr.Button("Générer le fichier ZIP")
|
218 |
-
zip_file = gr.File(label="Télécharger le ZIP
|
|
|
|
|
|
|
|
|
219 |
|
220 |
-
#
|
221 |
audio_input.change(
|
222 |
fn=transcribe_audio,
|
223 |
inputs=audio_input,
|
224 |
outputs=[raw_transcription, table, audio_input]
|
225 |
)
|
226 |
|
227 |
-
#
|
228 |
validate_button.click(
|
229 |
fn=validate_segments,
|
230 |
inputs=[audio_input, table, metadata_state],
|
231 |
-
outputs=[
|
232 |
-
players, # Les 20 players
|
233 |
-
metadata_state
|
234 |
-
],
|
235 |
-
# On va mapper la liste de segments sur 20 players
|
236 |
).then(
|
237 |
-
fn=
|
238 |
-
inputs=
|
239 |
-
outputs=
|
240 |
)
|
241 |
|
242 |
-
#
|
243 |
generate_button.click(
|
244 |
fn=generate_zip,
|
245 |
inputs=metadata_state,
|
|
|
2 |
import shutil
|
3 |
import zipfile
|
4 |
from pathlib import Path
|
5 |
+
from datetime import datetime
|
6 |
|
7 |
import gradio as gr
|
8 |
import torch
|
9 |
from pydub import AudioSegment
|
10 |
from transformers import pipeline
|
11 |
|
12 |
+
# -------------------------------------------------
|
13 |
+
# Configuration
|
14 |
+
# -------------------------------------------------
|
15 |
|
16 |
MODEL_NAME = "openai/whisper-large-v3"
|
17 |
device = 0 if torch.cuda.is_available() else "cpu"
|
|
|
26 |
TEMP_DIR = "./temp_audio"
|
27 |
os.makedirs(TEMP_DIR, exist_ok=True)
|
28 |
|
29 |
+
# -------------------------------------------------
|
30 |
+
# Gestion de l'état
|
31 |
+
# -------------------------------------------------
|
32 |
def init_metadata_state():
    """Return the initial (empty) list of validated segments.

    The list is later filled with one dict per validated segment, shaped as
    ``{"audio_file": ..., "text": ..., "start_time": ..., "end_time": ..., "id": ...}``.

    Returns:
        A new empty list on every call, so callers never share state.
    """
    return []
|
40 |
|
41 |
+
# -------------------------------------------------
|
42 |
+
# Étape 2 : Transcription avec Whisper
|
43 |
+
# -------------------------------------------------
|
|
|
44 |
def transcribe_audio(audio_path):
    """Transcribe an uploaded audio file and reset the segment table.

    Args:
        audio_path: Path of the uploaded audio file; falsy when nothing
            has been uploaded.

    Returns:
        A 3-tuple ``(raw_transcription, table_rows, audio_path)``:
        the transcription text, 20 blank rows of the form
        ``["", None, None, ""]`` for the user to fill in, and the audio
        path unchanged. When ``audio_path`` is falsy the tuple is
        ``("Aucun fichier audio fourni", [], None)``.
    """
    if not audio_path:
        return "Aucun fichier audio fourni", [], None

    # `pipe` is the module-level transformers pipeline built at import time.
    result = pipe(audio_path, return_timestamps="word")

    # Each chunk looks like {'timestamp': (start, end), 'text': ...}.
    # NOTE(review): Whisper word chunks usually carry their own leading
    # space, so they are stripped before joining to avoid doubled spaces.
    words = [chunk["text"].strip() for chunk in result.get("chunks") or []]
    raw_transcription = " ".join(word for word in words if word)
    if not raw_transcription:
        # No usable word chunks: fall back to the full text of the result.
        raw_transcription = (result.get("text") or "").strip()

    # 20 blank rows; the user fills in the segments manually.
    table_init = [["", None, None, ""] for _ in range(20)]

    return raw_transcription, table_init, audio_path
|
66 |
|
67 |
+
# -------------------------------------------------
|
68 |
+
# Étape 5 : Validation + découpe
|
69 |
+
# -------------------------------------------------
|
70 |
def _clean_cell(value):
    """Return a table cell as stripped text; None and NaN count as empty."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def _seconds_to_ms(value):
    """Convert a cell holding seconds to whole milliseconds, or None if it is not a finite number."""
    import math  # local import so the file's import block stays untouched

    try:
        seconds = float(value)
    except (TypeError, ValueError):
        return None
    return int(seconds * 1000) if math.isfinite(seconds) else None


def validate_segments(audio_path, table_data, metadata_state):
    """Cut the source audio into the segments described by the table.

    Each usable row ``[text, start_s, end_s, segment_id]`` is exported as a
    WAV file inside ``TEMP_DIR``. Rows that are empty, incomplete, have
    non-numeric times, or describe an empty or out-of-range time window
    are skipped.

    Args:
        audio_path: Path of the source audio file; falsy when none is loaded.
        table_data: Table rows, either a list of row lists or a pandas
            DataFrame (anything exposing ``itertuples``).
        metadata_state: Current metadata list; returned unchanged when
            there is no audio to cut.

    Returns:
        A 2-tuple ``(segment_paths, updated_metadata)``: the paths of the
        exported WAV files and one metadata dict per exported segment with
        keys ``audio_file``, ``text``, ``start_time``, ``end_time``, ``id``.
        When ``audio_path`` is falsy, returns ``([], metadata_state)``.
    """
    if not audio_path:
        return [], metadata_state

    # Start from an empty temp folder so stale extracts never leak through.
    if os.path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.makedirs(TEMP_DIR, exist_ok=True)

    original_audio = AudioSegment.from_file(audio_path)
    audio_length_ms = len(original_audio)
    stem_name = Path(audio_path).stem

    # Iterating a DataFrame directly yields column names, not rows.
    if table_data is None:
        rows = []
    elif hasattr(table_data, "itertuples"):
        rows = table_data.itertuples(index=False, name=None)
    else:
        rows = table_data

    segment_paths = []
    updated_metadata = []
    used_filenames = set()

    for i, row in enumerate(rows):
        row = list(row)
        if len(row) < 4:
            continue  # incomplete row

        segment_text = _clean_cell(row[0])
        start_time, end_time = row[1], row[2]
        start_ms = _seconds_to_ms(start_time)
        end_ms = _seconds_to_ms(end_time)
        if not segment_text or start_ms is None or end_ms is None:
            continue  # empty or unusable row

        # The window must be non-empty and start inside the audio.
        if start_ms < 0 or end_ms <= start_ms or start_ms >= audio_length_ms:
            continue

        # Generate an ID when the user left the cell empty.
        seg_id = _clean_cell(row[3]) or f"seg_{i+1:02d}"

        # The ID is user input: keep only filename-safe characters so it
        # cannot escape TEMP_DIR (e.g. "../x").
        file_id = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in seg_id
        )
        segment_filename = f"{stem_name}_{file_id}.wav"
        if segment_filename in used_filenames:
            # Same ID used twice: add the row number instead of overwriting.
            segment_filename = f"{stem_name}_{file_id}_{i+1:02d}.wav"
        used_filenames.add(segment_filename)
        segment_path = os.path.join(TEMP_DIR, segment_filename)

        # Slice and export.
        extract = original_audio[start_ms:end_ms]
        extract.export(segment_path, format="wav")

        segment_paths.append(segment_path)
        updated_metadata.append({
            "audio_file": segment_filename,
            "text": segment_text,
            "start_time": start_time,
            "end_time": end_time,
            "id": seg_id
        })

    return segment_paths, updated_metadata
|
127 |
|
128 |
+
# -------------------------------------------------
|
129 |
+
# Étape 8 : Génération du ZIP
|
130 |
+
# -------------------------------------------------
|
131 |
def generate_zip(metadata_state):
    """Write ``metadata.csv`` and bundle it with the extracts into a ZIP.

    The CSV is pipe-delimited with the header
    ``audio_file|text|speaker_name|API`` and one line per segment.

    Args:
        metadata_state: List of segment dicts; each must provide the
            ``audio_file`` and ``text`` keys.

    Returns:
        The path of the generated ZIP inside ``TEMP_DIR``, or ``None``
        when ``metadata_state`` is empty or ``None``.
    """
    if not metadata_state:
        return None

    os.makedirs(TEMP_DIR, exist_ok=True)
    zip_path = os.path.join(TEMP_DIR, "dataset.zip")
    metadata_csv_path = os.path.join(TEMP_DIR, "metadata.csv")

    # A "|" or a line break inside the text would shift the columns or
    # split the record, so they are replaced with spaces.
    field_breakers = str.maketrans({"|": " ", "\r": " ", "\n": " "})

    with open(metadata_csv_path, "w", encoding="utf-8") as f:
        f.write("audio_file|text|speaker_name|API\n")
        for seg in metadata_state:
            text = str(seg["text"]).translate(field_breakers)
            f.write(f"{seg['audio_file']}|{text}|projectname|/API_PHONETIC/\n")

    # Mode "w" truncates any previous archive, so no explicit delete is needed.
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        archived = set()
        for seg in metadata_state:
            audio_name = seg["audio_file"]
            if audio_name in archived:
                continue  # never store the same member twice
            file_path = os.path.join(TEMP_DIR, audio_name)
            if os.path.exists(file_path):
                zf.write(file_path, audio_name)
                archived.add(audio_name)
        zf.write(metadata_csv_path, "metadata.csv")

    return zip_path
|
162 |
|
163 |
+
# -------------------------------------------------
|
164 |
+
# Mise à jour des players (jusqu'à 20)
|
165 |
+
# -------------------------------------------------
|
166 |
+
def spread_segments_to_players(segment_paths, max_slots=20):
    """Map a list of segment paths onto a fixed number of audio players.

    Args:
        segment_paths: Paths of the generated extracts; ``None`` is treated
            as an empty list.
        max_slots: Number of players to fill (20 by default).

    Returns:
        A tuple of exactly ``max_slots`` values: the first paths in order,
        padded with ``None`` for unused players. Extra paths are dropped.
    """
    paths = list(segment_paths or [])[:max_slots]
    return tuple(paths + [None] * (max_slots - len(paths)))
|
178 |
+
|
179 |
+
# -------------------------------------------------
|
180 |
+
# Construction de l'interface Gradio
|
181 |
+
# -------------------------------------------------
|
|
|
|
|
|
|
182 |
with gr.Blocks(css="style.css") as demo:
|
183 |
+
gr.Markdown("# Application de Découpe Audio (jusqu'à 20 segments)")
|
184 |
|
185 |
+
# State global pour stocker la metadata (liste de dict)
|
186 |
metadata_state = gr.State(init_metadata_state())
|
187 |
|
188 |
+
# Étape 1 : Upload audio
|
189 |
with gr.Box():
|
190 |
+
gr.Markdown("### 1. Téléversez un fichier audio (MP3/WAV)")
|
191 |
audio_input = gr.Audio(source="upload", type="filepath", label="Fichier audio")
|
192 |
|
193 |
+
# Transcription brute
|
194 |
raw_transcription = gr.Textbox(
|
195 |
+
label="Transcription (Whisper)",
|
196 |
+
placeholder="Le texte apparaîtra ici après chargement.",
|
197 |
interactive=False
|
198 |
)
|
199 |
|
200 |
+
# Tableau de 20 lignes
|
201 |
+
gr.Markdown("### 2. Définissez vos segments (vous pouvez en remplir moins, ça fonctionnera !)")
|
202 |
+
gr.Markdown("**Colonne 1** : Texte, **Colonne 2** : Début (s), **Colonne 3** : Fin (s), **Colonne 4** : ID (optionnel)")
|
|
|
|
|
|
|
|
|
|
|
203 |
table = gr.Dataframe(
|
204 |
headers=["Texte", "Début (s)", "Fin (s)", "ID"],
|
205 |
datatype=["str", "number", "number", "str"],
|
206 |
+
row_count=20,
|
207 |
col_count=4
|
208 |
)
|
|
|
209 |
validate_button = gr.Button("Valider et générer les extraits")
|
210 |
|
211 |
+
# 20 lecteurs audio
|
212 |
+
audio_players = []
|
|
|
213 |
for i in range(20):
|
214 |
+
ap = gr.Audio(label=f"Extrait {i+1}", interactive=False)
|
215 |
+
audio_players.append(ap)
|
216 |
|
217 |
+
# Générer ZIP
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
generate_button = gr.Button("Générer le fichier ZIP")
|
219 |
+
zip_file = gr.File(label="Télécharger le ZIP")
|
220 |
+
|
221 |
+
# ----------------
|
222 |
+
# Callbacks
|
223 |
+
# ----------------
|
224 |
|
225 |
+
# Au changement d'audio => transcription
|
226 |
audio_input.change(
|
227 |
fn=transcribe_audio,
|
228 |
inputs=audio_input,
|
229 |
outputs=[raw_transcription, table, audio_input]
|
230 |
)
|
231 |
|
232 |
+
# Validation => découpe => mise à jour players
|
233 |
validate_button.click(
|
234 |
fn=validate_segments,
|
235 |
inputs=[audio_input, table, metadata_state],
|
236 |
+
outputs=["temp_paths", metadata_state],
|
|
|
|
|
|
|
|
|
237 |
).then(
|
238 |
+
fn=spread_segments_to_players,
|
239 |
+
inputs="temp_paths",
|
240 |
+
outputs=audio_players
|
241 |
)
|
242 |
|
243 |
+
# Génération ZIP
|
244 |
generate_button.click(
|
245 |
fn=generate_zip,
|
246 |
inputs=metadata_state,
|