Ngoufack committed
Commit 628e92e · 1 Parent(s): 09e94bd
Files changed (2)
  1. app.py +184 -138
  2. requirements.txt +147 -13
app.py CHANGED
@@ -1,143 +1,189 @@
- import spaces
- import torch
- import torchaudio
- import gradio as gr
- import yt_dlp as youtube_dl
- from faster_whisper import WhisperModel
- from transformers.pipelines.audio_utils import ffmpeg_read
- import tempfile
  import os
- from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
- from pyannote.audio import Model
- from pyannote.core import Segment
- from transformers.pipelines.audio_utils import ffmpeg_read
- from pyannote.audio import Pipeline
-
- MODEL_NAME = "medium"
- BATCH_SIZE = 8
- FILE_LIMIT_MB = 1000
- YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model = WhisperModel(MODEL_NAME, device=device, compute_type="float16" if torch.cuda.is_available() else "int8")
-
- #model_pyannote = Model.from_pretrained("pyannote/speaker-diarization")
- pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", os.getenv('HF_TOKEN'))
-
- #pipeline = SpeakerDiarization(model_pyannote)
-
-
- @spaces.GPU
- def transcribe(inputs, task):
-     if inputs is None:
-         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-
-     waveform, sample_rate = torchaudio.load(inputs)
-     if sample_rate != 16000:
-         transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-         waveform = transform(waveform)
-
-     segments, _ = model.transcribe(inputs, task=task)
-     transcription = "\n".join([segment.text for segment in segments])
-     print(transcription)
-     # Diarization with the pyannote pipeline
-     diarization = pipeline(inputs)
-     speaker_segments = []
-     for segment, _, speaker in diarization.itertracks(yield_label=True):
-         speaker_segments.append((segment.start, segment.end, speaker))
-
-     # Match transcription segments to their speakers
-     speaker_texts = []
-     for start, end, speaker in speaker_segments:
-         spoken_text = " ".join([seg.text for seg in segments if seg.start >= start and seg.end <= end])
-         if spoken_text:
-             speaker_texts.append(f"{speaker}: {spoken_text}")
-     return "\n".join(speaker_texts)
-
- def _return_yt_html_embed(yt_url):
-     video_id = yt_url.split("?v=")[-1]
-     HTML_str = (
-         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-         " </center>"
-     )
-     return HTML_str
-
- def download_yt_audio(yt_url, filename):
-     info_loader = youtube_dl.YoutubeDL()
-
      try:
-         info = info_loader.extract_info(yt_url, download=False)
-     except youtube_dl.utils.DownloadError as err:
-         raise gr.Error(str(err))

-     file_length_s = info["duration"]
-     if file_length_s > YT_LENGTH_LIMIT_S:
-         raise gr.Error(f"Maximum YouTube length is {YT_LENGTH_LIMIT_S} seconds, got {file_length_s} seconds.")
-
-     ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
-     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-         try:
-             ydl.download([yt_url])
-         except youtube_dl.utils.ExtractorError as err:
-             raise gr.Error(str(err))
-
- @spaces.GPU
- def yt_transcribe(yt_url, task):
-     html_embed_str = _return_yt_html_embed(yt_url)
-
-     with tempfile.TemporaryDirectory() as tmpdirname:
-         filepath = os.path.join(tmpdirname, "video.mp4")
-         download_yt_audio(yt_url, filepath)
-         with open(filepath, "rb") as f:
-             inputs = f.read()

-     inputs = ffmpeg_read(inputs, 16000)  # convert to 16 kHz
-     segments, _ = model.transcribe(inputs, task=task)
-     text = " ".join([segment.text for segment in segments])

-     return html_embed_str, text
-
- demo = gr.Blocks(theme=gr.themes.Ocean())
-
- mf_transcribe = gr.Interface(
-     fn=transcribe,
-     inputs=[
-         gr.Audio(sources="microphone", type="filepath"),
-         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-     ],
-     outputs="text",
-     title="VerbaLend Demo 1 : Prototype",
-     description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses Faster Whisper"
-     ),
-     allow_flagging="never",
- )
-
- file_transcribe = gr.Interface(
-     fn=transcribe,
-     inputs=[
-         gr.Audio(sources="upload", type="filepath", label="Audio file"),
-         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-     ],
-     outputs="text",
-     title="VerbaLend Demo 1 : Prototype",
-     description="Transcribe uploaded audio files with Faster Whisper.",
-     allow_flagging="never",
- )
-
- yt_transcribe = gr.Interface(
-     fn=yt_transcribe,
-     inputs=[
-         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
-     ],
-     outputs=["html", "text"],
-     title="VerbaLend Demo 1 : Prototype",
-     description="Transcribe YouTube videos using Faster Whisper.",
-     allow_flagging="never",
- )
-
- with demo:
-     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
-
- demo.queue().launch(ssr_mode=False)
  import os
+ import tempfile
+ import time
+ import gradio as gr
+ import whisperx
+ import torch
+ from docx import Document
+ from docx.shared import RGBColor
+ import numpy as np
+ import soundfile as sf
+ from datetime import date
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+ # Get Hugging Face token from environment variables
+ HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
+
+ # Set device for computation
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ COMPUTE_TYPE = "float16" if torch.cuda.is_available() else "int8"
+
+ # Speaker colors for docx export
+ SPEAKER_COLORS = {
+     "SPEAKER_00": RGBColor(255, 0, 0),      # Red
+     "SPEAKER_01": RGBColor(0, 0, 255),      # Blue
+     "SPEAKER_02": RGBColor(0, 128, 0),      # Green
+     "SPEAKER_03": RGBColor(128, 0, 128),    # Purple
+     "SPEAKER_04": RGBColor(255, 165, 0),    # Orange
+     "SPEAKER_05": RGBColor(0, 128, 128),    # Teal
+     "SPEAKER_06": RGBColor(139, 69, 19),    # Brown
+     "SPEAKER_07": RGBColor(105, 105, 105),  # Gray
+     "SPEAKER_08": RGBColor(255, 20, 147),   # Pink
+     "SPEAKER_09": RGBColor(0, 191, 255),    # Sky Blue
+ }
+
+ def format_time(seconds):
+     """Convert seconds to HH:MM:SS format."""
+     hours = int(seconds // 3600)
+     minutes = int((seconds % 3600) // 60)
+     seconds = int(seconds % 60)
+     return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
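
A quick sanity check of the helper above (values verified by hand):

    format_time(3725)  # "01:02:05" -- 3725 s = 1 h 2 min 5 s
    format_time(59.9)  # "00:00:59" -- fractional seconds are truncated, not rounded
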
+ def transcribe_audio(audio_path, model_name="large-v2"):
+     """Transcribe audio using WhisperX and identify speakers."""
      try:
+         # 1. Load and transcribe audio with whisperx
+         model = whisperx.load_model(model_name, DEVICE, compute_type=COMPUTE_TYPE)
+         audio = whisperx.load_audio(audio_path)
+         result = model.transcribe(audio, batch_size=16)
+
+         # 2. Align whisper output
+         model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=DEVICE)
+         result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)
+
+         # 3. Assign speaker labels
+         diarize_model = whisperx.DiarizationPipeline(use_auth_token=HUGGINGFACE_TOKEN, device=DEVICE)
+         diarize_segments = diarize_model(audio, min_speakers=1, max_speakers=10)
+         result = whisperx.assign_word_speakers(diarize_segments, result)
+
+         return result, None
+     except Exception as e:
+         return None, str(e)
+
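
For readers new to WhisperX: the `result` returned above is a plain dict whose "segments" carry start/end times, text, and, after assign_word_speakers, a per-segment "speaker" label. A minimal sketch of the shape the functions below consume (field values are illustrative, not real output):

    result = {
        "segments": [
            {"start": 0.0, "end": 3.2, "text": " Hello there.", "speaker": "SPEAKER_00"},
            {"start": 3.4, "end": 6.1, "text": " Hi, how are you?", "speaker": "SPEAKER_01"},
        ],
    }
    # Segments that diarization could not attribute may lack the "speaker" key,
    # which is why the consumers below use segment.get("speaker", "UNKNOWN").
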
+ def export_to_docx(result, output_path=None):
+     """Export transcription to DOCX with timecodes and color-coded speakers."""
+     if output_path is None:
+         output_path = os.path.join(tempfile.gettempdir(), f"transcript_{int(time.time())}.docx")
+
+     formatted_date = date.today().strftime("%d/%m/%Y")
+     doc = Document()
+     doc.add_heading('Transcription ' + formatted_date, 0)
+
+     for segment in result["segments"]:
+         start_time = format_time(segment["start"])
+         end_time = format_time(segment["end"])
+
+         # Create a paragraph for this segment
+         p = doc.add_paragraph()
+         p.add_run(f"[{start_time} - {end_time}] ").bold = True
+
+         speaker = segment.get("speaker", "UNKNOWN")
+         if speaker in SPEAKER_COLORS:
+             speaker_run = p.add_run(f"{speaker}: ")
+             speaker_run.font.color.rgb = SPEAKER_COLORS[speaker]
+             speaker_run.bold = True
+         else:
+             p.add_run(f"{speaker}: ").bold = True
+
+         p.add_run(segment["text"])
+
+     doc.save(output_path)
+     return output_path
+
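
A minimal usage sketch for the exporter, reusing the illustrative segment shape from above:

    sample = {"segments": [{"start": 0.0, "end": 2.5, "text": " Hello.", "speaker": "SPEAKER_00"}]}
    path = export_to_docx(sample)  # defaults to transcript_<timestamp>.docx in the temp dir
    print(path)
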
+ def save_audio(audio_data, sample_rate):
+     """Save the recorded audio to a temporary file."""
+     filename = f"recorded_audio_{int(time.time())}.wav"
+     temp_file = os.path.join(tempfile.gettempdir(), filename)
+     sf.write(temp_file, audio_data, sample_rate)
+     return temp_file, filename
+
+ def process_audio(audio_input=None, model_name="large-v2"):
+     """Process the audio file or recording."""
+     try:
+         if audio_input is None:
+             return None, None, "No audio provided", None
+
+         # Determine if it's a file path (upload) or tuple (recording)
+         # (with the gr.Audio component below using type="filepath", both uploads and
+         # recordings arrive as paths, so the tuple branch is only a raw-array fallback)
+         if isinstance(audio_input, tuple) and len(audio_input) >= 2:
+             # Handle recorded audio
+             filepath, filename = save_audio(audio_input[0], audio_input[1])
+             is_temp_file = True
+         else:
+             # Handle uploaded file
+             filepath = audio_input
+             filename = os.path.basename(filepath) if filepath else None
+             is_temp_file = False
+
+         # Transcribe audio
+         result, error = transcribe_audio(filepath, model_name)
+         if error:
+             return None, None, f"Transcription error: {error}", None
+
+         # Export to DOCX
+         docx_path = export_to_docx(result)
+
+         # Prepare display table
+         table_data = []
+         for segment in result["segments"]:
+             start_time = format_time(segment["start"])
+             end_time = format_time(segment["end"])
+             speaker = segment.get("speaker", "UNKNOWN")
+             text = segment["text"]
+             table_data.append([f"{start_time} - {end_time}", speaker, text])
+
+         # Prepare audio for download
+         if is_temp_file:
+             download_path = (filepath, filename)
+         else:
+             # For uploaded files there is no need to offer a download, since the user already has the file
+             download_path = None
+
+         return table_data, docx_path, "Transcription completed successfully", download_path
+     except Exception as e:
+         return None, None, f"Error: {str(e)}", None
+
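
`process_audio` can also be exercised outside Gradio; a sketch, assuming a local file named meeting.wav (hypothetical) exists:

    table, docx_path, status, download = process_audio("meeting.wav", model_name="base")
    print(status)
    for row in table or []:
        print(row)  # ["HH:MM:SS - HH:MM:SS", speaker, text]
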
+ def create_interface():
+     """Create the Gradio interface."""
+     with gr.Blocks(title="WhisperX Transcription") as interface:
+         gr.Markdown("# 🎙️ Audio Transcription with Speaker Identification")
+         gr.Markdown("Upload an audio file or record directly to transcribe and identify speakers.")
+
+         with gr.Row():
+             with gr.Column():
+                 gr.Markdown("### Input")
+                 with gr.Tab("Audio Input"):
+                     audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio (MP3 or WAV)")
+
+                 model_dropdown = gr.Dropdown(
+                     choices=["large-v2", "large-v3", "medium", "small", "base", "tiny"],
+                     value="large-v2",
+                     label="Whisper Model",
+                     info="Larger models are more accurate but require more computational resources"
+                 )
+
+                 process_btn = gr.Button("Transcribe", variant="primary")
+
+             with gr.Column():
+                 gr.Markdown("### Results")
+                 status = gr.Textbox(label="Status", interactive=False)
+                 transcription = gr.DataFrame(
+                     headers=["Time", "Speaker", "Text"],
+                     label="Transcription Results"
+                 )
+
+                 with gr.Row():
+                     docx_output = gr.File(label="DOCX Export")
+                     audio_download = gr.File(label="Download Recorded Audio")
+
+         process_btn.click(
+             fn=process_audio,
+             inputs=[audio_file, model_dropdown],
+             outputs=[transcription, docx_output, status, audio_download]
+         )
+
+     return interface
+
+ interface = create_interface()
+ interface.queue().launch(ssr_mode=False)
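
Before launching, the gated pyannote diarization weights pulled in by whisperx.DiarizationPipeline require an HF_TOKEN with access granted on the Hub; a small preflight sketch (assumption: the token lives in a local .env file, as load_dotenv() expects):

    import os
    import torch
    from dotenv import load_dotenv

    load_dotenv()
    assert os.getenv("HF_TOKEN"), "HF_TOKEN missing -- needed for the gated pyannote diarization model"
    print("device:", "cuda" if torch.cuda.is_available() else "cpu")
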
requirements.txt CHANGED
@@ -1,13 +1,147 @@
- transformers
- yt-dlp
- torch
- torchvision
- torchaudio
- nemo_toolkit
- faster-whisper
- ctranslate2
- intervaltree
- srt
- torch
- dotenv
- pyannote.audio
+ aiofiles==23.2.1 ; python_version >= "3.10" and python_version < "3.12"
+ aiohappyeyeballs==2.4.6 ; python_version >= "3.10" and python_version < "3.12"
+ aiohttp==3.11.13 ; python_version >= "3.10" and python_version < "3.12"
+ aiosignal==1.3.2 ; python_version >= "3.10" and python_version < "3.12"
+ alembic==1.14.1 ; python_version >= "3.10" and python_version < "3.12"
+ annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "3.12"
+ antlr4-python3-runtime==4.9.3 ; python_version >= "3.10" and python_version < "3.12"
+ anyio==4.8.0 ; python_version >= "3.10" and python_version < "3.12"
+ asteroid-filterbanks==0.4.0 ; python_version >= "3.10" and python_version < "3.12"
+ async-timeout==5.0.1 ; python_version == "3.10"
+ attrs==25.1.0 ; python_version >= "3.10" and python_version < "3.12"
+ av==14.2.0 ; python_version >= "3.10" and python_version < "3.12"
+ certifi==2025.1.31 ; python_version >= "3.10" and python_version < "3.12"
+ cffi==1.17.1 ; python_version >= "3.10" and python_version < "3.12"
+ charset-normalizer==3.4.1 ; python_version >= "3.10" and python_version < "3.12"
+ click==8.1.8 ; python_version >= "3.10" and python_version < "3.12"
+ colorama==0.4.6 ; python_version >= "3.10" and python_version < "3.12" and (platform_system == "Windows" or sys_platform == "win32")
+ coloredlogs==15.0.1 ; python_version >= "3.10" and python_version < "3.12"
+ colorlog==6.9.0 ; python_version >= "3.10" and python_version < "3.12"
+ contourpy==1.3.1 ; python_version >= "3.10" and python_version < "3.12"
+ ctranslate2==4.4.0 ; python_version >= "3.10" and python_version < "3.12"
+ cycler==0.12.1 ; python_version >= "3.10" and python_version < "3.12"
+ docopt==0.6.2 ; python_version >= "3.10" and python_version < "3.12"
+ einops==0.8.1 ; python_version >= "3.10" and python_version < "3.12"
+ exceptiongroup==1.2.2 ; python_version == "3.10"
+ fastapi==0.115.11 ; python_version >= "3.10" and python_version < "3.12"
+ faster-whisper==1.1.0 ; python_version >= "3.10" and python_version < "3.12"
+ ffmpy==0.5.0 ; python_version >= "3.10" and python_version < "3.12"
+ filelock==3.17.0 ; python_version >= "3.10" and python_version < "3.12"
+ flatbuffers==25.2.10 ; python_version >= "3.10" and python_version < "3.12"
+ fonttools==4.56.0 ; python_version >= "3.10" and python_version < "3.12"
+ frozenlist==1.5.0 ; python_version >= "3.10" and python_version < "3.12"
+ fsspec==2025.2.0 ; python_version >= "3.10" and python_version < "3.12"
+ gradio-client==1.3.0 ; python_version >= "3.10" and python_version < "3.12"
+ gradio==4.44.1 ; python_version >= "3.10" and python_version < "3.12"
+ greenlet==3.1.1 ; python_version >= "3.10" and python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32")
+ h11==0.14.0 ; python_version >= "3.10" and python_version < "3.12"
+ httpcore==1.0.7 ; python_version >= "3.10" and python_version < "3.12"
+ httpx==0.28.1 ; python_version >= "3.10" and python_version < "3.12"
+ huggingface-hub==0.29.1 ; python_version >= "3.10" and python_version < "3.12"
+ humanfriendly==10.0 ; python_version >= "3.10" and python_version < "3.12"
+ hyperpyyaml==1.2.2 ; python_version >= "3.10" and python_version < "3.12"
+ idna==3.10 ; python_version >= "3.10" and python_version < "3.12"
+ importlib-resources==6.5.2 ; python_version >= "3.10" and python_version < "3.12"
+ jinja2==3.1.5 ; python_version >= "3.10" and python_version < "3.12"
+ joblib==1.4.2 ; python_version >= "3.10" and python_version < "3.12"
+ julius==0.2.7 ; python_version >= "3.10" and python_version < "3.12"
+ kiwisolver==1.4.8 ; python_version >= "3.10" and python_version < "3.12"
+ lightning-utilities==0.12.0 ; python_version >= "3.10" and python_version < "3.12"
+ lightning==2.5.0.post0 ; python_version >= "3.10" and python_version < "3.12"
+ lxml==5.3.1 ; python_version >= "3.10" and python_version < "3.12"
+ mako==1.3.9 ; python_version >= "3.10" and python_version < "3.12"
+ markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "3.12"
+ markupsafe==2.1.5 ; python_version >= "3.10" and python_version < "3.12"
+ matplotlib==3.10.1 ; python_version >= "3.10" and python_version < "3.12"
+ mdurl==0.1.2 ; python_version >= "3.10" and python_version < "3.12"
+ mpmath==1.3.0 ; python_version >= "3.10" and python_version < "3.12"
+ multidict==6.1.0 ; python_version >= "3.10" and python_version < "3.12"
+ networkx==3.4.2 ; python_version >= "3.10" and python_version < "3.12"
+ nltk==3.9.1 ; python_version >= "3.10" and python_version < "3.12"
+ numpy==1.26.4 ; python_version >= "3.10" and python_version < "3.12"
+ nvidia-cublas-cu12==12.4.5.8 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-cuda-cupti-cu12==12.4.127 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-cuda-nvrtc-cu12==12.4.127 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-cuda-runtime-cu12==12.4.127 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-cudnn-cu12==9.1.0.70 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-cufft-cu12==11.2.1.3 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-curand-cu12==10.3.5.147 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-cusolver-cu12==11.6.1.9 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-cusparse-cu12==12.3.1.170 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-cusparselt-cu12==0.6.2 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-nccl-cu12==2.21.5 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-nvjitlink-cu12==12.4.127 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ nvidia-nvtx-cu12==12.4.127 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ omegaconf==2.3.0 ; python_version >= "3.10" and python_version < "3.12"
+ onnxruntime==1.20.1 ; python_version >= "3.10" and python_version < "3.12"
+ optuna==4.2.1 ; python_version >= "3.10" and python_version < "3.12"
+ orjson==3.10.15 ; python_version >= "3.10" and python_version < "3.12"
+ packaging==24.2 ; python_version >= "3.10" and python_version < "3.12"
+ pandas==2.2.3 ; python_version >= "3.10" and python_version < "3.12"
+ pillow==10.4.0 ; python_version >= "3.10" and python_version < "3.12"
+ primepy==1.3 ; python_version >= "3.10" and python_version < "3.12"
+ propcache==0.3.0 ; python_version >= "3.10" and python_version < "3.12"
+ protobuf==5.29.3 ; python_version >= "3.10" and python_version < "3.12"
+ pyannote-audio==3.3.2 ; python_version >= "3.10" and python_version < "3.12"
+ pyannote-core==5.0.0 ; python_version >= "3.10" and python_version < "3.12"
+ pyannote-database==5.1.3 ; python_version >= "3.10" and python_version < "3.12"
+ pyannote-metrics==3.2.1 ; python_version >= "3.10" and python_version < "3.12"
+ pyannote-pipeline==3.0.1 ; python_version >= "3.10" and python_version < "3.12"
+ pycparser==2.22 ; python_version >= "3.10" and python_version < "3.12"
+ pydantic-core==2.27.2 ; python_version >= "3.10" and python_version < "3.12"
+ pydantic==2.10.6 ; python_version >= "3.10" and python_version < "3.12"
+ pydub==0.25.1 ; python_version >= "3.10" and python_version < "3.12"
+ pygments==2.19.1 ; python_version >= "3.10" and python_version < "3.12"
+ pyparsing==3.2.1 ; python_version >= "3.10" and python_version < "3.12"
+ pyreadline3==3.5.4 ; python_version >= "3.10" and python_version < "3.12" and sys_platform == "win32"
+ python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "3.12"
+ python-docx==1.1.2 ; python_version >= "3.10" and python_version < "3.12"
+ python-dotenv==1.0.1 ; python_version >= "3.10" and python_version < "3.12"
+ python-multipart==0.0.20 ; python_version >= "3.10" and python_version < "3.12"
+ pytorch-lightning==2.5.0.post0 ; python_version >= "3.10" and python_version < "3.12"
+ pytorch-metric-learning==2.8.1 ; python_version >= "3.10" and python_version < "3.12"
+ pytz==2025.1 ; python_version >= "3.10" and python_version < "3.12"
+ pyyaml==6.0.2 ; python_version >= "3.10" and python_version < "3.12"
+ regex==2024.11.6 ; python_version >= "3.10" and python_version < "3.12"
+ requests==2.32.3 ; python_version >= "3.10" and python_version < "3.12"
+ rich==13.9.4 ; python_version >= "3.10" and python_version < "3.12"
+ ruamel-yaml-clib==0.2.12 ; python_version >= "3.10" and python_version < "3.12" and platform_python_implementation == "CPython"
+ ruamel-yaml==0.18.10 ; python_version >= "3.10" and python_version < "3.12"
+ ruff==0.9.9 ; python_version >= "3.10" and python_version < "3.12" and sys_platform != "emscripten"
+ safetensors==0.5.3 ; python_version >= "3.10" and python_version < "3.12"
+ scikit-learn==1.6.1 ; python_version >= "3.10" and python_version < "3.12"
+ scipy==1.15.2 ; python_version >= "3.10" and python_version < "3.12"
+ semantic-version==2.10.0 ; python_version >= "3.10" and python_version < "3.12"
+ semver==3.0.4 ; python_version >= "3.10" and python_version < "3.12"
+ sentencepiece==0.2.0 ; python_version >= "3.10" and python_version < "3.12"
+ setuptools==75.8.2 ; python_version >= "3.10" and python_version < "3.12"
+ shellingham==1.5.4 ; python_version >= "3.10" and python_version < "3.12"
+ six==1.17.0 ; python_version >= "3.10" and python_version < "3.12"
+ sniffio==1.3.1 ; python_version >= "3.10" and python_version < "3.12"
+ sortedcontainers==2.4.0 ; python_version >= "3.10" and python_version < "3.12"
+ soundfile==0.13.1 ; python_version >= "3.10" and python_version < "3.12"
+ speechbrain==1.0.2 ; python_version >= "3.10" and python_version < "3.12"
+ sqlalchemy==2.0.38 ; python_version >= "3.10" and python_version < "3.12"
+ starlette==0.46.0 ; python_version >= "3.10" and python_version < "3.12"
+ sympy==1.13.1 ; python_version >= "3.10" and python_version < "3.12"
+ tabulate==0.9.0 ; python_version >= "3.10" and python_version < "3.12"
+ tensorboardx==2.6.2.2 ; python_version >= "3.10" and python_version < "3.12"
+ threadpoolctl==3.5.0 ; python_version >= "3.10" and python_version < "3.12"
+ tokenizers==0.21.0 ; python_version >= "3.10" and python_version < "3.12"
+ tomlkit==0.12.0 ; python_version >= "3.10" and python_version < "3.12"
+ torch-audiomentations==0.12.0 ; python_version >= "3.10" and python_version < "3.12"
+ torch-pitch-shift==1.2.5 ; python_version >= "3.10" and python_version < "3.12"
+ torch==2.6.0 ; python_version >= "3.10" and python_version < "3.12"
+ torchaudio==2.6.0 ; python_version >= "3.10" and python_version < "3.12"
+ torchmetrics==1.6.1 ; python_version >= "3.10" and python_version < "3.12"
+ tqdm==4.67.1 ; python_version >= "3.10" and python_version < "3.12"
+ transformers==4.49.0 ; python_version >= "3.10" and python_version < "3.12"
+ triton==3.2.0 ; python_version >= "3.10" and python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64"
+ typer==0.15.2 ; python_version >= "3.10" and python_version < "3.12"
+ typing-extensions==4.12.2 ; python_version >= "3.10" and python_version < "3.12"
+ tzdata==2025.1 ; python_version >= "3.10" and python_version < "3.12"
+ urllib3==2.3.0 ; python_version >= "3.10" and python_version < "3.12"
+ uvicorn==0.34.0 ; python_version >= "3.10" and python_version < "3.12" and sys_platform != "emscripten"
+ websockets==12.0 ; python_version >= "3.10" and python_version < "3.12"
+ whisperx==3.3.1 ; python_version >= "3.10" and python_version < "3.12"
+ yarl==1.18.3 ; python_version >= "3.10" and python_version < "3.12"
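
The new requirements are fully pinned with environment markers (a lockfile-style export) targeting Python 3.10-3.11; installing them into a matching interpreter with `pip install -r requirements.txt` should reproduce the Space's environment, including the CUDA 12 wheels on Linux x86_64.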