hans00 committed on
Commit
ca494e8
·
unverified ·
1 Parent(s): bfaeb2a

Alias whisper to whisperx

Browse files
Files changed (6) hide show
  1. .gitignore +5 -1
  2. alias.py +116 -0
  3. app.py +4 -2
  4. pyproject.toml +1 -3
  5. requirements.txt +2 -1
  6. uv.lock +0 -0
.gitignore CHANGED
@@ -1 +1,5 @@
1
- .venv
 
 
 
 
 
1
+ .venv
2
+ __pycache__
3
+ .DS_Store
4
+ .gradio
5
+ venv
alias.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Alias module to redirect whisper imports to whisperx.
This allows OuteTTS to use whisperx instead of the standard whisper package.

Importing this module registers a stand-in ``whisper`` module in
``sys.modules`` (see ``setup_whisper_alias``), so it has to be imported
before any code that executes ``import whisper``.
"""

import importlib.machinery
import importlib.util
import os
import sys
import types


def resolve_device(device=None):
    """Translate a whisper-style device argument into ``(backend, index)``.

    Args:
        device: ``None``/``"auto"`` to auto-detect, a string such as
            ``"cpu"``, ``"cuda"`` or ``"cuda:1"``, or any object whose
            ``str()`` has that form (e.g. a ``torch.device``).

    Returns:
        A tuple ``(backend, index)`` where ``backend`` is ``"cuda"`` or
        ``"cpu"`` and ``index`` is the GPU ordinal as an ``int``, or ``None``
        when no ordinal was given. Any backend other than CUDA maps to
        ``"cpu"``.
    """
    label = "" if device is None else str(device).strip().lower()

    if label in ("", "auto"):
        # No explicit choice: use CUDA only when torch reports it as usable.
        try:
            import torch

            has_cuda = bool(torch.cuda.is_available())
        except Exception:
            has_cuda = False
        return ("cuda" if has_cuda else "cpu"), None

    backend, _, ordinal = label.partition(":")
    if backend != "cuda":
        return "cpu", None
    return "cuda", (int(ordinal) if ordinal.isdigit() else None)


def normalize_result(result, word_timestamps=False):
    """Make a transcription result dict carry the keys whisper callers read.

    Args:
        result: Dict produced by transcription/alignment; mutated in place.
        word_timestamps: When true, every segment is guaranteed a ``"words"``
            list (empty if none is present).

    Returns:
        The same ``result`` dict, with ``"segments"`` and ``"text"`` present.
    """
    segments = result.get("segments") or []
    result["segments"] = segments

    if "text" not in result:
        # Strip each piece first so the join does not produce doubled or
        # leading/trailing spaces.
        pieces = (segment.get("text", "").strip() for segment in segments)
        result["text"] = " ".join(piece for piece in pieces if piece)

    if word_timestamps:
        for segment in segments:
            segment.setdefault("words", [])

    return result


class WhisperXModelWrapper:
    """Wrapper to make a whisperx model compatible with the whisper interface."""

    def __init__(self, whisperx, model, device):
        """Store the whisperx module, the loaded model and the device string.

        Args:
            whisperx: The imported ``whisperx`` module.
            model: Object returned by ``whisperx.load_model``.
            device: Device string handed to the alignment calls.
        """
        self._whisperx = whisperx
        self.model = model
        self.device = device
        # language code -> (alignment model, metadata); avoids reloading the
        # alignment model on every transcribe() call.
        self._align_models = {}

    def _get_align_model(self, language):
        """Return the cached ``(model, metadata)`` pair for ``language``."""
        if language not in self._align_models:
            self._align_models[language] = self._whisperx.load_align_model(
                language_code=language,
                device=self.device,
            )
        return self._align_models[language]

    def transcribe(self, audio, **kwargs):
        """Transcribe audio with a whisper-compatible interface.

        Args:
            audio: File path (``str`` or ``os.PathLike``) or already-loaded
                audio data, which is passed through unchanged.
            **kwargs: Recognised keys are ``word_timestamps`` (run alignment
                and add per-segment ``"words"``), ``batch_size`` (default 16)
                and ``language``. Other keys are ignored.

        Returns:
            Result dict containing at least ``"segments"`` and ``"text"``.
        """
        want_words = bool(kwargs.get("word_timestamps", False))

        # Load audio if it's a file path
        if isinstance(audio, (str, os.PathLike)):
            audio_data = self._whisperx.load_audio(os.fspath(audio))
        else:
            audio_data = audio

        options = {"batch_size": kwargs.get("batch_size", 16)}
        if kwargs.get("language"):
            options["language"] = kwargs["language"]
        result = self.model.transcribe(audio_data, **options)

        # If word timestamps are requested, perform alignment
        if want_words and result.get("segments"):
            language = result.get("language") or "en"
            try:
                align_model, metadata = self._get_align_model(language)
                aligned = self._whisperx.align(
                    result["segments"],
                    align_model,
                    metadata,
                    audio_data,
                    self.device,
                    return_char_alignments=False,
                )
                # Carry the detected language over to the aligned result so
                # it is not lost when the dict is replaced.
                aligned.setdefault("language", language)
                result = aligned
            except Exception as e:
                # Best effort: fall through with the unaligned result.
                print(f"Warning: Could not perform alignment: {e}")

        return normalize_result(result, want_words)


def build_whisper_module(whisperx):
    """Create the stand-in ``whisper`` module backed by ``whisperx``.

    A real ``types.ModuleType`` with a ``__spec__`` is used so that
    ``importlib.util.find_spec("whisper")`` keeps working once the alias is
    registered in ``sys.modules``.

    Args:
        whisperx: The imported ``whisperx`` module.

    Returns:
        A module object exposing ``load_model`` and ``model``.
    """
    module = types.ModuleType("whisper")
    module.__doc__ = "Alias of whisperx exposing a whisper-like interface."
    module.__spec__ = importlib.machinery.ModuleSpec("whisper", loader=None)
    module.model = getattr(whisperx, "WhisperModel", None)

    def load_model(name, device=None, **kwargs):
        """Load model with whisperx compatible interface."""
        backend, index = resolve_device(device)
        options = {
            "device": backend,
            "compute_type": "float16" if backend == "cuda" else "int8",
        }
        if index is not None:
            options["device_index"] = index
        if kwargs.get("download_root") is not None:
            options["download_root"] = kwargs["download_root"]

        model = whisperx.load_model(name, **options)

        align_device = backend if index is None else f"{backend}:{index}"
        return WhisperXModelWrapper(whisperx, model, align_device)

    module.load_model = load_model
    return module


def setup_whisper_alias():
    """Setup alias so that 'import whisper' uses whisperx instead.

    Does nothing except print a warning when whisperx is not installed or
    the alias cannot be created; it never raises.
    """
    try:
        # Check if whisperx is available
        if importlib.util.find_spec("whisperx") is None:
            print("Warning: whisperx not found, falling back to regular whisper")
            return

        import whisperx

        # Add to sys.modules so 'import whisper' uses our alias
        sys.modules['whisper'] = build_whisper_module(whisperx)

        print("✅ Successfully aliased whisper to whisperx")

    except ImportError as e:
        print(f"Warning: Could not setup whisper alias: {e}")
        print("Falling back to regular whisper (if available)")
    except Exception as e:
        print(f"Warning: Error setting up whisper alias: {e}")


# Auto-setup when module is imported
setup_whisper_alias()
app.py CHANGED
@@ -1,4 +1,6 @@
1
  import gradio as gr
 
 
2
  import outetts
3
  import json
4
  import tempfile
@@ -14,7 +16,7 @@ def initialize_interface(model_name: str):
14
  config = outetts.ModelConfig.auto_config(
15
  model=model,
16
  backend=outetts.Backend.LLAMACPP,
17
- quantization=outetts.LlamaCppQuantization.Q8_0,
18
  )
19
 
20
  # Initialize the interface
@@ -30,7 +32,7 @@ def create_speaker_and_generate(model_name, audio_file, test_text="", temperatur
30
  interface = initialize_interface(model_name)
31
 
32
  # Create speaker profile from audio
33
- speaker = interface.create_speaker(audio_file)
34
 
35
  # Convert speaker dict to formatted JSON
36
  speaker_json = json.dumps(speaker, indent=2, ensure_ascii=False)
 
1
  import gradio as gr
2
+ # Import alias module before outetts to setup whisper redirection
3
+ import alias
4
  import outetts
5
  import json
6
  import tempfile
 
16
  config = outetts.ModelConfig.auto_config(
17
  model=model,
18
  backend=outetts.Backend.LLAMACPP,
19
+ quantization=outetts.LlamaCppQuantization.Q5_0,
20
  )
21
 
22
  # Initialize the interface
 
32
  interface = initialize_interface(model_name)
33
 
34
  # Create speaker profile from audio
35
+ speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo")
36
 
37
  # Convert speaker dict to formatted JSON
38
  speaker_json = json.dumps(speaker, indent=2, ensure_ascii=False)
pyproject.toml CHANGED
@@ -8,7 +8,5 @@ dependencies = [
8
  "gradio>=5.35.0",
9
  "numba==0.61.2",
10
  "outetts",
 
11
  ]
12
-
13
- [tool.uv.sources]
14
- outetts = { git = "https://github.com/edwko/OuteTTS.git" }
 
8
  "gradio>=5.35.0",
9
  "numba==0.61.2",
10
  "outetts",
11
+ "whisperx>=3.4.2",
12
  ]
 
 
 
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
2
  llama-cpp-python
 
3
  numba==0.61.2
4
- outetts==0.4.4
5
  gradio
 
1
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
2
  llama-cpp-python
3
+ whisperx
4
  numba==0.61.2
5
+ outetts>=0.4.4
6
  gradio
uv.lock CHANGED
The diff for this file is too large to render. See raw diff