imseldrith committed
Commit ef251c2
1 Parent(s): 9a2e8e8

Upload folder using huggingface_hub (#5)

- Upload folder using huggingface_hub (5fc1e76a65e9e1f3bb413c0a6d1540973637ad01)

.github/workflows/build-docker.yml CHANGED
@@ -26,6 +26,15 @@ jobs:
       - name: Check out code
         uses: actions/checkout@v4
 
+      - name: Free Disk Space Before Build
+        run: |
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
@@ -69,7 +78,7 @@ jobs:
           labels: version=${{ github.run_id }}
           platforms: linux/amd64,linux/arm64
 
-  build-and-push-min-image:
+  build-and-push-image-min:
     runs-on: ubuntu-latest
 
     permissions:
@@ -86,6 +95,15 @@ jobs:
       - name: Check out code
         uses: actions/checkout@v4
 
+      - name: Free Disk Space Before Build
+        run: |
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
@@ -129,7 +147,7 @@ jobs:
           labels: version=${{ github.run_id }}
           platforms: linux/amd64,linux/arm64
 
-  build-and-push-rocm-image:
+  build-and-push-image-rocm:
     runs-on: ubuntu-latest
 
     permissions:
@@ -147,6 +165,15 @@ jobs:
      - name: Check out code
        uses: actions/checkout@v4
 
+      - name: Free Disk Space Before Build
+        run: |
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
         with:
@@ -193,4 +220,3 @@ jobs:
           platforms: linux/amd64,linux/arm64
           build-args: |
             USE_ROCM=1
-
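A minimal sketch of checking how much space the new cleanup step reclaims on an ubuntu-latest runner, using the same paths as the workflow; the df calls are an illustrative addition, not part of this commit:

  df -h /                      # free space before cleanup
  sudo rm -rf /usr/local/.ghcup /opt/hostedtoolcache/CodeQL /usr/local/lib/android \
              /usr/share/dotnet /opt/ghc /usr/local/share/boost
  df -h /                      # free space after cleanup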
 
Dockerfile.min ADDED
@@ -0,0 +1,26 @@
+FROM python:3.11-slim
+
+ARG TARGETPLATFORM
+RUN <<EOF
+apt-get update
+apt-get install --no-install-recommends -y curl ffmpeg
+if [ "$TARGETPLATFORM" != "linux/amd64" ]; then
+    apt-get install --no-install-recommends -y build-essential
+    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+fi
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+EOF
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+WORKDIR /app
+RUN mkdir -p voices config
+
+COPY requirements*.txt /app/
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt
+COPY *.py *.sh *.default.yaml README.md LICENSE /app/
+
+ENV TTS_HOME=voices
+ENV HF_HOME=voices
+
+CMD bash startup.min.sh
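A minimal sketch of building and running the slim image locally. BuildKit is required for the RUN <<EOF heredoc and the pip cache mount; the image tag and published port below are illustrative assumptions, not defined by this commit:

  DOCKER_BUILDKIT=1 docker build -f Dockerfile.min -t openedai-speech:min .
  docker run --rm --env-file sample.env -p 8000:8000 openedai-speech:min
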
pre_process_map.default.yaml CHANGED
@@ -31,7 +31,6 @@
    - ' F.Y. '
 - - ([0-9]+)-([0-9]+)
   - \1 to \2
-- - '\*\*\*'
-  - '*'
-- - '\*\*'
-  - '*'
+# xtts has a lot of trouble with these, but piper is fine.
+#- - '[\*=+-]+'
+#  - ' '
 
requirements-min.txt CHANGED
@@ -2,4 +2,5 @@ fastapi
 uvicorn
 loguru
 numpy<2
-piper-tts
+piper-tts
+pyyaml

requirements-rocm.txt CHANGED
@@ -2,8 +2,9 @@ fastapi
 uvicorn
 loguru
 piper-tts
-coqui-tts
+coqui-tts[languages]
 langdetect
+pyyaml
 # Creating an environment where deepspeed works is complex, for now it will be disabled by default.
 #deepspeed
 torch; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"

requirements.txt CHANGED
@@ -4,6 +4,7 @@ loguru
 piper-tts
 coqui-tts[languages]
 langdetect
+pyyaml
 # Creating an environment where deepspeed works is complex, for now it will be disabled by default.
 #deepspeed
 
 
sample.env ADDED
@@ -0,0 +1,6 @@
+TTS_HOME=voices
+HF_HOME=voices
+#PRELOAD_MODEL=xtts
+#PRELOAD_MODEL=xtts_v2.0.2
+#EXTRA_ARGS=--log-level DEBUG --unload-timer 300
+#USE_ROCM=1
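One way to wire the new sample.env into a container run; the .env copy step and image name are illustrative, not part of this commit:

  cp sample.env .env                       # then uncomment PRELOAD_MODEL / EXTRA_ARGS as needed
  docker run --rm --env-file .env openedai-speech:min
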
speech.py CHANGED
@@ -10,6 +10,7 @@ import sys
 import threading
 import time
 import yaml
+import json
 
 from fastapi.responses import StreamingResponse
 from loguru import logger
@@ -84,13 +85,15 @@ class xtts_wrapper():
         self.timer.daemon = True
         self.timer.start()
 
-    def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
+    def tts(self, text, language, audio_path, **hf_generate_kwargs):
         with torch.no_grad():
             self.last_used = time.time()
             tokens = 0
             try:
                 with self.lock:
-                    gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # not worth caching calls, it's < 0.001s after model is loaded
+                    logger.debug(f"generating [{language}]: {[text]}")
+
+                    gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=audio_path) # not worth caching calls, it's < 0.001s after model is loaded
                     pcm_stream = self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs)
                     self.last_used = time.time()
 
@@ -230,7 +233,15 @@ async def generate_speech(request: GenerateSpeechRequest):
         tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
         tts_proc.stdin.close()
 
-        ffmpeg_args = build_ffmpeg_args(response_format, input_format="s16le", sample_rate="22050")
+        try:
+            with open(f"{piper_model}.json", 'r') as pvc_f:
+                conf = json.load(pvc_f)
+                sample_rate = str(conf['audio']['sample_rate'])
+
+        except:
+            sample_rate = '22050'
+
+        ffmpeg_args = build_ffmpeg_args(response_format, input_format="s16le", sample_rate=sample_rate)
 
         # Pipe the output from piper/xtts to the input of ffmpeg
         ffmpeg_args.extend(["-"])
@@ -308,6 +319,21 @@ async def generate_speech(request: GenerateSpeechRequest):
     in_q = queue.Queue() # speech pcm
     ex_q = queue.Queue() # exceptions
 
+    def get_speaker_samples(samples: str) -> list[str]:
+        if os.path.isfile(samples):
+            audio_path = [samples]
+        elif os.path.isdir(samples):
+            audio_path = [os.path.join(samples, sample) for sample in os.listdir(samples) if os.path.isfile(os.path.join(samples, sample))]
+
+            if len(audio_path) < 1:
+                logger.error(f"No files found: {samples}")
+                raise ServiceUnavailableError(f"Invalid path: {samples}")
+        else:
+            logger.error(f"Invalid path: {samples}")
+            raise ServiceUnavailableError(f"Invalid path: {samples}")
+
+        return audio_path
+
     def exception_check(exq: queue.Queue):
         try:
             e = exq.get_nowait()
@@ -318,9 +344,13 @@ async def generate_speech(request: GenerateSpeechRequest):
 
     def generator():
         # text -> in_q
+
+        audio_path = get_speaker_samples(speaker)
+        logger.debug(f"{voice} wav samples: {audio_path}")
+
         try:
             for text in all_text:
-                for chunk in xtts.tts(text=text, language=language, speaker_wav=speaker, **hf_generate_kwargs):
+                for chunk in xtts.tts(text=text, language=language, audio_path=audio_path, **hf_generate_kwargs):
                     exception_check(ex_q)
                     in_q.put(chunk)
 
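The ffmpeg input sample rate for piper voices is now read from the voice's own JSON config instead of the hard-coded 22050. A quick way to inspect what a given voice declares, assuming jq is available (the voice path below is illustrative):

  jq '.audio.sample_rate' voices/en_US-libritts_r-medium.onnx.json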
 
startup.min.sh CHANGED
@@ -4,4 +4,4 @@
 
 bash download_voices_tts-1.sh
 
-python speech.py --xtts_device none $EXTRA_ARGS $@ -P 7860
+python speech.py --xtts_device none $EXTRA_ARGS $@

startup.sh CHANGED
@@ -7,4 +7,4 @@ echo "First startup may download 2GB of speech models. Please wait."
 bash download_voices_tts-1.sh
 bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
 
-python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $EXTRA_ARGS $@ -P 7860
+python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $EXTRA_ARGS $@
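With the hard-coded "-P 7860" dropped, both startup scripts now pass their arguments straight through to speech.py, so the port (and any other flag) can still be chosen per invocation; the port values below are just examples:

  bash startup.sh -P 7860
  EXTRA_ARGS="--log-level DEBUG" bash startup.min.sh -P 8000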