MatteoScript committed on
Commit
697538e
·
verified ·
1 Parent(s): e9077b8

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +168 -3
main.py CHANGED
@@ -3,7 +3,7 @@ import time
3
  import random
4
  import asyncio
5
  import json
6
- from fastapi import FastAPI, HTTPException, Depends, File, UploadFile, Form
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from fastapi.security.api_key import APIKeyHeader
9
  from pydantic import BaseModel
@@ -16,6 +16,11 @@ import io
16
  import copy
17
  from pathlib import Path
18
  from pydub import AudioSegment
 
 
 
 
 
19
 
20
  load_dotenv()
21
 
@@ -60,10 +65,14 @@ app.add_middleware(
60
  allow_headers=["*"],
61
  )
62
 
 
 
 
 
63
  # Client OpenAI
64
def get_openai_client():
    """Build an OpenAI client using an API key picked at random.

    Choosing the key at random from ``API_KEYS`` is meant to work around
    "Quota Exceeded" limits.
    """
    return OpenAI(api_key=random.choice(API_KEYS), base_url=BASE_URL)
68
 
69
  # Validazione API
@@ -299,7 +308,6 @@ def _transcribe_chunk(chunk_bytes: bytes,
299
  return resp.text
300
  return resp.get("text", "")
301
 
302
-
303
def get_whisper_client():
    """Build an OpenAI-compatible client for GROQ with a randomly chosen key."""
    return OpenAI(api_key=random.choice(GROQ_API_KEYS), base_url=GROQ_BASE_URL)
@@ -322,6 +330,133 @@ def call_whisper_api(audio_file: io.BytesIO,
322
  return call_whisper_api(audio_file, model, language, response_format)
323
  raise e
324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  # ---------------------------------- Metodi API ---------------------------------------
326
  @app.get("/")
327
  def read_general():
@@ -368,6 +503,36 @@ async def audio_transcriptions_endpoint(
368
  except Exception as e:
369
  raise HTTPException(status_code=500, detail=str(e))
370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 8000, with auto-reload enabled.
    import uvicorn

    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )
 
3
  import random
4
  import asyncio
5
  import json
6
+ from fastapi import FastAPI, HTTPException, Depends, File, UploadFile, Form, Request
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from fastapi.security.api_key import APIKeyHeader
9
  from pydantic import BaseModel
 
16
  import copy
17
  from pathlib import Path
18
  from pydub import AudioSegment
19
+ import base64, uuid, mimetypes
20
+ import struct
21
+ from google import genai
22
+ from google.genai import types
23
+ import re
24
 
25
  load_dotenv()
26
 
 
65
  allow_headers=["*"],
66
  )
67
 
68
+ # Api Key GEMINI (Random della lista in modo da averne di più)
69
def get_gemini_apikey():
    """Return one Gemini API key chosen at random from ``API_KEYS``."""
    selected_key = random.choice(API_KEYS)
    return selected_key
71
+
72
  # Client OpenAI
73
def get_openai_client():
    """Build an OpenAI client using an API key picked at random.

    Rotating the key on every call is meant to work around
    "Quota Exceeded" limits.
    """
    return OpenAI(api_key=get_gemini_apikey(), base_url=BASE_URL)
77
 
78
  # Validazione API
 
308
  return resp.text
309
  return resp.get("text", "")
310
 
 
311
def get_whisper_client():
    """Build an OpenAI-compatible client for GROQ with a randomly chosen key."""
    return OpenAI(api_key=random.choice(GROQ_API_KEYS), base_url=GROQ_BASE_URL)
 
330
  return call_whisper_api(audio_file, model, language, response_format)
331
  raise e
332
 
333
class SpeechRequest(BaseModel):
    """Request body of the ``/v1/audio/speech`` endpoint."""

    # Name of the TTS model handed to generate_audio().
    model: Optional[str] = "gemini-2.5-flash-preview-tts"
    # Text to synthesise (the only required field).
    input: str
    # Voice name(s); the endpoint splits this on ';', ',' or '|' and uses the
    # first two entries as speaker 1 and speaker 2.
    voice: Optional[str] = "Kore"
    # NOTE(review): not read by the endpoint visible in this file.
    speed: Optional[float] = 1.0
    # Target audio format passed to convert_format().
    response_format: Optional[str] = "wav"

    class Config:
        # Accept fields that are not declared above.
        extra = "allow"
341
+
342
class SpeechResponse(BaseModel):
    """Response schema declared as ``response_model`` of the speech endpoint."""

    # Model that produced the audio.
    model: str
    # Format of the returned audio.
    response_format: str
    # Voice used for the synthesis.
    voice: str
    # Audio payload as a string (presumably base64-encoded -- TODO confirm).
    audio: str
347
+
348
def convert_format(audio_bytes: bytes, from_fmt: str, to_fmt: str) -> bytes:
    """Convert audio bytes from ``from_fmt`` to ``to_fmt`` using pydub/ffmpeg.

    Supported target names are mp3, wav, opus, flac, aac and pcm (raw
    little-endian 16-bit). Format names are compared case-insensitively.

    Args:
        audio_bytes: Encoded audio in ``from_fmt``.
        from_fmt: Name of the source format (e.g. ``"wav"``).
        to_fmt: Name of the requested output format.

    Returns:
        The re-encoded audio, or ``audio_bytes`` unchanged when both formats
        are the same.
    """
    # User-facing names that differ from the name pydub/ffmpeg expects:
    # "pcm" is pydub's header-less "raw" export, and ffmpeg has no "aac"
    # muxer -- AAC streams are written with the "adts" muxer.
    export_aliases = {"pcm": "raw", "aac": "adts"}

    source_fmt = from_fmt.strip().lower()
    target_fmt = to_fmt.strip().lower()
    if source_fmt == target_fmt:
        return audio_bytes

    audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format=source_fmt)
    buf = io.BytesIO()
    audio.export(buf, format=export_aliases.get(target_fmt, target_fmt))
    return buf.getvalue()
363
+
364
def parse_audio_mime_type(mime_type: str) -> dict[str, int]:
    """Parse bits per sample and sample rate from an audio MIME type string.

    Recognises an ``audio/L<bits>`` media type and a ``rate=<hz>`` parameter,
    e.g. ``"audio/L16;rate=24000"``. Matching is case-insensitive. Anything
    missing, malformed or non-positive falls back to the defaults
    (16 bits, 24000 Hz).

    Args:
        mime_type: MIME type string; ``None`` or empty yields the defaults.

    Returns:
        A dict with the integer keys ``"bits_per_sample"`` and ``"rate"``.
    """
    bits_per_sample = 16
    rate = 24000
    bits_prefix = "audio/l"

    for raw_param in (mime_type or "").split(";"):
        param = raw_param.strip()
        lowered = param.lower()
        if lowered.startswith("rate="):
            try:
                parsed_rate = int(param.split("=", 1)[1])
            except ValueError:
                continue  # Keep the default rate.
            # A zero or negative rate cannot be written into a WAV header.
            if parsed_rate > 0:
                rate = parsed_rate
        elif lowered.startswith(bits_prefix):
            try:
                parsed_bits = int(param[len(bits_prefix):])
            except ValueError:
                continue  # Keep the default bit depth.
            if parsed_bits > 0:
                bits_per_sample = parsed_bits

    return {"bits_per_sample": bits_per_sample, "rate": rate}
383
+
384
def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """Wrap raw PCM samples in a WAV container.

    The bit depth and sample rate are taken from ``mime_type`` via
    ``parse_audio_mime_type``; the audio is always written as mono.

    Args:
        audio_data: Raw PCM sample bytes.
        mime_type: MIME type string describing ``audio_data``.

    Returns:
        A 44-byte RIFF/WAVE header followed by ``audio_data``.
    """
    fmt = parse_audio_mime_type(mime_type)
    sample_rate = fmt["rate"]
    bits_per_sample = fmt["bits_per_sample"]

    channels = 1
    payload_len = len(audio_data)
    frame_bytes = channels * (bits_per_sample // 8)  # BlockAlign

    riff_header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + payload_len,            # ChunkSize: total file size minus 8 bytes
        b"WAVE",
        b"fmt ",
        16,                          # Subchunk1Size (16 for PCM)
        1,                           # AudioFormat (1 = PCM)
        channels,
        sample_rate,
        sample_rate * frame_bytes,   # ByteRate
        frame_bytes,
        bits_per_sample,
        b"data",
        payload_len,                 # Subchunk2Size: size of the audio data
    )
    return riff_header + audio_data
412
+
413
+ # Generazione Audio
414
def _speaker_voice_config(speaker: str, voice_name: str):
    """Build the voice configuration that maps one speaker label to a prebuilt voice."""
    return types.SpeakerVoiceConfig(
        speaker=speaker,
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name)
        ),
    )


def generate_audio(model: str,
                   content: str,
                   speaker1: str = "Kore",
                   speaker2: str = "Schedar") -> bytes:
    """Return the WAV bytes generated by Gemini TTS (multi-speaker).

    Args:
        model: Name of the Gemini TTS model.
        content: Text to synthesise.
        speaker1: Prebuilt voice name assigned to "Speaker 1".
        speaker2: Prebuilt voice name assigned to "Speaker 2".

    Returns:
        The generated audio. Raw PCM output (a MIME type with no known file
        extension) is collected across all streamed chunks and wrapped in a
        single WAV header; audio that already arrives in a recognised
        container format is returned as received.

    Raises:
        RuntimeError: If the stream ends without delivering any audio data.
    """
    client = genai.Client(api_key=get_gemini_apikey())
    contents = [types.Content(role="user", parts=[types.Part.from_text(text=content)])]
    cfg = types.GenerateContentConfig(
        temperature=1,
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                speaker_voice_configs=[
                    _speaker_voice_config("Speaker 1", speaker1),
                    _speaker_voice_config("Speaker 2", speaker2),
                ]
            ),
        ),
    )

    pcm_chunks: list[bytes] = []
    pcm_mime_type = ""
    for chunk in client.models.generate_content_stream(
        model=model, contents=contents, config=cfg
    ):
        # Streamed chunks may carry no candidate/content (e.g. metadata only);
        # skip them instead of failing on the attribute/index access.
        candidates = chunk.candidates
        if not candidates or candidates[0].content is None or not candidates[0].content.parts:
            continue
        inline = candidates[0].content.parts[0].inline_data
        if not (inline and inline.data):
            continue

        mime_type = inline.mime_type or ""
        if mimetypes.guess_extension(mime_type) is not None:
            # Already a self-contained audio file: return it as received.
            return inline.data

        # Raw PCM: keep collecting, so audio delivered over several chunks is
        # not truncated to its first chunk.
        if not pcm_chunks:
            pcm_mime_type = mime_type
        pcm_chunks.append(inline.data)

    if pcm_chunks:
        return convert_to_wav(b"".join(pcm_chunks), pcm_mime_type)
    raise RuntimeError("Nessun dato audio ricevuto")
458
+
459
+
460
  # ---------------------------------- Metodi API ---------------------------------------
461
  @app.get("/")
462
  def read_general():
 
503
  except Exception as e:
504
  raise HTTPException(status_code=500, detail=str(e))
505
 
506
@app.post("/v1/audio/speech", dependencies=[Depends(verify_api_key)],
          response_model=SpeechResponse)
async def audio_speech_endpoint(req: SpeechRequest, request: Request):
    """Synthesise speech for ``req.input`` and return it as a binary download.

    ``req.voice`` may hold one or two voice names separated by ';', ',' or
    '|'; a missing first voice falls back to "Kore" and a missing second one
    to "Schedar". The audio is generated as WAV and then converted to
    ``req.response_format``.

    Args:
        req: Parsed request body.
        request: Raw request object (currently unused).

    Returns:
        A ``StreamingResponse`` carrying the encoded audio bytes.

    Raises:
        HTTPException: With status 500 if generation or conversion fails.
    """
    # Imported locally so the handler does not rely on a module-level import
    # of StreamingResponse being present.
    from fastapi.responses import StreamingResponse

    try:
        # The request fields are Optional, so an explicit null must fall back
        # to the defaults instead of crashing.
        voices = [name.strip() for name in re.split(r"[;,|]", req.voice or "")]
        speaker1 = voices[0] or "Kore"
        speaker2 = voices[1] if len(voices) > 1 and voices[1] else "Schedar"
        print('------------------------------------------------------- INPUT ---------------------------------------------------------------')
        print(req.voice)
        print(req.input)

        # Generation and transcoding are blocking calls; run them in a worker
        # thread so the event loop keeps serving other requests.
        wav_bytes = await asyncio.to_thread(
            generate_audio,
            model=req.model or "gemini-2.5-flash-preview-tts",
            content=req.input,
            speaker1=speaker1,
            speaker2=speaker2,
        )
        audio_fmt = (req.response_format or "wav").strip().lower()
        # Convert exactly once, using the normalised format name.
        audio_bytes = await asyncio.to_thread(
            convert_format, wav_bytes, "wav", audio_fmt
        )
        return StreamingResponse(
            io.BytesIO(audio_bytes),
            media_type="application/octet-stream",
            headers={
                "Content-Disposition": f'attachment; filename="audio.{audio_fmt}"',
                "X-OpenAI-Response-Format": audio_fmt,
            },
        )
    except HTTPException:
        # Pass deliberate HTTP errors through instead of re-wrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
535
+
536
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 8000, with auto-reload enabled.
    import uvicorn

    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )