File size: 2,569 Bytes
bfcb0a5
9de27c4
 
73a9cc3
 
bfcb0a5
a03524e
 
485f9b6
 
9de27c4
485f9b6
af79835
485f9b6
 
 
 
9de27c4
485f9b6
a03524e
485f9b6
 
 
 
 
 
 
 
 
a03524e
73a9cc3
 
 
bfcb0a5
bdaabef
bfcb0a5
bdaabef
bfcb0a5
 
 
 
 
73a9cc3
bfcb0a5
 
 
 
 
73a9cc3
bfcb0a5
73a9cc3
bfcb0a5
 
 
 
 
 
73a9cc3
bfcb0a5
 
73a9cc3
bfcb0a5
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from flask import Flask, request, jsonify
from transformers import AutoProcessor, SeamlessM4Tv2Model
import numpy as np
import wave
import os
from huggingface_hub import InferenceClient, login
import logging

# Turn on debug-level logging for the whole process.
logging.basicConfig(level=logging.DEBUG)

app = Flask(__name__)

# Directory used as the Hugging Face download cache; created if it is missing.
HUGGINGFACE_CACHE_DIR = "./huggingface_cache"
os.makedirs(HUGGINGFACE_CACHE_DIR, exist_ok=True)
logging.debug("Dossier de cache Hugging Face : %s", HUGGINGFACE_CACHE_DIR)

# Load the processor and the model once, at import time, through the cache
# directory above. Both are shared by every request handler in this module.
_CHECKPOINT = "facebook/seamless-m4t-v2-large"
logging.debug("Début du chargement du processor et du modèle...")
processor = AutoProcessor.from_pretrained(
    _CHECKPOINT,
    cache_dir=HUGGINGFACE_CACHE_DIR,
)
logging.debug("Processor chargé avec succès et mis en cache : %s", processor)
model = SeamlessM4Tv2Model.from_pretrained(
    _CHECKPOINT,
    cache_dir=HUGGINGFACE_CACHE_DIR,
)
logging.debug("Modèle chargé avec succès et mis en cache : %s", model)


# Directory where uploaded and generated audio files are written.
UPLOAD_FOLDER = "audio_files"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

@app.route("/", methods=["GET"])
def return_text():
    """Return a fixed JSON greeting: ``{"text": "Hello, world!"}``."""
    payload = {"text": "Hello, world!"}
    return jsonify(payload)

def _read_wav_mono(path):
    """Decode a PCM WAV file into a mono float32 array.

    Args:
        path: Filesystem path of the WAV file.

    Returns:
        A ``(samples, sample_rate)`` tuple, where ``samples`` is a 1-D float32
        numpy array scaled to roughly [-1.0, 1.0) and ``sample_rate`` is the
        frame rate stored in the file header.

    Raises:
        wave.Error: The file is not a WAV file the ``wave`` module can read.
        EOFError: The file is empty or truncated.
        ValueError: The sample width is not 1, 2 or 4 bytes.
    """
    with wave.open(path, "rb") as wf:
        channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        sample_rate = wf.getframerate()
        frames = wf.readframes(wf.getnframes())

    if sample_width == 1:
        # 8-bit WAV samples are unsigned and centred on 128.
        samples = (np.frombuffer(frames, dtype=np.uint8).astype(np.float32) - 128.0) / 128.0
    elif sample_width == 2:
        samples = np.frombuffer(frames, dtype="<i2").astype(np.float32) / 32768.0
    elif sample_width == 4:
        samples = np.frombuffer(frames, dtype="<i4").astype(np.float32) / 2147483648.0
    else:
        raise ValueError(f"unsupported sample width: {sample_width} bytes")

    if channels > 1:
        # Samples are interleaved per frame; average the channels down to mono.
        samples = samples.reshape(-1, channels).mean(axis=1)
    return samples, sample_rate


def _resample_linear(samples, orig_rate, target_rate):
    """Resample a 1-D signal to ``target_rate`` using linear interpolation."""
    if orig_rate == target_rate or samples.size == 0:
        return samples
    target_len = max(1, int(round(samples.size * target_rate / orig_rate)))
    # Position of each output sample expressed in input-sample units.
    dst_positions = np.arange(target_len, dtype=np.float64) * (orig_rate / target_rate)
    src_positions = np.arange(samples.size, dtype=np.float64)
    return np.interp(dst_positions, src_positions, samples).astype(np.float32)


@app.route("/record", methods=["POST"])
def record_audio():
    """Translate an uploaded WAV recording into French text.

    Expects a multipart form upload with the recording in the ``audio`` field.
    The file is stored under ``UPLOAD_FOLDER``, decoded, resampled to 16 kHz
    and passed to the model with ``tgt_lang="fra"`` and speech generation
    disabled.

    Returns:
        JSON ``{"translated_text": ...}`` on success, or JSON
        ``{"error": ...}`` with HTTP status 400 when the upload is missing,
        empty, or not a readable PCM WAV file.
    """
    # 16 kHz matches the rate this service uses for its own WAV output; see
    # the SeamlessM4T documentation for the model's expected input rate.
    target_rate = 16000

    upload = request.files.get("audio")
    if upload is None:
        return jsonify({"error": "missing 'audio' file field"}), 400

    # The filename is chosen by the client: keep only its final component so
    # it cannot point outside UPLOAD_FOLDER (e.g. "../../etc/passwd").
    safe_name = os.path.basename((upload.filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "upload.wav"
    filename = os.path.join(UPLOAD_FOLDER, safe_name)
    upload.save(filename)

    # Load and decode the audio.
    try:
        audio_data, orig_freq = _read_wav_mono(filename)
    except (wave.Error, EOFError, ValueError) as exc:
        logging.warning("Could not decode uploaded audio %s: %s", filename, exc)
        return jsonify({"error": "audio must be a PCM WAV file"}), 400
    if audio_data.size == 0:
        return jsonify({"error": "audio file contains no samples"}), 400

    audio_data = _resample_linear(audio_data, orig_freq, target_rate)

    audio_inputs = processor(
        audios=audio_data, sampling_rate=target_rate, return_tensors="pt"
    )
    output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)
    translated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)

    return jsonify({"translated_text": translated_text})

@app.route("/text_to_speech", methods=["POST"])
def text_to_speech():
    """Synthesise translated speech for a piece of text and save it as WAV.

    Expects a JSON object with:
        text: The text to translate and speak (required).
        tgt_lang: Target language code passed to ``model.generate`` (required).
        src_lang: Source language code passed to the processor (optional;
            forwarded as-is, including ``None``).

    The generated waveform is written as 16-bit mono PCM to
    ``UPLOAD_FOLDER/output.wav``.

    Returns:
        JSON ``{"audio_url": <path of the written file>}`` on success, or JSON
        ``{"error": ...}`` with HTTP status 400 when the body is not a JSON
        object or a required field is missing.
    """
    # silent=True yields None instead of raising on a missing or invalid body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "request body must be a JSON object"}), 400

    text = data.get("text")
    src_lang = data.get("src_lang")
    tgt_lang = data.get("tgt_lang")

    missing = [
        field for field, value in (("text", text), ("tgt_lang", tgt_lang)) if not value
    ]
    if missing:
        return jsonify({"error": "missing required field(s): " + ", ".join(missing)}), 400

    text_inputs = processor(text=text, src_lang=src_lang, return_tensors="pt")
    audio_array = model.generate(**text_inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()

    # Clip before scaling so out-of-range samples saturate instead of wrapping
    # around in int16, and force little-endian as the WAV format requires.
    pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype("<i2")

    # Prefer the rate advertised by the model config; fall back to 16 kHz.
    sample_rate = int(getattr(model.config, "sampling_rate", 16000))

    # NOTE(review): the output name is fixed, so concurrent requests overwrite
    # each other's file. Kept unchanged because callers may depend on this path.
    output_filename = os.path.join(UPLOAD_FOLDER, "output.wav")
    with wave.open(output_filename, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(pcm.tobytes())

    return jsonify({"audio_url": output_filename})

if __name__ == "__main__":
    # Debug mode stays on by default, as before, but can be switched off with
    # FLASK_DEBUG=0 (or false/no/off) without editing the code. Flask's debug
    # mode enables the interactive debugger, which must not be exposed on a
    # reachable network interface.
    debug_enabled = os.environ.get("FLASK_DEBUG", "1").strip().lower() not in (
        "0",
        "false",
        "no",
        "off",
    )
    app.run(debug=debug_enabled)