# msa1/app.py
from flask import Flask, request, jsonify
from transformers import AutoProcessor, SeamlessM4Tv2Model
import numpy as np
import torchaudio
import wave
import os
from huggingface_hub import login
from dotenv import load_dotenv

app = Flask(__name__)

# Authenticate against the Hugging Face Hub with the token from the environment.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

# Load the SeamlessM4T v2 processor and model once at startup.
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

UPLOAD_FOLDER = "audio_files"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@app.route("/", methods=["GET"])
def return_text():
return jsonify({"text": "Hello, world!"})
@app.route("/record", methods=["POST"])
def record_audio():
file = request.files['audio']
filename = os.path.join(UPLOAD_FOLDER, file.filename)
file.save(filename)
# Charger et traiter l'audio
audio_data, orig_freq = torchaudio.load(filename)
audio_inputs = processor(audios=audio_data, return_tensors="pt")
output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)
translated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
return jsonify({"translated_text": translated_text})
@app.route("/text_to_speech", methods=["POST"])
def text_to_speech():
data = request.get_json()
text = data.get("text")
src_lang = data.get("src_lang")
tgt_lang = data.get("tgt_lang")
text_inputs = processor(text=text, src_lang=src_lang, return_tensors="pt")
audio_array = model.generate(**text_inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()
output_filename = os.path.join(UPLOAD_FOLDER, "output.wav")
with wave.open(output_filename, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(16000)
wf.writeframes((audio_array * 32767).astype(np.int16).tobytes())
return jsonify({"audio_url": output_filename})


if __name__ == "__main__":
    app.run(debug=True)