# msa1 / app.py

from flask import Flask, request, jsonify
from transformers import AutoProcessor, SeamlessM4Tv2Model
import numpy as np
import torchaudio
import wave
import os

app = Flask(__name__)

# Load the SeamlessM4T v2 processor and model once at startup (the checkpoint is large).
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

UPLOAD_FOLDER = "audio_files"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@app.route("/", methods=["GET"])
def return_text():
return jsonify({"text": "Hello, world!"})
@app.route("/record", methods=["POST"])
def record_audio():
file = request.files['audio']
filename = os.path.join(UPLOAD_FOLDER, file.filename)
file.save(filename)
# Charger et traiter l'audio
audio_data, orig_freq = torchaudio.load(filename)
audio_inputs = processor(audios=audio_data, return_tensors="pt")
output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)
translated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
return jsonify({"translated_text": translated_text})
@app.route("/text_to_speech", methods=["POST"])
def text_to_speech():
data = request.get_json()
text = data.get("text")
src_lang = data.get("src_lang")
tgt_lang = data.get("tgt_lang")
text_inputs = processor(text=text, src_lang=src_lang, return_tensors="pt")
audio_array = model.generate(**text_inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()
output_filename = os.path.join(UPLOAD_FOLDER, "output.wav")
with wave.open(output_filename, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(16000)
wf.writeframes((audio_array * 32767).astype(np.int16).tobytes())
return jsonify({"audio_url": output_filename})

if __name__ == "__main__":
    app.run(debug=True)
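
# To run locally (assumes Flask's default dev server on port 5000):
#   python app.py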