"""FastAPI service exposing a single speech-to-text endpoint.

The models are loaded once at import time and shared by all requests:
a faster-whisper model wrapped in a batched inference pipeline, a
``GecBERTModel`` used as a post-correction step, a basic text normalizer
and an inverse text normalizer constructed with ``'vi'``.
"""

import os
import shutil
import tempfile
from pathlib import Path

import uvicorn
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from faster_whisper import WhisperModel, BatchedInferencePipeline
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

from gector import GecBERTModel
from text_processing.inverse_normalize import InverseNormalizer

# Initialize the FastAPI app
app = FastAPI()

# Initialize models and normalizers (module-level so they are built only once).
current_dir = Path(__file__).parent.as_posix()
inverse_normalizer = InverseNormalizer('vi')
whisper_model = WhisperModel("pho_distill_q8", device="cuda", compute_type="auto")
batched_model = BatchedInferencePipeline(
    model=whisper_model,
    use_vad_model=True,
    chunk_length=20,
)
gector_model = GecBERTModel(
    vocab_path=os.path.join(current_dir, "gector/vocabulary"),
    model_paths=[os.path.join(current_dir, "gector/Model_GECTOR")],
    split_chunk=True,
)
normalizer = BasicTextNormalizer()


@app.post("/transcriptions")
async def transcribe_audio(file: UploadFile = File(...)) -> JSONResponse:
    """Transcribe an uploaded audio file and return the post-processed text.

    The upload is spooled to a temporary file, transcribed with
    ``language="vi"``, then each segment's text is run through
    ``normalizer`` and ``inverse_normalizer`` before the whole list is
    passed to ``gector_model``.

    Args:
        file: The uploaded audio file (multipart form field ``file``).

    Returns:
        A JSON response of the form ``{"text": "<segments joined by spaces>"}``.
        The text is empty when the transcription produced no segments.

    NOTE(review): transcription and correction are blocking calls inside an
    ``async`` handler, so they occupy the event loop for the duration of a
    request. Left as-is because the thread-safety of the models is not
    visible from here.
    """
    # Never build the path from the client-supplied filename: it is untrusted
    # (may contain path separators or be None) and two concurrent uploads with
    # the same name would overwrite each other. Only the extension is kept.
    suffix = Path(file.filename or "").suffix
    tmp = tempfile.NamedTemporaryFile(prefix="temp_", suffix=suffix, delete=False)
    temp_file_path = Path(tmp.name)
    try:
        with tmp:
            shutil.copyfileobj(file.file, tmp)

        segments, _info = batched_model.transcribe(
            str(temp_file_path),
            language="vi",
            batch_size=16,
        )
        # Consume the segments while the file still exists, in case the
        # pipeline yields them lazily — NOTE(review): confirm against
        # faster-whisper's behavior.
        transcriptions = [segment.text for segment in segments]
    finally:
        # Always clean up, including when saving or transcription fails.
        temp_file_path.unlink(missing_ok=True)

    normalized_transcriptions = [
        inverse_normalizer.inverse_normalize(normalizer(text))
        for text in transcriptions
    ]

    # Nothing was recognized; skip the correction model rather than feeding
    # it an empty batch (its handling of [] is not visible from here).
    if not normalized_transcriptions:
        return JSONResponse({"text": ""})

    corrected_texts = gector_model(normalized_transcriptions)
    return JSONResponse({"text": ' '.join(corrected_texts)})


if __name__ == "__main__":
    # Pass the app object rather than an "api:app" import string: the string
    # form makes uvicorn import this module a second time (loading every
    # model twice) and only works if the file is named api.py.
    uvicorn.run(app, host="0.0.0.0", port=8000)