sachin committed
Commit 1936ef7 · 1 Parent(s): 843c466
test-changes
Browse files
src/server/main.py: +169 -240

src/server/main.py CHANGED
@@ -3,7 +3,6 @@ import io
 import os
 from time import time
 from typing import List
-
 import tempfile
 import uvicorn
 from fastapi import Depends, FastAPI, File, HTTPException, Query, Request, UploadFile, Body, Form
@@ -15,31 +14,18 @@ from pydantic_settings import BaseSettings
 from slowapi import Limiter
 from slowapi.util import get_remote_address
 import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel
 from IndicTransToolkit import IndicProcessor
-
-
-from tts_config import SPEED, ResponseFormat, config as tts_config
-from gemma_llm import LLMManager
-# from auth import get_api_key, settings as auth_settings
-
-import time
+import json
+import asyncio
 from contextlib import asynccontextmanager
-from typing import Annotated, Any, OrderedDict, List
-import zipfile
 import soundfile as sf
-import torch
-from fastapi import Body, FastAPI, HTTPException, Response
-from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 import numpy as np
-
-from
-import
-import
-from
-import io
-import os
-import logging
+import requests
+from starlette.responses import StreamingResponse
+from logging_config import logger
+from tts_config import SPEED, ResponseFormat, config as tts_config
+from gemma_llm import LLMManager  # Assuming this is your custom LLMManager
 
 # Device setup
 if torch.cuda.is_available():
@@ -63,40 +49,29 @@ if torch.cuda.is_available():
 else:
     print("CUDA is not available on this system.")
 
-
-
-
-
-
-
-
-
-def chunk_text(text, chunk_size):
-    words = text.split()
-    chunks = []
-    for i in range(0, len(words), chunk_size):
-        chunks.append(' '.join(words[i:i + chunk_size]))
-    return chunks
+# Settings
+class Settings(BaseSettings):
+    llm_model_name: str = "google/gemma-3-4b-it"
+    max_tokens: int = 512
+    host: str = "0.0.0.0"
+    port: int = 7860
+    chat_rate_limit: str = "100/minute"
+    speech_rate_limit: str = "5/minute"
 
+    @field_validator("chat_rate_limit", "speech_rate_limit")
+    def validate_rate_limit(cls, v):
+        if not v.count("/") == 1 or not v.split("/")[0].isdigit():
+            raise ValueError("Rate limit must be in format 'number/period' (e.g., '5/minute')")
+        return v
 
-
-
-import requests
-import tempfile
-import numpy as np
-import soundfile as sf
-from fastapi import FastAPI, HTTPException
-from transformers import AutoModel
-from pydantic import BaseModel
-from typing import Optional
-from starlette.responses import StreamingResponse
+    class Config:
+        env_file = ".env"
 
+settings = Settings()
 
+# TTS Setup
 tts_repo_id = "ai4bharat/IndicF5"
-tts_model = AutoModel.from_pretrained(tts_repo_id, trust_remote_code=True)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print("Device:", device)
-tts_model = tts_model.to(device)
+tts_model = AutoModel.from_pretrained(tts_repo_id, trust_remote_code=True).to(device)
 
 EXAMPLES = [
     {
@@ -107,18 +82,16 @@ EXAMPLES = [
     },
 ]
 
-
-# Pydantic model for request body
+# Pydantic models for TTS
 class SynthesizeRequest(BaseModel):
-    text: str
-    ref_audio_name: str
-    ref_text:
+    text: str
+    ref_audio_name: str
+    ref_text: str = None
 
 class KannadaSynthesizeRequest(BaseModel):
-    text: str
-
+    text: str
 
-#
+# TTS Functions
 def load_audio_from_url(url: str):
     response = requests.get(url)
     if response.status_code == 200:
@@ -126,9 +99,7 @@ def load_audio_from_url(url: str):
         return sample_rate, audio_data
     raise HTTPException(status_code=500, detail="Failed to load reference audio from URL.")
 
-# Function to synthesize speech
 def synthesize_speech(text: str, ref_audio_name: str, ref_text: str):
-    # Find the matching example
     ref_audio_url = None
     for example in EXAMPLES:
         if example["audio_name"] == ref_audio_name:
@@ -139,58 +110,25 @@ def synthesize_speech(text: str, ref_audio_name: str, ref_text: str):
 
     if not ref_audio_url:
         raise HTTPException(status_code=400, detail="Invalid reference audio name.")
-
     if not text.strip():
         raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")
-
     if not ref_text or not ref_text.strip():
         raise HTTPException(status_code=400, detail="Reference text cannot be empty.")
 
-    # Load reference audio from URL
     sample_rate, audio_data = load_audio_from_url(ref_audio_url)
-
-    # Save reference audio to a temporary file
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
         sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
         temp_audio.flush()
-
-    # Generate speech
     audio = tts_model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
 
-    # Normalize output
     if audio.dtype == np.int16:
         audio = audio.astype(np.float32) / 32768.0
-
-    # Save generated audio to a BytesIO buffer
     buffer = io.BytesIO()
     sf.write(buffer, audio, 24000, format='WAV')
     buffer.seek(0)
-
     return buffer
 
-
-async def synthesize_kannada(request: KannadaSynthesizeRequest):
-    # Use the Kannada example as fixed reference
-    kannada_example = next(ex for ex in EXAMPLES if ex["audio_name"] == "KAN_F (Happy)")
-
-    if not request.text.strip():
-        raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")
-
-    # Use the fixed Kannada reference audio and text
-    audio_buffer = synthesize_speech(
-        text=request.text,
-        ref_audio_name="KAN_F (Happy)",
-        ref_text=kannada_example["ref_text"]
-    )
-
-    return StreamingResponse(
-        audio_buffer,
-        media_type="audio/wav",
-        headers={"Content-Disposition": "attachment; filename=synthesized_kannada_speech.wav"}
-    )
-
-
-# Supported language codes
+# Supported languages
 SUPPORTED_LANGUAGES = {
     "asm_Beng", "kas_Arab", "pan_Guru", "ben_Beng", "kas_Deva", "san_Deva",
     "brx_Deva", "mai_Deva", "sat_Olck", "doi_Deva", "mal_Mlym", "snd_Arab",
@@ -201,43 +139,9 @@ SUPPORTED_LANGUAGES = {
     "por_Latn", "rus_Cyrl", "pol_Latn"
 }
 
-
-    llm_model_name: str = "google/gemma-3-4b-it"
-    max_tokens: int = 512
-    host: str = "0.0.0.0"
-    port: int = 7860
-    chat_rate_limit: str = "100/minute"
-    speech_rate_limit: str = "5/minute"
-
-    @field_validator("chat_rate_limit", "speech_rate_limit")
-    def validate_rate_limit(cls, v):
-        if not v.count("/") == 1 or not v.split("/")[0].isdigit():
-            raise ValueError("Rate limit must be in format 'number/period' (e.g., '5/minute')")
-        return v
-
-    class Config:
-        env_file = ".env"
-
-settings = Settings()
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=False,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-limiter = Limiter(key_func=get_remote_address)
-app.state.limiter = limiter
-
-llm_manager = LLMManager(settings.llm_model_name)
-
-# Translation Manager and Model Manager
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
+# Translation Manager
 class TranslateManager:
-    def __init__(self, src_lang, tgt_lang, device_type=
+    def __init__(self, src_lang, tgt_lang, device_type=device, use_distilled=True):
         self.device_type = device_type
         self.tokenizer, self.model = self.initialize_model(src_lang, tgt_lang, use_distilled)
 
@@ -258,55 +162,84 @@ class TranslateManager:
             torch_dtype=torch.float16,
             attn_implementation="flash_attention_2"
         ).to(self.device_type)
-
         model = torch.compile(model, mode="reduce-overhead")
         print("Model compiled with torch.compile")
         return tokenizer, model
 
 class ModelManager:
-    def __init__(self, device_type=
-        self.models
+    def __init__(self, device_type=device, use_distilled=True, is_lazy_loading=False):
+        self.models = {}
         self.device_type = device_type
         self.use_distilled = use_distilled
         self.is_lazy_loading = is_lazy_loading
-        if not is_lazy_loading:
-            self.preload_models()
 
-    def
-
-        self.models['indic_eng'] = TranslateManager('kan_Knda', 'eng_Latn', self.device_type, self.use_distilled)
-        self.models['indic_indic'] = TranslateManager('kan_Knda', 'hin_Deva', self.device_type, self.use_distilled)
-
-    def get_model(self, src_lang, tgt_lang) -> TranslateManager:
+    async def load_model(self, src_lang, tgt_lang, key):
+        logger.info(f"Loading translation model for {src_lang} -> {tgt_lang}")
         if src_lang.startswith("eng") and not tgt_lang.startswith("eng"):
-
+            model_name = "ai4bharat/indictrans2-en-indic-dist-200M" if self.use_distilled else "ai4bharat/indictrans2-en-indic-1B"
         elif not src_lang.startswith("eng") and tgt_lang.startswith("eng"):
-
-        elif not src_lang.startswith("eng") and not tgt_lang.startswith("eng"):
-            key = 'indic_indic'
+            model_name = "ai4bharat/indictrans2-indic-en-dist-200M" if self.use_distilled else "ai4bharat/indictrans2-indic-en-1B"
         else:
-
+            model_name = "ai4bharat/indictrans2-indic-indic-dist-320M" if self.use_distilled else "ai4bharat/indictrans2-indic-indic-1B"
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        model = await asyncio.to_thread(
+            AutoModelForSeq2SeqLM.from_pretrained,
+            model_name,
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            attn_implementation="flash_attention_2"
+        )
+        model = model.to(self.device_type)
+        model = torch.compile(model, mode="reduce-overhead")
+        self.models[key] = TranslateManager(src_lang, tgt_lang, self.device_type, self.use_distilled)
+        logger.info(f"Loaded translation model for {key}")
 
+    def get_model(self, src_lang, tgt_lang):
+        key = self._get_model_key(src_lang, tgt_lang)
         if key not in self.models:
             if self.is_lazy_loading:
-
-                self.models[key] = TranslateManager('eng_Latn', 'kan_Knda', self.device_type, self.use_distilled)
-            elif key == 'indic_eng':
-                self.models[key] = TranslateManager('kan_Knda', 'eng_Latn', self.device_type, self.use_distilled)
-            elif key == 'indic_indic':
-                self.models[key] = TranslateManager('kan_Knda', 'hin_Deva', self.device_type, self.use_distilled)
+                asyncio.create_task(self.load_model(src_lang, tgt_lang, key))
             else:
                 raise ValueError(f"Model for {key} is not preloaded and lazy loading is disabled.")
-        return self.models
+        return self.models.get(key)
 
-
+    def _get_model_key(self, src_lang, tgt_lang):
+        if src_lang.startswith("eng") and not tgt_lang.startswith("eng"):
+            return 'eng_indic'
+        elif not src_lang.startswith("eng") and tgt_lang.startswith("eng"):
+            return 'indic_eng'
+        elif not src_lang.startswith("eng") and not tgt_lang.startswith("eng"):
+            return 'indic_indic'
+        raise ValueError("Invalid language combination")
+
+# ASR Manager
+class ASRModelManager:
+    def __init__(self, device_type="cuda"):
+        self.device_type = device_type
+        self.model = None
+        self.model_language = {"kannada": "kn"}
+
+    async def load(self):
+        logger.info("Loading ASR model...")
+        self.model = await asyncio.to_thread(
+            AutoModel.from_pretrained,
+            "ai4bharat/indic-conformer-600m-multilingual",
+            trust_remote_code=True
+        )
+        logger.info("ASR model loaded")
+
+# Global Managers
+llm_manager = LLMManager(settings.llm_model_name)
 model_manager = ModelManager()
+asr_manager = ASRModelManager()
+ip = IndicProcessor(inference=True)
 
 # Pydantic Models
 class ChatRequest(BaseModel):
     prompt: str
-    src_lang: str = "kan_Knda"
-    tgt_lang: str = "kan_Knda"
+    src_lang: str = "kan_Knda"
+    tgt_lang: str = "kan_Knda"
 
     @field_validator("prompt")
     def prompt_must_be_valid(cls, v):
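Worth noting in the hunk above: the new ModelManager.load_model pushes each blocking from_pretrained call onto a worker thread via asyncio.to_thread, so the event loop keeps serving requests while weights load from disk. A minimal, self-contained sketch of that pattern (slow_load and the model keys are illustrative stand-ins, not code from this commit):

import asyncio
import time

def slow_load(name: str) -> str:
    # Stand-in for a blocking call such as AutoModel.from_pretrained(...)
    time.sleep(2)
    return f"model:{name}"

async def main():
    # Both loads run in worker threads; the event loop stays responsive.
    eng_indic, indic_eng = await asyncio.gather(
        asyncio.to_thread(slow_load, "eng_indic"),
        asyncio.to_thread(slow_load, "indic_eng"),
    )
    print(eng_indic, indic_eng)

asyncio.run(main())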
@@ -331,11 +264,72 @@ class TranslationRequest(BaseModel):
 class TranslationResponse(BaseModel):
     translations: List[str]
 
-
+class TranscriptionResponse(BaseModel):
+    text: str
+
+# Dependency
 def get_translate_manager(src_lang: str, tgt_lang: str) -> TranslateManager:
     return model_manager.get_model(src_lang, tgt_lang)
 
-#
+# Lifespan Event Handler
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    async def load_all_models():
+        tasks = [
+            asyncio.create_task(llm_manager.load()),
+            asyncio.create_task(asr_manager.load()),
+            asyncio.create_task(model_manager.load_model('eng_Latn', 'kan_Knda', 'eng_indic')),
+            asyncio.create_task(model_manager.load_model('kan_Knda', 'eng_Latn', 'indic_eng')),
+            asyncio.create_task(model_manager.load_model('kan_Knda', 'hin_Deva', 'indic_indic')),
+        ]
+        await asyncio.gather(*tasks)
+        logger.info("All models loaded successfully")
+
+    logger.info("Starting model loading in background...")
+    asyncio.create_task(load_all_models())
+    yield
+    await llm_manager.unload()
+    logger.info("Server shutdown complete")
+
+# FastAPI App
+app = FastAPI(
+    title="Dhwani API",
+    description="AI Chat API supporting Indian languages",
+    version="1.0.0",
+    redirect_slashes=False,
+    lifespan=lifespan
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+limiter = Limiter(key_func=get_remote_address)
+app.state.limiter = limiter
+
+# API Endpoints
+@app.post("/audio/speech", response_class=StreamingResponse)
+async def synthesize_kannada(request: KannadaSynthesizeRequest):
+    kannada_example = next(ex for ex in EXAMPLES if ex["audio_name"] == "KAN_F (Happy)")
+    if not request.text.strip():
+        raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")
+
+    audio_buffer = synthesize_speech(
+        text=request.text,
+        ref_audio_name="KAN_F (Happy)",
+        ref_text=kannada_example["ref_text"]
+    )
+
+    return StreamingResponse(
+        audio_buffer,
+        media_type="audio/wav",
+        headers={"Content-Disposition": "attachment; filename=synthesized_kannada_speech.wav"}
+    )
+
 @app.post("/translate", response_model=TranslationResponse)
 async def translate(request: TranslationRequest, translate_manager: TranslateManager = Depends(get_translate_manager)):
     input_sentences = request.sentences
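The lifespan handler added above follows FastAPI's documented contract: everything before the yield runs at startup, everything after it at shutdown. Reduced to a generic skeleton (a sketch only; fake_load is a placeholder for the real managers):

import asyncio
from contextlib import asynccontextmanager
from fastapi import FastAPI

async def fake_load() -> None:
    await asyncio.sleep(5)  # placeholder for real model loading

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: schedule loading as a background task so the server
    # accepts requests before the models are ready.
    task = asyncio.create_task(fake_load())
    yield
    # Shutdown: release resources held by the background work.
    task.cancel()

app = FastAPI(lifespan=lifespan)

The trade-off of loading in the background is that every endpoint touching a model must tolerate a not-yet-loaded state, which is what the 503 guard added to /transcribe/ further down handles.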
@@ -346,7 +340,6 @@ async def translate(request: TranslationRequest, translate_manager: TranslateManager = Depends(get_translate_manager)):
         raise HTTPException(status_code=400, detail="Input sentences are required")
 
     batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
-
     inputs = translate_manager.tokenizer(
         batch,
         truncation=True,
@@ -375,14 +368,12 @@ async def translate(request: TranslationRequest, translate_manager: TranslateManager = Depends(get_translate_manager)):
     translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
     return TranslationResponse(translations=translations)
 
-# Helper function to perform internal translation
 async def perform_internal_translation(sentences: List[str], src_lang: str, tgt_lang: str) -> List[str]:
     translate_manager = model_manager.get_model(src_lang, tgt_lang)
     request = TranslationRequest(sentences=sentences, src_lang=src_lang, tgt_lang=tgt_lang)
     response = await translate(request, translate_manager)
     return response.translations
 
-# API Endpoints
 @app.get("/v1/health")
 async def health_check():
     return {"status": "healthy", "model": settings.llm_model_name}
@@ -395,7 +386,7 @@ async def home():
 async def unload_all_models():
     try:
         logger.info("Starting to unload all models...")
-        llm_manager.unload()
+        await llm_manager.unload()
        logger.info("All models unloaded successfully")
         return {"status": "success", "message": "All models unloaded"}
     except Exception as e:
@@ -406,7 +397,7 @@ async def unload_all_models():
 async def load_all_models():
     try:
         logger.info("Starting to load all models...")
-        llm_manager.load()
+        await llm_manager.load()
         logger.info("All models loaded successfully")
         return {"status": "success", "message": "All models loaded"}
     except Exception as e:
@@ -596,57 +587,10 @@ async def chat_v2(
         logger.error(f"Error processing request: {str(e)}")
         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
 
-class TranscriptionResponse(BaseModel):
-    text: str
-
-class ASRModelManager:
-    def __init__(self, device_type="cuda"):
-        self.device_type = device_type
-        self.model_language = {
-            "kannada": "kn"
-        }
-        '''
-        self.model_language = {
-            "kannada": "kn", "hindi": "hi", "malayalam": "ml", "assamese": "as", "bengali": "bn",
-            "bodo": "brx", "dogri": "doi", "gujarati": "gu", "kashmiri": "ks", "konkani": "kok",
-            "maithili": "mai", "manipuri": "mni", "marathi": "mr", "nepali": "ne", "odia": "or",
-            "punjabi": "pa", "sanskrit": "sa", "santali": "sat", "sindhi": "sd", "tamil": "ta",
-            "telugu": "te", "urdu": "ur"
-        }
-        '''
-
-from fastapi import FastAPI, UploadFile
-import torch
-import torchaudio
-from transformers import AutoModel
-import argparse
-import uvicorn
-from pydantic import BaseModel
-from pydub import AudioSegment
-from fastapi import FastAPI, File, UploadFile, HTTPException, Query
-from fastapi.responses import RedirectResponse, JSONResponse
-from typing import List
-
-# Load the model
-model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True)
-
-asr_manager = ASRModelManager()
-
-# Language to script mapping
-LANGUAGE_TO_SCRIPT = {
-    "kannada": "kan_Knda"
-}
-'''
-LANGUAGE_TO_SCRIPT = {
-    "kannada": "kan_Knda", "hindi": "hin_Deva", "malayalam": "mal_Mlym", "tamil": "tam_Taml",
-    "telugu": "tel_Telu", "assamese": "asm_Beng", "bengali": "ben_Beng", "gujarati": "guj_Gujr",
-    "marathi": "mar_Deva", "odia": "ory_Orya", "punjabi": "pan_Guru", "urdu": "urd_Arab",
-    # Add more as needed
-}
-'''
-
 @app.post("/transcribe/", response_model=TranscriptionResponse)
 async def transcribe_audio(file: UploadFile = File(...), language: str = Query(..., enum=list(asr_manager.model_language.keys()))):
+    if not asr_manager.model:
+        raise HTTPException(status_code=503, detail="ASR model still loading, please try again later")
     try:
         wav, sr = torchaudio.load(file.file)
         wav = torch.mean(wav, dim=0, keepdim=True)
@@ -654,51 +598,45 @@ async def transcribe_audio(file: UploadFile = File(...), language: str = Query(..., enum=list(asr_manager.model_language.keys()))):
         if sr != target_sample_rate:
             resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sample_rate)
             wav = resampler(wav)
-        transcription_rnnt = model(wav, asr_manager.model_language[language], "rnnt")
+        transcription_rnnt = asr_manager.model(wav, asr_manager.model_language[language], "rnnt")
         return TranscriptionResponse(text=transcription_rnnt)
     except Exception as e:
         logger.error(f"Error in transcription: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
+
 @app.post("/v1/speech_to_speech")
 async def speech_to_speech(
-    request: Request,
+    request: Request,
     file: UploadFile = File(...),
     language: str = Query(..., enum=list(asr_manager.model_language.keys())),
 ) -> StreamingResponse:
-    # Step 1: Transcribe audio to text
     transcription = await transcribe_audio(file, language)
     logger.info(f"Transcribed text: {transcription.text}")
 
-    # Step 2: Process text with chat endpoint
     chat_request = ChatRequest(
         prompt=transcription.text,
-        src_lang=LANGUAGE_TO_SCRIPT.get(language, "kan_Knda"),
+        src_lang=LANGUAGE_TO_SCRIPT.get(language, "kan_Knda"),
         tgt_lang=LANGUAGE_TO_SCRIPT.get(language, "kan_Knda")
     )
-    processed_text = await chat(request, chat_request)
+    processed_text = await chat(request, chat_request)
     logger.info(f"Processed text: {processed_text.response}")
 
     voice_request = KannadaSynthesizeRequest(text=processed_text.response)
-
-    # Step 3: Convert processed text to speech
-    audio_response = await synthesize_kannada(
-        voice_request
-    )
+    audio_response = await synthesize_kannada(voice_request)
     return audio_response
 
-
-
-
-import json
+LANGUAGE_TO_SCRIPT = {
+    "kannada": "kan_Knda"
+}
 
+# Main Execution
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run the FastAPI server.")
     parser.add_argument("--port", type=int, default=settings.port, help="Port to run the server on.")
     parser.add_argument("--host", type=str, default=settings.host, help="Host to run the server on.")
-    parser.add_argument("--config", type=str, default="config_one", help="Configuration to use
+    parser.add_argument("--config", type=str, default="config_one", help="Configuration to use")
     args = parser.parse_args()
 
-    # Load the JSON configuration file
     def load_config(config_path="dhwani_config.json"):
         with open(config_path, "r") as f:
             return json.load(f)
@@ -710,7 +648,6 @@ if __name__ == "__main__":
     selected_config = config_data["configs"][args.config]
     global_settings = config_data["global_settings"]
 
-    # Update settings based on selected config
     settings.llm_model_name = selected_config["components"]["LLM"]["model"]
     settings.max_tokens = selected_config["components"]["LLM"]["max_tokens"]
     settings.host = global_settings["host"]
@@ -718,27 +655,19 @@ if __name__ == "__main__":
     settings.chat_rate_limit = global_settings["chat_rate_limit"]
     settings.speech_rate_limit = global_settings["speech_rate_limit"]
 
-    # Initialize LLMManager with the selected LLM model
     llm_manager = LLMManager(settings.llm_model_name)
 
-    # Initialize ASR model if present in config
     if selected_config["components"]["ASR"]:
         asr_model_name = selected_config["components"]["ASR"]["model"]
-        model = AutoModel.from_pretrained(asr_model_name, trust_remote_code=True)
         asr_manager.model_language[selected_config["language"]] = selected_config["components"]["ASR"]["language_code"]
 
-
-
-    # Initialize Translation models - load all specified models
     if selected_config["components"]["Translation"]:
         for translation_config in selected_config["components"]["Translation"]:
             src_lang = translation_config["src_lang"]
             tgt_lang = translation_config["tgt_lang"]
-            model_manager.
+            asyncio.create_task(model_manager.load_model(src_lang, tgt_lang, model_manager._get_model_key(src_lang, tgt_lang)))
 
-    # Override host and port from command line arguments if provided
     host = args.host if args.host != settings.host else settings.host
     port = args.port if args.port != settings.port else settings.port
 
-    # Run the server
     uvicorn.run(app, host=host, port=port)
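For orientation, the keys that the __main__ block reads from dhwani_config.json imply a layout roughly like the following. The structure is inferred from the accesses in the hunks above, and the values are only examples, not a file shipped with this commit:

EXAMPLE_DHWANI_CONFIG = {
    "global_settings": {
        "host": "0.0.0.0",
        "chat_rate_limit": "100/minute",
        "speech_rate_limit": "5/minute",
    },
    "configs": {
        "config_one": {
            "language": "kannada",
            "components": {
                "LLM": {"model": "google/gemma-3-4b-it", "max_tokens": 512},
                "ASR": {
                    "model": "ai4bharat/indic-conformer-600m-multilingual",
                    "language_code": "kn",
                },
                "Translation": [
                    {"src_lang": "eng_Latn", "tgt_lang": "kan_Knda"},
                    {"src_lang": "kan_Knda", "tgt_lang": "eng_Latn"},
                ],
            },
        },
    },
}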
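A quick way to exercise the reworked endpoints once the server is up (a sketch; it assumes a local instance on the default port and a Kannada sample recording on disk):

import requests

BASE = "http://localhost:7860"

# TTS: POST /audio/speech with a KannadaSynthesizeRequest body.
resp = requests.post(f"{BASE}/audio/speech", json={"text": "ನಮಸ್ಕಾರ"})
resp.raise_for_status()
with open("synthesized.wav", "wb") as f:
    f.write(resp.content)

# ASR: POST /transcribe/ with an uploaded file and a language query parameter.
with open("sample.wav", "rb") as f:
    resp = requests.post(
        f"{BASE}/transcribe/",
        params={"language": "kannada"},
        files={"file": f},
    )
print(resp.json())  # e.g. {"text": "..."}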