Commit 7e9e8cc · committed by sachin · Parent(s): fea8b58

test

src/server/main.py CHANGED (+87 -86)
@@ -28,26 +28,19 @@ from tts_config import SPEED, ResponseFormat, config as tts_config
 import torchaudio
 
 # Device setup
-if torch.cuda.is_available():
-    device = "cuda:0"
-    logger.info("GPU will be used for inference")
-else:
-    device = "cpu"
-    logger.info("CPU will be used for inference")
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32
+logger.info(f"Using device: {device} with dtype: {torch_dtype}")
 
 # Check CUDA availability and version
 cuda_available = torch.cuda.is_available()
 cuda_version = torch.version.cuda if cuda_available else None
-
-if torch.cuda.is_available():
+if cuda_available:
     device_idx = torch.cuda.current_device()
     capability = torch.cuda.get_device_capability(device_idx)
-    compute_capability_float = float(f"{capability[0]}.{capability[1]}")
-    print(f"CUDA version: {cuda_version}")
-    print(f"CUDA Compute Capability: {compute_capability_float}")
+    logger.info(f"CUDA version: {cuda_version}, Compute Capability: {capability[0]}.{capability[1]}")
 else:
-
+    logger.info("CUDA is not available; falling back to CPU.")
 
 # Settings
 class Settings(BaseSettings):
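For reference, the consolidated device setup on the new side reduces to the standalone sketch below. It assumes only `torch` and Python's standard `logging` (standing in for the server's `logger`); nothing else from the server is needed.

```python
import logging

import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# One-liner device pick replaces the old if/else block.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# bfloat16 on GPU roughly halves activation memory; CPU stays on float32.
torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32
logger.info(f"Using device: {device} with dtype: {torch_dtype}")

if torch.cuda.is_available():
    idx = torch.cuda.current_device()
    major, minor = torch.cuda.get_device_capability(idx)
    logger.info(f"CUDA version: {torch.version.cuda}, Compute Capability: {major}.{minor}")
```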
@@ -94,14 +87,7 @@ class LLMManager:
         try:
             if self.device.type == "cuda":
                 torch.set_float32_matmul_precision('high')
-                logger.info("Enabled TF32 matrix multiplication for improved performance")
-
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=self.torch_dtype,
-                bnb_4bit_use_double_quant=True
-            )
+                logger.info("Enabled TF32 matrix multiplication for improved GPU performance")
 
             self.model = Gemma3ForConditionalGeneration.from_pretrained(
                 self.model_name,
@@ -113,7 +99,7 @@ class LLMManager:
 
             self.processor = AutoProcessor.from_pretrained(self.model_name, use_fast=True)
             self.is_loaded = True
-            logger.info(f"LLM {self.model_name} loaded on {self.device}")
+            logger.info(f"LLM {self.model_name} loaded on {self.device}")
         except Exception as e:
             logger.error(f"Failed to load LLM: {str(e)}")
             raise HTTPException(status_code=500, detail=f"Model loading failed: {str(e)}")
@@ -124,10 +110,10 @@ class LLMManager:
             del self.processor
             if self.device.type == "cuda":
                 torch.cuda.empty_cache()
-                logger.info(f"GPU memory cleared: {torch.cuda.memory_allocated()} bytes allocated")
+                logger.info(f"GPU memory cleared: {torch.cuda.memory_allocated()} bytes allocated")
             self.is_loaded = False
             self.token_cache.clear()
-            logger.info(f"LLM {self.model_name} unloaded")
+            logger.info(f"LLM {self.model_name} unloaded")
 
     async def generate(self, prompt: str, max_tokens: int = settings.max_tokens, temperature: float = 0.7) -> str:
         if not self.is_loaded:
@@ -139,14 +125,8 @@ class LLMManager:
             return self.token_cache[cache_key]["response"]
 
         messages_vlm = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Answer questions considering India as base country and Karnataka as base state. Provide a concise response in one sentence maximum."}]
-            },
-            {
-                "role": "user",
-                "content": [{"type": "text", "text": prompt}]
-            }
+            {"role": "system", "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Answer questions considering India as base country and Karnataka as base state. Provide a concise response in one sentence maximum."}]},
+            {"role": "user", "content": [{"type": "text", "text": prompt}]}
         ]
 
         try:
@@ -169,7 +149,7 @@ class LLMManager:
             input_len = inputs_vlm["input_ids"].shape[-1]
             adjusted_max_tokens = min(max_tokens, max(20, input_len * 2))
 
-            with torch.no_grad():
+            with torch.no_grad():
                 generation = self.model.generate(
                     **inputs_vlm,
                     max_new_tokens=adjusted_max_tokens,
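The `torch.no_grad()` context around each `generate` call keeps autograd from recording the forward pass, which is pure overhead at inference time. A minimal sketch of the same pattern follows; the `model` and `processor` arguments are placeholders, not the server's Gemma3 objects, and any Hugging Face model/processor pair with `generate`/`decode` would fit.

```python
import torch

def generate_reply(model, processor, inputs, max_new_tokens: int) -> str:
    """Run generation without building an autograd graph (placeholder objects)."""
    with torch.no_grad():  # no gradients tracked, so less GPU memory is held
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
    # Slice off the prompt tokens, mirroring the input_len bookkeeping in the diff.
    input_len = inputs["input_ids"].shape[-1]
    return processor.decode(output_ids[0][input_len:], skip_special_tokens=True)
```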
@@ -189,14 +169,8 @@ class LLMManager:
             self.load()
 
         messages_vlm = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Summarize your answer in one sentence maximum."}]
-            },
-            {
-                "role": "user",
-                "content": [{"type": "text", "text": query}] + ([{"type": "image", "image": image}] if image else [])
-            }
+            {"role": "system", "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Summarize your answer in one sentence maximum."}]},
+            {"role": "user", "content": [{"type": "text", "text": query}] + ([{"type": "image", "image": image}] if image else [])}
         ]
 
         cache_key = f"vision_{query}_{'image' if image else 'no_image'}"
@@ -224,7 +198,7 @@ class LLMManager:
             input_len = inputs_vlm["input_ids"].shape[-1]
             adjusted_max_tokens = min(512, max(20, input_len * 2))
 
-            with torch.no_grad():
+            with torch.no_grad():
                 generation = self.model.generate(
                     **inputs_vlm,
                     max_new_tokens=adjusted_max_tokens,
@@ -244,14 +218,8 @@ class LLMManager:
             self.load()
 
         messages_vlm = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Answer questions considering India as base country and Karnataka as base state."}]
-            },
-            {
-                "role": "user",
-                "content": [{"type": "text", "text": query}] + ([{"type": "image", "image": image}] if image else [])
-            }
+            {"role": "system", "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Answer questions considering India as base country and Karnataka as base state."}]},
+            {"role": "user", "content": [{"type": "text", "text": query}] + ([{"type": "image", "image": image}] if image else [])}
         ]
 
         cache_key = f"chat_v2_{query}_{'image' if image else 'no_image'}"
@@ -279,7 +247,7 @@ class LLMManager:
             input_len = inputs_vlm["input_ids"].shape[-1]
             adjusted_max_tokens = min(512, max(20, input_len * 2))
 
-            with torch.no_grad():
+            with torch.no_grad():
                 generation = self.model.generate(
                     **inputs_vlm,
                     max_new_tokens=adjusted_max_tokens,
@@ -297,19 +265,24 @@ class LLMManager:
 # TTS Manager
 class TTSManager:
     def __init__(self, device_type=device):
-        self.device_type = device_type
+        self.device_type = torch.device(device_type)
         self.model = None
         self.repo_id = "ai4bharat/IndicF5"
 
     def load(self):
         if not self.model:
-            logger.info("Loading TTS model...")
-            self.model = AutoModel.from_pretrained(
-                self.repo_id,
-                trust_remote_code=True
-            )
-            self.model = self.model.to(self.device_type)
-            logger.info("TTS model loaded")
+            logger.info(f"Loading TTS model {self.repo_id} on {self.device_type}...")
+            self.model = AutoModel.from_pretrained(self.repo_id, trust_remote_code=True).to(self.device_type)
+            logger.info("TTS model loaded")
+
+    def unload(self):
+        if self.model:
+            del self.model
+            if self.device_type.type == "cuda":
+                torch.cuda.empty_cache()
+                logger.info(f"TTS GPU memory cleared: {torch.cuda.memory_allocated()} bytes allocated")
+            self.model = None
+            logger.info("TTS model unloaded")
 
     def synthesize(self, text, ref_audio_path, ref_text):
         if not self.model:
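The new `unload()` has the same shape in the TTS, ASR, and translation managers: drop the last Python reference, then ask the CUDA caching allocator to release its blocks. A generic sketch of the pattern under the diff's assumptions (`torch` plus a logger); `ManagedModel` is a hypothetical class, not part of the server:

```python
import logging

import torch

logger = logging.getLogger(__name__)

class ManagedModel:
    """Generic load/unload lifecycle (illustrative only)."""

    def __init__(self, device_type: str = "cuda:0" if torch.cuda.is_available() else "cpu"):
        self.device_type = torch.device(device_type)
        self.model = None

    def unload(self) -> None:
        if self.model is not None:
            del self.model  # drop the last strong reference so tensors can be freed
            if self.device_type.type == "cuda":
                torch.cuda.empty_cache()  # hand cached blocks back to the driver
                logger.info(f"GPU memory cleared: {torch.cuda.memory_allocated()} bytes allocated")
            self.model = None
            logger.info("Model unloaded")
```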
@@ -394,11 +367,11 @@ SUPPORTED_LANGUAGES = {
 
 # Translation Manager
 class TranslateManager:
-    def __init__(self, src_lang, tgt_lang, device_type=device, use_distilled=True):
-        self.device_type = device_type
-        self.tokenizer, self.model = self.initialize_model(src_lang, tgt_lang, use_distilled)
+    def __init__(self, src_lang, tgt_lang, device_type=device):
+        self.device_type = torch.device(device_type)
+        self.tokenizer, self.model = self.initialize_model(src_lang, tgt_lang)
 
-    def initialize_model(self, src_lang, tgt_lang, use_distilled):
+    def initialize_model(self, src_lang, tgt_lang, use_distilled=True):
         if src_lang.startswith("eng") and not tgt_lang.startswith("eng"):
             model_name = "ai4bharat/indictrans2-en-indic-dist-200M" if use_distilled else "ai4bharat/indictrans2-en-indic-1B"
         elif not src_lang.startswith("eng") and tgt_lang.startswith("eng"):
@@ -417,6 +390,17 @@ class TranslateManager:
         ).to(self.device_type)
         return tokenizer, model
 
+    def unload(self):
+        if self.model:
+            del self.model
+            del self.tokenizer
+            if self.device_type.type == "cuda":
+                torch.cuda.empty_cache()
+                logger.info(f"Translation GPU memory cleared: {torch.cuda.memory_allocated()} bytes allocated")
+            self.model = None
+            self.tokenizer = None
+            logger.info("Translation model unloaded")
+
 class ModelManager:
     def __init__(self, device_type=device, use_distilled=True, is_lazy_loading=False):
         self.models: dict[str, TranslateManager] = {}
@@ -432,7 +416,7 @@ class ModelManager:
             ('kan_Knda', 'hin_Deva', 'indic_indic')
         ]
         for src_lang, tgt_lang, key in translation_pairs:
-            logger.info(f"Preloading translation model for {src_lang} -> {tgt_lang}...")
+            logger.info(f"Preloading translation model for {src_lang} -> {tgt_lang} on {self.device_type}...")
             self.models[key] = TranslateManager(src_lang, tgt_lang, self.device_type, self.use_distilled)
             logger.info(f"Translation model for {key} preloaded successfully")
 
@@ -452,21 +436,29 @@ class ModelManager:
 
 # ASR Manager
 class ASRModelManager:
-    def __init__(self, device_type=device):
-        self.device_type = device_type
+    def __init__(self, device_type=device):
+        self.device_type = torch.device(device_type)
         self.model = None
         self.model_language = {"kannada": "kn"}
 
     def load(self):
         if not self.model:
-            logger.info("Loading ASR model...")
+            logger.info(f"Loading ASR model on {self.device_type}...")
             self.model = AutoModel.from_pretrained(
                 "ai4bharat/indic-conformer-600m-multilingual",
                 trust_remote_code=True
-            )
-            self.model = self.model.to(self.device_type)
+            ).to(self.device_type)
             logger.info("ASR model loaded")
 
+    def unload(self):
+        if self.model:
+            del self.model
+            if self.device_type.type == "cuda":
+                torch.cuda.empty_cache()
+                logger.info(f"ASR GPU memory cleared: {torch.cuda.memory_allocated()} bytes allocated")
+            self.model = None
+            logger.info("ASR model unloaded")
+
 # Global Managers
 llm_manager = LLMManager(settings.llm_model_name)
 model_manager = ModelManager()
@@ -552,15 +544,15 @@ translation_configs = []
 async def lifespan(app: FastAPI):
     def load_all_models():
         try:
-            logger.info("Loading LLM model...")
+            logger.info(f"Loading LLM model on {device}...")
             llm_manager.load()
             logger.info("LLM model loaded successfully")
 
-            logger.info("Loading TTS model...")
+            logger.info(f"Loading TTS model on {device}...")
             tts_manager.load()
             logger.info("TTS model loaded successfully")
 
-            logger.info("Loading ASR model...")
+            logger.info(f"Loading ASR model on {device}...")
             asr_manager.load()
             logger.info("ASR model loaded successfully")
 
@@ -574,7 +566,11 @@ async def lifespan(app: FastAPI):
     load_all_models()
     yield
     llm_manager.unload()
-
+    tts_manager.unload()
+    asr_manager.unload()
+    for model in model_manager.models.values():
+        model.unload()
+    logger.info("Server shutdown complete; all models unloaded")
 
 # FastAPI App
 app = FastAPI(
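The lifespan hook now mirrors load with unload: everything before `yield` runs at startup, everything after runs at shutdown. A runnable skeleton of that pattern, with stub managers standing in for the server's real ones:

```python
from contextlib import asynccontextmanager

from fastapi import FastAPI

class StubManager:
    """Stand-in for llm_manager / tts_manager / asr_manager."""
    def load(self) -> None: ...
    def unload(self) -> None: ...

llm_manager, tts_manager, asr_manager = StubManager(), StubManager(), StubManager()

@asynccontextmanager
async def lifespan(app: FastAPI):
    for m in (llm_manager, tts_manager, asr_manager):  # startup: load everything
        m.load()
    yield  # the app serves requests while suspended here
    for m in (llm_manager, tts_manager, asr_manager):  # shutdown: release memory
        m.unload()

app = FastAPI(lifespan=lifespan)
```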
@@ -585,7 +581,6 @@ app = FastAPI(
     lifespan=lifespan
 )
 
-# Add CORS Middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -594,7 +589,6 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Add Timing Middleware
 @app.middleware("http")
 async def add_request_timing(request: Request, call_next):
     start_time = time()
@@ -616,6 +610,10 @@ async def unload_all_models():
     try:
         logger.info("Starting to unload all models...")
         llm_manager.unload()
+        tts_manager.unload()
+        asr_manager.unload()
+        for model in model_manager.models.values():
+            model.unload()
         logger.info("All models unloaded successfully")
         return {"status": "success", "message": "All models unloaded"}
     except Exception as e:
@@ -627,6 +625,8 @@ async def load_all_models():
     try:
         logger.info("Starting to load all models...")
         llm_manager.load()
+        tts_manager.load()
+        asr_manager.load()
         logger.info("All models loaded successfully")
         return {"status": "success", "message": "All models loaded"}
     except Exception as e:
@@ -775,10 +775,9 @@ async def chat_v2(
         logger.error(f"Error processing request: {str(e)}")
         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
 
-# Include LLM Router
 app.include_router(llm_router)
 
-# Improved Endpoints
+# Improved Endpoints with GPU Optimization
 @app.post("/audio/speech", response_class=StreamingResponse)
 async def synthesize_kannada(request: KannadaSynthesizeRequest):
     if not tts_manager.model:
@@ -821,8 +820,11 @@ async def transcribe_audio(file: UploadFile = File(...), language: str = Query(...)):
         if sr != target_sample_rate:
             logger.info(f"Resampling audio from {sr}Hz to {target_sample_rate}Hz")
             resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sample_rate)
-            wav = resampler(wav)
-        transcription_rnnt = asr_manager.model(wav, asr_manager.model_language[language], "rnnt")
+            wav = resampler(wav).to(device)
+        else:
+            wav = wav.to(device)
+        with torch.no_grad():
+            transcription_rnnt = asr_manager.model(wav, asr_manager.model_language[language], "rnnt")
         logger.info(f"Transcription completed: {transcription_rnnt[:50]}...")
         return TranscriptionResponse(text=transcription_rnnt)
     except Exception as e:
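The resample-then-move sequence in the transcription path is worth isolating: `torchaudio.transforms.Resample` is an `nn.Module`, so resampling runs wherever its weights live (CPU here), and only the result is moved to the inference device, as in the diff. A standalone sketch; `prepare_for_asr` is a hypothetical helper, not a server function:

```python
import torch
import torchaudio

def prepare_for_asr(wav: torch.Tensor, sr: int, device: str = "cpu") -> torch.Tensor:
    """Resample to the 16 kHz the ASR model expects, then move to `device`."""
    target_sr = 16000
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        wav = resampler(wav)  # runs on CPU; only the result goes to the GPU
    return wav.to(device)
```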
@@ -837,8 +839,11 @@ async def transcribe_step(audio_data: bytes, language: str) -> str:
     target_sample_rate = 16000
     if sr != target_sample_rate:
         resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sample_rate)
-        wav = resampler(wav)
-    return asr_manager.model(wav, asr_manager.model_language[language], "rnnt")
+        wav = resampler(wav).to(device)
+    else:
+        wav = wav.to(device)
+    with torch.no_grad():
+        return asr_manager.model(wav, asr_manager.model_language[language], "rnnt")
 
 async def synthesize_step(text: str) -> io.BytesIO:
     kannada_example = next((ex for ex in EXAMPLES if ex["audio_name"] == "KAN_F (Happy)"), None)
@@ -863,11 +868,9 @@ async def speech_to_speech(
 
     logger.info(f"Processing speech-to-speech for file: {file.filename} in language: {language}")
     try:
-        # Step 1: Transcribe
         transcription = await transcribe_step(audio_data, language)
         logger.info(f"Transcribed text: {transcription[:50]}...")
 
-        # Step 2: Process with LLM
         chat_request = ChatRequest(
             prompt=transcription,
             src_lang=LANGUAGE_TO_SCRIPT.get(language, "kan_Knda"),
@@ -876,7 +879,6 @@ async def speech_to_speech(
         processed_text = await chat(request, chat_request)
         logger.info(f"Processed text: {processed_text.response[:50]}...")
 
-        # Step 3: Synthesize
         audio_buffer = await synthesize_step(processed_text.response)
         logger.info("Speech-to-speech processing completed")
 
@@ -900,7 +902,8 @@ async def health_check():
         "translation_models": list(model_manager.models.keys()),
         "device": device,
         "cuda_available": cuda_available,
-        "cuda_version": cuda_version if cuda_available else "N/A"
+        "cuda_version": cuda_version if cuda_available else "N/A",
+        "gpu_memory_allocated": torch.cuda.memory_allocated() if cuda_available else 0
     }
     logger.info("Health check requested")
     return status
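One note on the new `gpu_memory_allocated` field: `torch.cuda.memory_allocated()` counts bytes held by live tensors, while `torch.cuda.memory_reserved()` also includes the caching allocator's idle pool, so the two can differ substantially. A quick comparison, assuming a CUDA device is present:

```python
import torch

if torch.cuda.is_available():
    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
    print(f"allocated: {torch.cuda.memory_allocated()} bytes")
    print(f"reserved:  {torch.cuda.memory_reserved()} bytes")
    del x
    torch.cuda.empty_cache()  # shrink the reserved pool, as unload() does
    print(f"reserved after empty_cache: {torch.cuda.memory_reserved()} bytes")
```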
@@ -967,7 +970,6 @@ LANGUAGE_TO_SCRIPT = {
     "kannada": "kan_Knda"
 }
 
-# Main Execution
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run the FastAPI server.")
     parser.add_argument("--port", type=int, default=settings.port, help="Port to run the server on.")
@@ -996,7 +998,6 @@ if __name__ == "__main__":
     llm_manager = LLMManager(settings.llm_model_name)
 
     if selected_config["components"]["ASR"]:
-        asr_model_name = selected_config["components"]["ASR"]["model"]
         asr_manager.model_language[selected_config["language"]] = selected_config["components"]["ASR"]["language_code"]
 
     if selected_config["components"]["Translation"]: