sachin committed
Commit 8308b9e · 1 Parent(s): 98b17b0

test-dhwani-base-model

Files changed (6)
  1. Dockerfile +1 -41
  2. Dockerfile.app +17 -0
  3. Dockerfile.base +32 -0
  4. Dockerfile.models +16 -0
  5. src/server/main.py +20 -23
  6. src/server/main_local.py +913 -0
Dockerfile CHANGED
@@ -1,49 +1,9 @@
- FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04
+ FROM slabstech/dhwani-server-base
  WORKDIR /app

- # Install system dependencies
- RUN apt-get update && apt-get install -y \
-     python3 \
-     python3-pip python3-distutils python3-dev python3-venv \
-     git \
-     ffmpeg \
-     sudo wget curl software-properties-common build-essential gcc g++ \
-     && ln -s /usr/bin/python3 /usr/bin/python \
-     && rm -rf /var/lib/apt/lists/*
-
- # Install Rust
- RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
- ENV PATH="/root/.cargo/bin:${PATH}"
-
- # Set compiler environment variables
- ENV CC=/usr/bin/gcc
- ENV CXX=/usr/bin/g++
-
- # Upgrade pip and install base Python dependencies
- RUN pip install --upgrade pip setuptools setuptools-rust torch
- RUN pip install flash-attn --no-build-isolation
-
- # Copy requirements and configuration files
- COPY requirements.txt .
  COPY dhwani_config.json .

- # Install Python dependencies
- RUN pip install --no-cache-dir -r requirements.txt
-
  # Create a directory for pre-downloaded models
- RUN mkdir -p /app/models
-
- # Define build argument for HF_TOKEN
- ARG HF_TOKEN_DOCKER
-
- # Set environment variable for the build process
- ENV HF_TOKEN=$HF_TOKEN_DOCKER
-
- # Copy and run the model download script
- COPY download_models.py .
- RUN python download_models.py
-
- # Copy application code
  COPY . .

  # Set up user
Dockerfile.app ADDED
@@ -0,0 +1,17 @@
+ # Use the pre-built image with models as the base
+ FROM slabstech/dhwani-model-server:latest
+ WORKDIR /app
+
+ # Copy application code
+ COPY . .
+
+ # Set up user
+ RUN useradd -ms /bin/bash appuser \
+     && chown -R appuser:appuser /app
+ USER appuser
+
+ # Expose port
+ EXPOSE 7860
+
+ # Start the server
+ CMD ["python", "/app/src/server/main.py", "--host", "0.0.0.0", "--port", "7860", "--config", "config_two"]
Dockerfile.base ADDED
@@ -0,0 +1,32 @@
+ # Base image with CUDA support
+ FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04 AS model-downloader
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     python3 \
+     python3-pip python3-distutils python3-dev python3-venv \
+     git \
+     ffmpeg \
+     sudo wget curl software-properties-common build-essential gcc g++ \
+     && ln -s /usr/bin/python3 /usr/bin/python \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Rust
+ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ ENV PATH="/root/.cargo/bin:${PATH}"
+
+ # Set compiler environment variables
+ ENV CC=/usr/bin/gcc
+ ENV CXX=/usr/bin/g++
+
+ # Upgrade pip and install base Python dependencies
+ RUN pip install --upgrade pip setuptools setuptools-rust torch
+ RUN pip install flash-attn --no-build-isolation
+
+ # Copy requirements and configuration files
+ COPY requirements.txt .
+ COPY dhwani_config.json .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
Dockerfile.models ADDED
@@ -0,0 +1,16 @@
+ # Base image with CUDA support
+ FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04 AS model-downloader
+
+
+ # Create a directory for pre-downloaded models
+ RUN mkdir -p /app/models
+
+ # Define build argument for HF_TOKEN
+ ARG HF_TOKEN_DOCKER
+
+ # Set environment variable for the build process
+ ENV HF_TOKEN=$HF_TOKEN_DOCKER
+
+ # Copy and run the model download script
+ COPY download_models.py .
+ RUN python download_models.py
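
download_models.py is copied and run here but is not part of this commit. A plausible minimal sketch, assuming huggingface_hub and the /app/models/* directory names that src/server/main_local.py loads from; the repo-to-directory mapping below is inferred from the model names used elsewhere in this commit:

import os
from huggingface_hub import snapshot_download

# Assumed mapping of local model directories to Hugging Face repos.
MODELS = {
    "llm_model": "google/gemma-3-4b-it",
    "tts_model": "ai4bharat/IndicF5",
    "asr_model": "ai4bharat/indic-conformer-600m-multilingual",
    "trans_en_indic": "ai4bharat/indictrans2-en-indic-dist-200M",
    "trans_indic_en": "ai4bharat/indictrans2-indic-en-dist-200M",
    "trans_indic_indic": "ai4bharat/indictrans2-indic-indic-dist-320M",
}

token = os.environ.get("HF_TOKEN")  # set from the HF_TOKEN_DOCKER build arg
for local_dir, repo_id in MODELS.items():
    snapshot_download(repo_id=repo_id, local_dir=f"/app/models/{local_dir}", token=token)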
src/server/main.py CHANGED
@@ -14,7 +14,7 @@ from pydantic_settings import BaseSettings
  from slowapi import Limiter
  from slowapi.util import get_remote_address
  import torch
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoProcessor, AutoModel, BitsAndBytesConfig, Gemma3ForConditionalGeneration
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoProcessor, BitsAndBytesConfig, AutoModel, Gemma3ForConditionalGeneration
  from IndicTransToolkit import IndicProcessor
  import json
  import asyncio
@@ -91,10 +91,9 @@ class LLMManager:
      async def load(self):
          if not self.is_loaded:
              try:
-                 local_path = "/app/models/llm_model"
                  self.model = await asyncio.to_thread(
                      Gemma3ForConditionalGeneration.from_pretrained,
-                     local_path,
+                     self.model_name,
                      device_map="auto",
                      quantization_config=quantization_config,
                      torch_dtype=self.torch_dtype
@@ -102,10 +101,10 @@
                  self.model.eval()
                  self.processor = await asyncio.to_thread(
                      AutoProcessor.from_pretrained,
-                     local_path
+                     self.model_name
                  )
                  self.is_loaded = True
-                 logger.info(f"LLM loaded from {local_path} on {self.device}")
+                 logger.info(f"LLM {self.model_name} loaded asynchronously on {self.device}")
              except Exception as e:
                  logger.error(f"Failed to load LLM: {str(e)}")
                  raise
@@ -269,15 +268,14 @@

      async def load(self):
          if not self.model:
-             logger.info("Loading TTS model from local path asynchronously...")
-             local_path = "/app/models/tts_model"
+             logger.info("Loading TTS model IndicF5 asynchronously...")
              self.model = await asyncio.to_thread(
                  AutoModel.from_pretrained,
-                 local_path,
+                 self.repo_id,
                  trust_remote_code=True
              )
              self.model = self.model.to(self.device_type)
-             logger.info("TTS model loaded from local path asynchronously")
+             logger.info("TTS model IndicF5 loaded asynchronously")

      def synthesize(self, text, ref_audio_path, ref_text):
          if not self.model:
@@ -364,29 +362,29 @@ class TranslateManager:
      async def load(self):
          if not self.tokenizer or not self.model:
              if self.src_lang.startswith("eng") and not self.tgt_lang.startswith("eng"):
-                 local_path = "/app/models/trans_en_indic"
+                 model_name = "ai4bharat/indictrans2-en-indic-dist-200M" if self.use_distilled else "ai4bharat/indictrans2-en-indic-1B"
              elif not self.src_lang.startswith("eng") and self.tgt_lang.startswith("eng"):
-                 local_path = "/app/models/trans_indic_en"
+                 model_name = "ai4bharat/indictrans2-indic-en-dist-200M" if self.use_distilled else "ai4bharat/indictrans2-indic-en-1B"
              elif not self.src_lang.startswith("eng") and not self.tgt_lang.startswith("eng"):
-                 local_path = "/app/models/trans_indic_indic"
+                 model_name = "ai4bharat/indictrans2-indic-indic-dist-320M" if self.use_distilled else "ai4bharat/indictrans2-indic-indic-1B"
              else:
                  raise ValueError("Invalid language combination")

              self.tokenizer = await asyncio.to_thread(
                  AutoTokenizer.from_pretrained,
-                 local_path,
+                 model_name,
                  trust_remote_code=True
              )
              self.model = await asyncio.to_thread(
                  AutoModelForSeq2SeqLM.from_pretrained,
-                 local_path,
+                 model_name,
                  trust_remote_code=True,
                  torch_dtype=torch.float16,
                  attn_implementation="flash_attention_2"
              )
              self.model = self.model.to(self.device_type)
              self.model = torch.compile(self.model, mode="reduce-overhead")
-             logger.info(f"Translation model loaded from {local_path} asynchronously")
+             logger.info(f"Translation model {model_name} loaded asynchronously")

  class ModelManager:
      def __init__(self, device_type=device, use_distilled=True, is_lazy_loading=False):
@@ -396,11 +394,11 @@ class ModelManager:
          self.is_lazy_loading = is_lazy_loading

      async def load_model(self, src_lang, tgt_lang, key):
-         logger.info(f"Loading translation model for {src_lang} -> {tgt_lang} from local path")
+         logger.info(f"Loading translation model for {src_lang} -> {tgt_lang} asynchronously")
          translate_manager = TranslateManager(src_lang, tgt_lang, self.device_type, self.use_distilled)
          await translate_manager.load()
          self.models[key] = translate_manager
-         logger.info(f"Loaded translation model for {key} from local path")
+         logger.info(f"Loaded translation model for {key} asynchronously")

      def get_model(self, src_lang, tgt_lang):
          key = self._get_model_key(src_lang, tgt_lang)
@@ -429,15 +427,14 @@ class ASRModelManager:

      async def load(self):
          if not self.model:
-             logger.info("Loading ASR model from local path asynchronously...")
-             local_path = "/app/models/asr_model"
+             logger.info("Loading ASR model asynchronously...")
              self.model = await asyncio.to_thread(
                  AutoModel.from_pretrained,
-                 local_path,
+                 "ai4bharat/indic-conformer-600m-multilingual",
                  trust_remote_code=True
              )
              self.model = self.model.to(self.device_type)
-             logger.info("ASR model loaded from local path asynchronously")
+             logger.info("ASR model loaded asynchronously")

  # Global Managers
  llm_manager = LLMManager(settings.llm_model_name)
@@ -508,12 +505,12 @@ async def lifespan(app: FastAPI):
                  translation_tasks.append(model_manager.load_model(src_lang, tgt_lang, key))

              await asyncio.gather(*tasks, *translation_tasks)
-             logger.info("All models loaded successfully from local paths")
+             logger.info("All models loaded successfully asynchronously")
          except Exception as e:
              logger.error(f"Error loading models: {str(e)}")
              raise

-     logger.info("Starting asynchronous model loading from local paths...")
+     logger.info("Starting asynchronous model loading...")
      await load_all_models()
      yield
      llm_manager.unload()
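
The net effect of this diff: main.py now loads every model by Hugging Face repo id (or the configured model_name) instead of the hard-coded /app/models/* paths, which survive only in main_local.py below. A minimal smoke-test client for a running container, assuming the host/port from Dockerfile.app's CMD (endpoint paths as declared in the server code):

import requests

BASE = "http://localhost:7860"

# Health check, then a chat round-trip through translation + LLM.
print(requests.get(f"{BASE}/v1/health", timeout=10).json())

resp = requests.post(
    f"{BASE}/v1/chat",
    json={"prompt": "What is the capital of Karnataka?",
          "src_lang": "eng_Latn", "tgt_lang": "kan_Knda"},
    timeout=300,  # generation plus translation can be slow on first call
)
resp.raise_for_status()
print(resp.json()["response"])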
src/server/main_local.py ADDED
@@ -0,0 +1,913 @@
+ import argparse
+ import io
+ import os
+ from time import time
+ from typing import List
+ import tempfile
+ import uvicorn
+ from fastapi import Depends, FastAPI, File, HTTPException, Query, Request, UploadFile, Body, Form
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
+ from PIL import Image
+ from pydantic import BaseModel, field_validator
+ from pydantic_settings import BaseSettings
+ from slowapi import Limiter
+ from slowapi.util import get_remote_address
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoProcessor, AutoModel, BitsAndBytesConfig, Gemma3ForConditionalGeneration
+ from IndicTransToolkit import IndicProcessor
+ import json
+ import asyncio
+ from contextlib import asynccontextmanager
+ import soundfile as sf
+ import numpy as np
+ import requests
+ from starlette.responses import StreamingResponse
+ from logging_config import logger
+ from tts_config import SPEED, ResponseFormat, config as tts_config
+ import torchaudio
+
+ # Device setup
+ if torch.cuda.is_available():
+     device = "cuda:0"
+     logger.info("GPU will be used for inference")
+ else:
+     device = "cpu"
+     logger.info("CPU will be used for inference")
+ torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32
+
+ # Check CUDA availability and version
+ cuda_available = torch.cuda.is_available()
+ cuda_version = torch.version.cuda if cuda_available else None
+
+ if torch.cuda.is_available():
+     device_idx = torch.cuda.current_device()
+     capability = torch.cuda.get_device_capability(device_idx)
+     compute_capability_float = float(f"{capability[0]}.{capability[1]}")
+     print(f"CUDA version: {cuda_version}")
+     print(f"CUDA Compute Capability: {compute_capability_float}")
+ else:
+     print("CUDA is not available on this system.")
+
+ # Settings
+ class Settings(BaseSettings):
+     llm_model_name: str = "google/gemma-3-4b-it"
+     max_tokens: int = 512
+     host: str = "0.0.0.0"
+     port: int = 7860
+     chat_rate_limit: str = "100/minute"
+     speech_rate_limit: str = "5/minute"
+
+     @field_validator("chat_rate_limit", "speech_rate_limit")
+     def validate_rate_limit(cls, v):
+         if not v.count("/") == 1 or not v.split("/")[0].isdigit():
+             raise ValueError("Rate limit must be in format 'number/period' (e.g., '5/minute')")
+         return v
+
+     class Config:
+         env_file = ".env"
+
+ settings = Settings()
+
+ # Quantization config for LLM
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+
+ # LLM Manager
+ class LLMManager:
+     def __init__(self, model_name: str, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
+         self.model_name = model_name
+         self.device = torch.device(device)
+         self.torch_dtype = torch.bfloat16 if self.device.type != "cpu" else torch.float32
+         self.model = None
+         self.processor = None
+         self.is_loaded = False
+         logger.info(f"LLMManager initialized with model {model_name} on {self.device}")
+
+     async def load(self):
+         if not self.is_loaded:
+             try:
+                 local_path = "/app/models/llm_model"
+                 self.model = await asyncio.to_thread(
+                     Gemma3ForConditionalGeneration.from_pretrained,
+                     local_path,
+                     device_map="auto",
+                     quantization_config=quantization_config,
+                     torch_dtype=self.torch_dtype
+                 )
+                 self.model.eval()
+                 self.processor = await asyncio.to_thread(
+                     AutoProcessor.from_pretrained,
+                     local_path
+                 )
+                 self.is_loaded = True
+                 logger.info(f"LLM loaded from {local_path} on {self.device}")
+             except Exception as e:
+                 logger.error(f"Failed to load LLM: {str(e)}")
+                 raise
+
+     def unload(self):
+         if self.is_loaded:
+             del self.model
+             del self.processor
+             if self.device.type == "cuda":
+                 torch.cuda.empty_cache()
+                 logger.info(f"GPU memory allocated after unload: {torch.cuda.memory_allocated()}")
+             self.is_loaded = False
+             logger.info(f"LLM {self.model_name} unloaded from {self.device}")
+
+     async def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
+         if not self.is_loaded:
+             await self.load()
+
+         messages_vlm = [
+             {
+                 "role": "system",
+                 "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Answer questions considering India as base country and Karnataka as base state. Provide a concise response in one sentence maximum."}]
+             },
+             {
+                 "role": "user",
+                 "content": [{"type": "text", "text": prompt}]
+             }
+         ]
+
+         try:
+             inputs_vlm = self.processor.apply_chat_template(
+                 messages_vlm,
+                 add_generation_prompt=True,
+                 tokenize=True,
+                 return_dict=True,
+                 return_tensors="pt"
+             ).to(self.device, dtype=torch.bfloat16)
+         except Exception as e:
+             logger.error(f"Error in tokenization: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Tokenization failed: {str(e)}")
+
+         input_len = inputs_vlm["input_ids"].shape[-1]
+
+         with torch.inference_mode():
+             generation = self.model.generate(
+                 **inputs_vlm,
+                 max_new_tokens=max_tokens,
+                 do_sample=True,
+                 temperature=temperature
+             )
+             generation = generation[0][input_len:]
+
+         response = self.processor.decode(generation, skip_special_tokens=True)
+         logger.info(f"Generated response: {response}")
+         return response
+
+     async def vision_query(self, image: Image.Image, query: str) -> str:
+         if not self.is_loaded:
+             await self.load()
+
+         messages_vlm = [
+             {
+                 "role": "system",
+                 "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Summarize your answer in maximum 1 sentence."}]
+             },
+             {
+                 "role": "user",
+                 "content": []
+             }
+         ]
+
+         messages_vlm[1]["content"].append({"type": "text", "text": query})
+         if image and image.size[0] > 0 and image.size[1] > 0:
+             messages_vlm[1]["content"].insert(0, {"type": "image", "image": image})
+             logger.info(f"Received valid image for processing")
+         else:
+             logger.info("No valid image provided, processing text only")
+
+         try:
+             inputs_vlm = self.processor.apply_chat_template(
+                 messages_vlm,
+                 add_generation_prompt=True,
+                 tokenize=True,
+                 return_dict=True,
+                 return_tensors="pt"
+             ).to(self.device, dtype=torch.bfloat16)
+         except Exception as e:
+             logger.error(f"Error in apply_chat_template: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Failed to process input: {str(e)}")
+
+         input_len = inputs_vlm["input_ids"].shape[-1]
+
+         with torch.inference_mode():
+             generation = self.model.generate(
+                 **inputs_vlm,
+                 max_new_tokens=512,
+                 do_sample=True,
+                 temperature=0.7
+             )
+             generation = generation[0][input_len:]
+
+         decoded = self.processor.decode(generation, skip_special_tokens=True)
+         logger.info(f"Vision query response: {decoded}")
+         return decoded
+
+     async def chat_v2(self, image: Image.Image, query: str) -> str:
+         if not self.is_loaded:
+             await self.load()
+
+         messages_vlm = [
+             {
+                 "role": "system",
+                 "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Answer questions considering India as base country and Karnataka as base state."}]
+             },
+             {
+                 "role": "user",
+                 "content": []
+             }
+         ]
+
+         messages_vlm[1]["content"].append({"type": "text", "text": query})
+         if image and image.size[0] > 0 and image.size[1] > 0:
+             messages_vlm[1]["content"].insert(0, {"type": "image", "image": image})
+             logger.info(f"Received valid image for processing")
+         else:
+             logger.info("No valid image provided, processing text only")
+
+         try:
+             inputs_vlm = self.processor.apply_chat_template(
+                 messages_vlm,
+                 add_generation_prompt=True,
+                 tokenize=True,
+                 return_dict=True,
+                 return_tensors="pt"
+             ).to(self.device, dtype=torch.bfloat16)
+         except Exception as e:
+             logger.error(f"Error in apply_chat_template: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Failed to process input: {str(e)}")
+
+         input_len = inputs_vlm["input_ids"].shape[-1]
+
+         with torch.inference_mode():
+             generation = self.model.generate(
+                 **inputs_vlm,
+                 max_new_tokens=512,
+                 do_sample=True,
+                 temperature=0.7
+             )
+             generation = generation[0][input_len:]
+
+         decoded = self.processor.decode(generation, skip_special_tokens=True)
+         logger.info(f"Chat_v2 response: {decoded}")
+         return decoded
+
+ # TTS Manager
+ class TTSManager:
+     def __init__(self, device_type=device):
+         self.device_type = device_type
+         self.model = None
+         self.repo_id = "ai4bharat/IndicF5"
+
+     async def load(self):
+         if not self.model:
+             logger.info("Loading TTS model from local path asynchronously...")
+             local_path = "/app/models/tts_model"
+             self.model = await asyncio.to_thread(
+                 AutoModel.from_pretrained,
+                 local_path,
+                 trust_remote_code=True
+             )
+             self.model = self.model.to(self.device_type)
+             logger.info("TTS model loaded from local path asynchronously")
+
+     def synthesize(self, text, ref_audio_path, ref_text):
+         if not self.model:
+             raise ValueError("TTS model not loaded")
+         return self.model(text, ref_audio_path=ref_audio_path, ref_text=ref_text)
+
+ # TTS Constants
+ EXAMPLES = [
+     {
+         "audio_name": "KAN_F (Happy)",
+         "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
+         "ref_text": "ನಮ್‌ ಫ್ರಿಜ್ಜಲ್ಲಿ ಕೂಲಿಂಗ್‌ ಸಮಸ್ಯೆ ಆಗಿ ನಾನ್‌ ಭಾಳ ದಿನದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆಕಾನಿಕ್ ಆಗಿರೋ ನಿಮ್‌ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬೋದು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
+         "synth_text": "ಚೆನ್ನೈನ ಶೇರ್ ಆಟೋ ಪ್ರಯಾಣಿಕರ ನಡುವೆ ಆಹಾರವನ್ನು ಹಂಚಿಕೊಂಡು ತಿನ್ನುವುದು ನನಗೆ ಮನಸ್ಸಿಗೆ ತುಂಬಾ ಒಳ್ಳೆಯದೆನಿಸುವ ವಿಷಯ."
+     },
+ ]
+
+ # Pydantic models for TTS
+ class SynthesizeRequest(BaseModel):
+     text: str
+     ref_audio_name: str
+     ref_text: str = None
+
+ class KannadaSynthesizeRequest(BaseModel):
+     text: str
+
+ # TTS Functions
+ def load_audio_from_url(url: str):
+     response = requests.get(url)
+     if response.status_code == 200:
+         audio_data, sample_rate = sf.read(io.BytesIO(response.content))
+         return sample_rate, audio_data
+     raise HTTPException(status_code=500, detail="Failed to load reference audio from URL.")
+
+ def synthesize_speech(tts_manager: TTSManager, text: str, ref_audio_name: str, ref_text: str):
+     ref_audio_url = None
+     for example in EXAMPLES:
+         if example["audio_name"] == ref_audio_name:
+             ref_audio_url = example["audio_url"]
+             if not ref_text:
+                 ref_text = example["ref_text"]
+             break
+
+     if not ref_audio_url:
+         raise HTTPException(status_code=400, detail="Invalid reference audio name.")
+     if not text.strip():
+         raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")
+     if not ref_text or not ref_text.strip():
+         raise HTTPException(status_code=400, detail="Reference text cannot be empty.")
+
+     sample_rate, audio_data = load_audio_from_url(ref_audio_url)
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+         sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
+         temp_audio.flush()
+         audio = tts_manager.synthesize(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
+
+     if audio.dtype == np.int16:
+         audio = audio.astype(np.float32) / 32768.0
+     buffer = io.BytesIO()
+     sf.write(buffer, audio, 24000, format='WAV')
+     buffer.seek(0)
+     return buffer
+
+ # Supported languages
+ SUPPORTED_LANGUAGES = {
+     "asm_Beng", "kas_Arab", "pan_Guru", "ben_Beng", "kas_Deva", "san_Deva",
+     "brx_Deva", "mai_Deva", "sat_Olck", "doi_Deva", "mal_Mlym", "snd_Arab",
+     "eng_Latn", "mar_Deva", "snd_Deva", "gom_Deva", "mni_Beng", "tam_Taml",
+     "guj_Gujr", "mni_Mtei", "tel_Telu", "hin_Deva", "npi_Deva", "urd_Arab",
+     "kan_Knda", "ory_Orya",
+     "deu_Latn", "fra_Latn", "nld_Latn", "spa_Latn", "ita_Latn",
+     "por_Latn", "rus_Cyrl", "pol_Latn"
+ }
+
+ # Translation Manager
+ class TranslateManager:
+     def __init__(self, src_lang, tgt_lang, device_type=device, use_distilled=True):
+         self.device_type = device_type
+         self.tokenizer = None
+         self.model = None
+         self.src_lang = src_lang
+         self.tgt_lang = tgt_lang
+         self.use_distilled = use_distilled
+
+     async def load(self):
+         if not self.tokenizer or not self.model:
+             if self.src_lang.startswith("eng") and not self.tgt_lang.startswith("eng"):
+                 local_path = "/app/models/trans_en_indic"
+             elif not self.src_lang.startswith("eng") and self.tgt_lang.startswith("eng"):
+                 local_path = "/app/models/trans_indic_en"
+             elif not self.src_lang.startswith("eng") and not self.tgt_lang.startswith("eng"):
+                 local_path = "/app/models/trans_indic_indic"
+             else:
+                 raise ValueError("Invalid language combination")
+
+             self.tokenizer = await asyncio.to_thread(
+                 AutoTokenizer.from_pretrained,
+                 local_path,
+                 trust_remote_code=True
+             )
+             self.model = await asyncio.to_thread(
+                 AutoModelForSeq2SeqLM.from_pretrained,
+                 local_path,
+                 trust_remote_code=True,
+                 torch_dtype=torch.float16,
+                 attn_implementation="flash_attention_2"
+             )
+             self.model = self.model.to(self.device_type)
+             self.model = torch.compile(self.model, mode="reduce-overhead")
+             logger.info(f"Translation model loaded from {local_path} asynchronously")
+
+ class ModelManager:
+     def __init__(self, device_type=device, use_distilled=True, is_lazy_loading=False):
+         self.models = {}
+         self.device_type = device_type
+         self.use_distilled = use_distilled
+         self.is_lazy_loading = is_lazy_loading
+
+     async def load_model(self, src_lang, tgt_lang, key):
+         logger.info(f"Loading translation model for {src_lang} -> {tgt_lang} from local path")
+         translate_manager = TranslateManager(src_lang, tgt_lang, self.device_type, self.use_distilled)
+         await translate_manager.load()
+         self.models[key] = translate_manager
+         logger.info(f"Loaded translation model for {key} from local path")
+
+     def get_model(self, src_lang, tgt_lang):
+         key = self._get_model_key(src_lang, tgt_lang)
+         if key not in self.models:
+             if self.is_lazy_loading:
+                 asyncio.create_task(self.load_model(src_lang, tgt_lang, key))
+             else:
+                 raise ValueError(f"Model for {key} is not preloaded and lazy loading is disabled.")
+         return self.models.get(key)
+
+     def _get_model_key(self, src_lang, tgt_lang):
+         if src_lang.startswith("eng") and not tgt_lang.startswith("eng"):
+             return 'eng_indic'
+         elif not src_lang.startswith("eng") and tgt_lang.startswith("eng"):
+             return 'indic_eng'
+         elif not src_lang.startswith("eng") and not tgt_lang.startswith("eng"):
+             return 'indic_indic'
+         raise ValueError("Invalid language combination")
+
+ # ASR Manager
+ class ASRModelManager:
+     def __init__(self, device_type="cuda"):
+         self.device_type = device_type
+         self.model = None
+         self.model_language = {"kannada": "kn"}
+
+     async def load(self):
+         if not self.model:
+             logger.info("Loading ASR model from local path asynchronously...")
+             local_path = "/app/models/asr_model"
+             self.model = await asyncio.to_thread(
+                 AutoModel.from_pretrained,
+                 local_path,
+                 trust_remote_code=True
+             )
+             self.model = self.model.to(self.device_type)
+             logger.info("ASR model loaded from local path asynchronously")
+
+ # Global Managers
+ llm_manager = LLMManager(settings.llm_model_name)
+ model_manager = ModelManager()
+ asr_manager = ASRModelManager()
+ tts_manager = TTSManager()
+ ip = IndicProcessor(inference=True)
+
+ # Pydantic Models
+ class ChatRequest(BaseModel):
+     prompt: str
+     src_lang: str = "kan_Knda"
+     tgt_lang: str = "kan_Knda"
+
+     @field_validator("prompt")
+     def prompt_must_be_valid(cls, v):
+         if len(v) > 1000:
+             raise ValueError("Prompt cannot exceed 1000 characters")
+         return v.strip()
+
+     @field_validator("src_lang", "tgt_lang")
+     def validate_language(cls, v):
+         if v not in SUPPORTED_LANGUAGES:
+             raise ValueError(f"Unsupported language code: {v}. Supported codes: {', '.join(SUPPORTED_LANGUAGES)}")
+         return v
+
+ class ChatResponse(BaseModel):
+     response: str
+
+ class TranslationRequest(BaseModel):
+     sentences: List[str]
+     src_lang: str
+     tgt_lang: str
+
+ class TranscriptionResponse(BaseModel):
+     text: str
+
+ class TranslationResponse(BaseModel):
+     translations: List[str]
+
+ # Dependency
+ def get_translate_manager(src_lang: str, tgt_lang: str) -> TranslateManager:
+     return model_manager.get_model(src_lang, tgt_lang)
+
+ # Lifespan Event Handler
+ translation_configs = []
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     async def load_all_models():
+         try:
+             tasks = [
+                 llm_manager.load(),
+                 tts_manager.load(),
+                 asr_manager.load(),
+             ]
+
+             translation_tasks = [
+                 model_manager.load_model('eng_Latn', 'kan_Knda', 'eng_indic'),
+                 model_manager.load_model('kan_Knda', 'eng_Latn', 'indic_eng'),
+                 model_manager.load_model('kan_Knda', 'hin_Deva', 'indic_indic'),
+             ]
+
+             for config in translation_configs:
+                 src_lang = config["src_lang"]
+                 tgt_lang = config["tgt_lang"]
+                 key = model_manager._get_model_key(src_lang, tgt_lang)
+                 translation_tasks.append(model_manager.load_model(src_lang, tgt_lang, key))
+
+             await asyncio.gather(*tasks, *translation_tasks)
+             logger.info("All models loaded successfully from local paths")
+         except Exception as e:
+             logger.error(f"Error loading models: {str(e)}")
+             raise
+
+     logger.info("Starting asynchronous model loading from local paths...")
+     await load_all_models()
+     yield
+     llm_manager.unload()
+     logger.info("Server shutdown complete")
+
+ # FastAPI App
+ app = FastAPI(
+     title="Dhwani API",
+     description="AI Chat API supporting Indian languages",
+     version="1.0.0",
+     redirect_slashes=False,
+     lifespan=lifespan
+ )
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=False,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ limiter = Limiter(key_func=get_remote_address)
+ app.state.limiter = limiter
+
+ # API Endpoints
+ @app.post("/audio/speech", response_class=StreamingResponse)
+ async def synthesize_kannada(request: KannadaSynthesizeRequest):
+     if not tts_manager.model:
+         raise HTTPException(status_code=503, detail="TTS model not loaded")
+     kannada_example = next(ex for ex in EXAMPLES if ex["audio_name"] == "KAN_F (Happy)")
+     if not request.text.strip():
+         raise HTTPException(status_code=400, detail="Text to synthesize cannot be empty.")
+
+     audio_buffer = synthesize_speech(
+         tts_manager,
+         text=request.text,
+         ref_audio_name="KAN_F (Happy)",
+         ref_text=kannada_example["ref_text"]
+     )
+
+     return StreamingResponse(
+         audio_buffer,
+         media_type="audio/wav",
+         headers={"Content-Disposition": "attachment; filename=synthesized_kannada_speech.wav"}
+     )
+
+ @app.post("/translate", response_model=TranslationResponse)
+ async def translate(request: TranslationRequest, translate_manager: TranslateManager = Depends(get_translate_manager)):
+     input_sentences = request.sentences
+     src_lang = request.src_lang
+     tgt_lang = request.tgt_lang
+
+     if not input_sentences:
+         raise HTTPException(status_code=400, detail="Input sentences are required")
+
+     batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
+     inputs = translate_manager.tokenizer(
+         batch,
+         truncation=True,
+         padding="longest",
+         return_tensors="pt",
+         return_attention_mask=True,
+     ).to(translate_manager.device_type)
+
+     with torch.no_grad():
+         generated_tokens = translate_manager.model.generate(
+             **inputs,
+             use_cache=True,
+             min_length=0,
+             max_length=256,
+             num_beams=5,
+             num_return_sequences=1,
+         )
+
+     with translate_manager.tokenizer.as_target_tokenizer():
+         generated_tokens = translate_manager.tokenizer.batch_decode(
+             generated_tokens.detach().cpu().tolist(),
+             skip_special_tokens=True,
+             clean_up_tokenization_spaces=True,
+         )
+
+     translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
+     return TranslationResponse(translations=translations)
+
+ async def perform_internal_translation(sentences: List[str], src_lang: str, tgt_lang: str) -> List[str]:
+     try:
+         translate_manager = model_manager.get_model(src_lang, tgt_lang)
+     except ValueError as e:
+         logger.info(f"Model not preloaded: {str(e)}, loading now...")
+         key = model_manager._get_model_key(src_lang, tgt_lang)
+         await model_manager.load_model(src_lang, tgt_lang, key)
+         translate_manager = model_manager.get_model(src_lang, tgt_lang)
+
+     if not translate_manager.model:
+         await translate_manager.load()
+
+     request = TranslationRequest(sentences=sentences, src_lang=src_lang, tgt_lang=tgt_lang)
+     response = await translate(request, translate_manager)
+     return response.translations
+
+ @app.get("/v1/health")
+ async def health_check():
+     return {"status": "healthy", "model": settings.llm_model_name}
+
+ @app.get("/")
+ async def home():
+     return RedirectResponse(url="/docs")
+
+ @app.post("/v1/unload_all_models")
+ async def unload_all_models():
+     try:
+         logger.info("Starting to unload all models...")
+         llm_manager.unload()
+         logger.info("All models unloaded successfully")
+         return {"status": "success", "message": "All models unloaded"}
+     except Exception as e:
+         logger.error(f"Error unloading models: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Failed to unload models: {str(e)}")
+
+ @app.post("/v1/load_all_models")
+ async def load_all_models():
+     try:
+         logger.info("Starting to load all models...")
+         await llm_manager.load()
+         logger.info("All models loaded successfully")
+         return {"status": "success", "message": "All models loaded"}
+     except Exception as e:
+         logger.error(f"Error loading models: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Failed to load models: {str(e)}")
+
+ @app.post("/v1/translate", response_model=TranslationResponse)
+ async def translate_endpoint(request: TranslationRequest):
+     logger.info(f"Received translation request: {request.dict()}")
+     try:
+         translations = await perform_internal_translation(
+             sentences=request.sentences,
+             src_lang=request.src_lang,
+             tgt_lang=request.tgt_lang
+         )
+         logger.info(f"Translation successful: {translations}")
+         return TranslationResponse(translations=translations)
+     except Exception as e:
+         logger.error(f"Unexpected error during translation: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
+
+ @app.post("/v1/chat", response_model=ChatResponse)
+ @limiter.limit(settings.chat_rate_limit)
+ async def chat(request: Request, chat_request: ChatRequest):
+     if not chat_request.prompt:
+         raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+     logger.info(f"Received prompt: {chat_request.prompt}, src_lang: {chat_request.src_lang}, tgt_lang: {chat_request.tgt_lang}")
+
+     EUROPEAN_LANGUAGES = {"deu_Latn", "fra_Latn", "nld_Latn", "spa_Latn", "ita_Latn", "por_Latn", "rus_Cyrl", "pol_Latn"}
+
+     try:
+         if chat_request.src_lang != "eng_Latn" and chat_request.src_lang not in EUROPEAN_LANGUAGES:
+             translated_prompt = await perform_internal_translation(
+                 sentences=[chat_request.prompt],
+                 src_lang=chat_request.src_lang,
+                 tgt_lang="eng_Latn"
+             )
+             prompt_to_process = translated_prompt[0]
+             logger.info(f"Translated prompt to English: {prompt_to_process}")
+         else:
+             prompt_to_process = chat_request.prompt
+             logger.info("Prompt in English or European language, no translation needed")
+
+         response = await llm_manager.generate(prompt_to_process, settings.max_tokens)
+         logger.info(f"Generated response: {response}")
+
+         if chat_request.tgt_lang != "eng_Latn" and chat_request.tgt_lang not in EUROPEAN_LANGUAGES:
+             translated_response = await perform_internal_translation(
+                 sentences=[response],
+                 src_lang="eng_Latn",
+                 tgt_lang=chat_request.tgt_lang
+             )
+             final_response = translated_response[0]
+             logger.info(f"Translated response to {chat_request.tgt_lang}: {final_response}")
+         else:
+             final_response = response
+             logger.info(f"Response in {chat_request.tgt_lang}, no translation needed")
+
+         return ChatResponse(response=final_response)
+     except Exception as e:
+         logger.error(f"Error processing request: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+
+ @app.post("/v1/visual_query/")
+ async def visual_query(
+     file: UploadFile = File(...),
+     query: str = Body(...),
+     src_lang: str = Query("kan_Knda", enum=list(SUPPORTED_LANGUAGES)),
+     tgt_lang: str = Query("kan_Knda", enum=list(SUPPORTED_LANGUAGES)),
+ ):
+     try:
+         image = Image.open(file.file)
+         if image.size == (0, 0):
+             raise HTTPException(status_code=400, detail="Uploaded image is empty or invalid")
+
+         if src_lang != "eng_Latn":
+             translated_query = await perform_internal_translation(
+                 sentences=[query],
+                 src_lang=src_lang,
+                 tgt_lang="eng_Latn"
+             )
+             query_to_process = translated_query[0]
+             logger.info(f"Translated query to English: {query_to_process}")
+         else:
+             query_to_process = query
+             logger.info("Query already in English, no translation needed")
+
+         answer = await llm_manager.vision_query(image, query_to_process)
+         logger.info(f"Generated English answer: {answer}")
+
+         if tgt_lang != "eng_Latn":
+             translated_answer = await perform_internal_translation(
+                 sentences=[answer],
+                 src_lang="eng_Latn",
+                 tgt_lang=tgt_lang
+             )
+             final_answer = translated_answer[0]
+             logger.info(f"Translated answer to {tgt_lang}: {final_answer}")
+         else:
+             final_answer = answer
+             logger.info("Answer kept in English, no translation needed")
+
+         return {"answer": final_answer}
+     except Exception as e:
+         logger.error(f"Error processing request: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+
+ @app.post("/v1/chat_v2", response_model=ChatResponse)
+ @limiter.limit(settings.chat_rate_limit)
+ async def chat_v2(
+     request: Request,
+     prompt: str = Form(...),
+     image: UploadFile = File(default=None),
+     src_lang: str = Form("kan_Knda"),
+     tgt_lang: str = Form("kan_Knda"),
+ ):
+     if not prompt:
+         raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+     if src_lang not in SUPPORTED_LANGUAGES or tgt_lang not in SUPPORTED_LANGUAGES:
+         raise HTTPException(status_code=400, detail=f"Unsupported language code. Supported codes: {', '.join(SUPPORTED_LANGUAGES)}")
+
+     logger.info(f"Received prompt: {prompt}, src_lang: {src_lang}, tgt_lang: {tgt_lang}, Image provided: {image is not None}")
+
+     try:
+         if image:
+             image_data = await image.read()
+             if not image_data:
+                 raise HTTPException(status_code=400, detail="Uploaded image is empty")
+             img = Image.open(io.BytesIO(image_data))
+
+             if src_lang != "eng_Latn":
+                 translated_prompt = await perform_internal_translation(
+                     sentences=[prompt],
+                     src_lang=src_lang,
+                     tgt_lang="eng_Latn"
+                 )
+                 prompt_to_process = translated_prompt[0]
+                 logger.info(f"Translated prompt to English: {prompt_to_process}")
+             else:
+                 prompt_to_process = prompt
+                 logger.info("Prompt already in English, no translation needed")
+
+             decoded = await llm_manager.chat_v2(img, prompt_to_process)
+             logger.info(f"Generated English response: {decoded}")
+
+             if tgt_lang != "eng_Latn":
+                 translated_response = await perform_internal_translation(
+                     sentences=[decoded],
+                     src_lang="eng_Latn",
+                     tgt_lang=tgt_lang
+                 )
+                 final_response = translated_response[0]
+                 logger.info(f"Translated response to {tgt_lang}: {final_response}")
+             else:
+                 final_response = decoded
+                 logger.info("Response kept in English, no translation needed")
+         else:
+             if src_lang != "eng_Latn":
+                 translated_prompt = await perform_internal_translation(
+                     sentences=[prompt],
+                     src_lang=src_lang,
+                     tgt_lang="eng_Latn"
+                 )
+                 prompt_to_process = translated_prompt[0]
+                 logger.info(f"Translated prompt to English: {prompt_to_process}")
+             else:
+                 prompt_to_process = prompt
+                 logger.info("Prompt already in English, no translation needed")
+
+             decoded = await llm_manager.generate(prompt_to_process, settings.max_tokens)
+             logger.info(f"Generated English response: {decoded}")
+
+             if tgt_lang != "eng_Latn":
+                 translated_response = await perform_internal_translation(
+                     sentences=[decoded],
+                     src_lang="eng_Latn",
+                     tgt_lang=tgt_lang
+                 )
+                 final_response = translated_response[0]
+                 logger.info(f"Translated response to {tgt_lang}: {final_response}")
+             else:
+                 final_response = decoded
+                 logger.info("Response kept in English, no translation needed")
+
+         return ChatResponse(response=final_response)
+     except Exception as e:
+         logger.error(f"Error processing request: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+
+ @app.post("/transcribe/", response_model=TranscriptionResponse)
+ async def transcribe_audio(file: UploadFile = File(...), language: str = Query(..., enum=list(asr_manager.model_language.keys()))):
+     if not asr_manager.model:
+         raise HTTPException(status_code=503, detail="ASR model not loaded")
+     try:
+         wav, sr = torchaudio.load(file.file)
+         wav = torch.mean(wav, dim=0, keepdim=True)
+         target_sample_rate = 16000
+         if sr != target_sample_rate:
+             resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sample_rate)
+             wav = resampler(wav)
+         transcription_rnnt = asr_manager.model(wav, asr_manager.model_language[language], "rnnt")
+         return TranscriptionResponse(text=transcription_rnnt)
+     except Exception as e:
+         logger.error(f"Error in transcription: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
+
+ @app.post("/v1/speech_to_speech")
+ async def speech_to_speech(
+     request: Request,
+     file: UploadFile = File(...),
+     language: str = Query(..., enum=list(asr_manager.model_language.keys())),
+ ) -> StreamingResponse:
+     if not tts_manager.model:
+         raise HTTPException(status_code=503, detail="TTS model not loaded")
+     transcription = await transcribe_audio(file, language)
+     logger.info(f"Transcribed text: {transcription.text}")
+
+     chat_request = ChatRequest(
+         prompt=transcription.text,
+         src_lang=LANGUAGE_TO_SCRIPT.get(language, "kan_Knda"),
+         tgt_lang=LANGUAGE_TO_SCRIPT.get(language, "kan_Knda")
+     )
+     processed_text = await chat(request, chat_request)
+     logger.info(f"Processed text: {processed_text.response}")
+
+     voice_request = KannadaSynthesizeRequest(text=processed_text.response)
+     audio_response = await synthesize_kannada(voice_request)
+     return audio_response
+
+ LANGUAGE_TO_SCRIPT = {
+     "kannada": "kan_Knda"
+ }
+
+ # Main Execution
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Run the FastAPI server.")
+     parser.add_argument("--port", type=int, default=settings.port, help="Port to run the server on.")
+     parser.add_argument("--host", type=str, default=settings.host, help="Host to run the server on.")
+     parser.add_argument("--config", type=str, default="config_one", help="Configuration to use")
+     args = parser.parse_args()
+
+     def load_config(config_path="dhwani_config.json"):
+         with open(config_path, "r") as f:
+             return json.load(f)
+
+     config_data = load_config()
+     if args.config not in config_data["configs"]:
+         raise ValueError(f"Invalid config: {args.config}. Available: {list(config_data['configs'].keys())}")
+
+     selected_config = config_data["configs"][args.config]
+     global_settings = config_data["global_settings"]
+
+     settings.llm_model_name = selected_config["components"]["LLM"]["model"]
+     settings.max_tokens = selected_config["components"]["LLM"]["max_tokens"]
+     settings.host = global_settings["host"]
+     settings.port = global_settings["port"]
+     settings.chat_rate_limit = global_settings["chat_rate_limit"]
+     settings.speech_rate_limit = global_settings["speech_rate_limit"]
+
+     llm_manager = LLMManager(settings.llm_model_name)
+
+     if selected_config["components"]["ASR"]:
+         asr_model_name = selected_config["components"]["ASR"]["model"]
+         asr_manager.model_language[selected_config["language"]] = selected_config["components"]["ASR"]["language_code"]
+
+     if selected_config["components"]["Translation"]:
+         translation_configs.extend(selected_config["components"]["Translation"])
+
+     host = args.host if args.host != settings.host else settings.host
+     port = args.port if args.port != settings.port else settings.port
+
+     uvicorn.run(app, host=host, port=port)
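
For completeness, a client sketch for the speech endpoints defined above (paths and parameter names as declared in the route decorators; file names are placeholders):

import requests

BASE = "http://localhost:7860"

# Kannada TTS: JSON body in, WAV attachment out.
tts = requests.post(f"{BASE}/audio/speech", json={"text": "ನಮಸ್ಕಾರ"}, timeout=300)
tts.raise_for_status()
with open("synthesized_kannada_speech.wav", "wb") as f:
    f.write(tts.content)

# Kannada ASR: multipart file upload; `language` is a query parameter.
with open("sample.wav", "rb") as f:
    asr = requests.post(f"{BASE}/transcribe/", params={"language": "kannada"},
                        files={"file": f}, timeout=300)
asr.raise_for_status()
print(asr.json()["text"])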