sachin committed on
Commit 773ab72 · 1 Parent(s): 0a0efec
Files changed (1)
  1. src/server/main.py +215 -7
src/server/main.py CHANGED
@@ -14,7 +14,7 @@ from pydantic_settings import BaseSettings
 from slowapi import Limiter
 from slowapi.util import get_remote_address
 import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel, AutoProcessor, BitsAndBytesConfig
 from IndicTransToolkit import IndicProcessor
 import json
 import asyncio
@@ -25,7 +25,6 @@ import requests
 from starlette.responses import StreamingResponse
 from logging_config import logger
 from tts_config import SPEED, ResponseFormat, config as tts_config
-from gemma_llm import LLMManager  # Assuming this is your custom LLMManager
 
 # Device setup
 if torch.cuda.is_available():
@@ -69,6 +68,209 @@ class Settings(BaseSettings):
 
 settings = Settings()
 
+# Quantization config for LLM
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
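+# Note: NF4 4-bit weights with double quantization cut weight memory roughly 4x
+# versus fp16; bnb_4bit_compute_dtype keeps the dequantized matmuls in bfloat16.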
+
+# LLM Manager (adapted from gemma_llm.py)
+class LLMManager:
+    def __init__(self, model_name: str, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
+        self.model_name = model_name
+        self.device = torch.device(device)
+        self.torch_dtype = torch.bfloat16 if self.device.type != "cpu" else torch.float32
+        self.model = None
+        self.is_loaded = False
+        self.processor = None
+        logger.info(f"LLMManager initialized with model {model_name} on {self.device}")
+
+    async def unload(self):
+        if self.is_loaded:
+            await asyncio.to_thread(self._unload_sync)
+            self.is_loaded = False
+            logger.info(f"LLM {self.model_name} unloaded from {self.device}")
+
+    def _unload_sync(self):
+        del self.model
+        del self.processor
+        if self.device.type == "cuda":
+            torch.cuda.empty_cache()
+            logger.info(f"GPU memory allocated after unload: {torch.cuda.memory_allocated()}")
+
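+    # from_pretrained blocks for a long time (download plus weight init), so it
+    # runs in a worker thread via asyncio.to_thread to keep the event loop free.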
+    async def load(self):
+        if not self.is_loaded:
+            try:
+                self.model = await asyncio.to_thread(
+                    AutoModel.from_pretrained,
+                    self.model_name,
+                    device_map="auto",
+                    quantization_config=quantization_config,
+                    torch_dtype=self.torch_dtype
+                )
+                self.model.eval()
+                self.processor = await asyncio.to_thread(AutoProcessor.from_pretrained, self.model_name)
+                self.is_loaded = True
+                logger.info(f"LLM {self.model_name} loaded on {self.device} with 4-bit quantization")
+            except Exception as e:
+                logger.error(f"Failed to load model: {str(e)}")
+                raise HTTPException(status_code=500, detail=f"Model loading failed: {str(e)}")
+
+    async def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
+        if not self.is_loaded:
+            await self.load()
+
+        messages_vlm = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Answer questions considering India as base country and Karnataka as base state. Provide a concise response in one sentence maximum."}]
+            },
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": prompt}]
+            }
+        ]
+
+        try:
+            inputs_vlm = await asyncio.to_thread(
+                self.processor.apply_chat_template,
+                messages_vlm,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt"
+            )
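+            # BatchFeature.to(device, dtype=...) casts only floating-point tensors,
+            # so the integer input_ids are moved to the device but not cast.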
+            inputs_vlm = inputs_vlm.to(self.device, dtype=torch.bfloat16)
+            logger.info(f"Input IDs: {inputs_vlm['input_ids']}")
+            logger.info(f"Decoded input: {self.processor.decode(inputs_vlm['input_ids'][0])}")
+        except Exception as e:
+            logger.error(f"Error in tokenization: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Tokenization failed: {str(e)}")
+
+        input_len = inputs_vlm["input_ids"].shape[-1]
+
+        with torch.inference_mode():
+            generation = await asyncio.to_thread(
+                self.model.generate,
+                **inputs_vlm,
+                max_new_tokens=max_tokens,
+                do_sample=True,
+                temperature=temperature
+            )
+            generation = generation[0][input_len:]
+
+        response = self.processor.decode(generation, skip_special_tokens=True)
+        logger.info(f"Generated response: {response}")
+        return response
+
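+    # Image.Image below is PIL's type; `from PIL import Image` is assumed to be
+    # imported elsewhere in this module, since this hunk does not add it.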
+    async def vision_query(self, image: Image.Image, query: str) -> str:
+        if not self.is_loaded:
+            await self.load()
+
+        messages_vlm = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Summarize your answer in maximum 1 sentence."}]
+            },
+            {
+                "role": "user",
+                "content": []
+            }
+        ]
+
+        messages_vlm[1]["content"].append({"type": "text", "text": query})
+        if image and image.size[0] > 0 and image.size[1] > 0:
+            messages_vlm[1]["content"].insert(0, {"type": "image", "image": image})
+            logger.info(f"Received valid image for processing")
+        else:
+            logger.info("No valid image provided, processing text only")
+
+        try:
+            inputs_vlm = await asyncio.to_thread(
+                self.processor.apply_chat_template,
+                messages_vlm,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt"
+            )
+            inputs_vlm = inputs_vlm.to(self.device, dtype=torch.bfloat16)
+            logger.info(f"Input IDs: {inputs_vlm['input_ids']}")
+        except Exception as e:
+            logger.error(f"Error in apply_chat_template: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Failed to process input: {str(e)}")
+
+        input_len = inputs_vlm["input_ids"].shape[-1]
+
+        with torch.inference_mode():
+            generation = await asyncio.to_thread(
+                self.model.generate,
+                **inputs_vlm,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7
+            )
+            generation = generation[0][input_len:]
+
+        decoded = self.processor.decode(generation, skip_special_tokens=True)
+        logger.info(f"Vision query response: {decoded}")
+        return decoded
+
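+    # chat_v2 mirrors vision_query; only the system prompt and log label differ.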
+    async def chat_v2(self, image: Image.Image, query: str) -> str:
+        if not self.is_loaded:
+            await self.load()
+
+        messages_vlm = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are Dhwani, a helpful assistant. Answer questions considering India as base country and Karnataka as base state."}]
+            },
+            {
+                "role": "user",
+                "content": []
+            }
+        ]
+
+        messages_vlm[1]["content"].append({"type": "text", "text": query})
+        if image and image.size[0] > 0 and image.size[1] > 0:
+            messages_vlm[1]["content"].insert(0, {"type": "image", "image": image})
+            logger.info(f"Received valid image for processing")
+        else:
+            logger.info("No valid image provided, processing text only")
+
+        try:
+            inputs_vlm = await asyncio.to_thread(
+                self.processor.apply_chat_template,
+                messages_vlm,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt"
+            )
+            inputs_vlm = inputs_vlm.to(self.device, dtype=torch.bfloat16)
+            logger.info(f"Input IDs: {inputs_vlm['input_ids']}")
+        except Exception as e:
+            logger.error(f"Error in apply_chat_template: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Failed to process input: {str(e)}")
+
+        input_len = inputs_vlm["input_ids"].shape[-1]
+
+        with torch.inference_mode():
+            generation = await asyncio.to_thread(
+                self.model.generate,
+                **inputs_vlm,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7
+            )
+            generation = generation[0][input_len:]
+
+        decoded = self.processor.decode(generation, skip_special_tokens=True)
+        logger.info(f"Chat_v2 response: {decoded}")
+        return decoded
+
 # TTS Manager
 class TTSManager:
     def __init__(self, device_type=device):
@@ -197,7 +399,7 @@ class ModelManager:
         if src_lang.startswith("eng") and not tgt_lang.startswith("eng"):
             model_name = "ai4bharat/indictrans2-en-indic-dist-200M" if self.use_distilled else "ai4bharat/indictrans2-en-indic-1B"
         elif not src_lang.startswith("eng") and tgt_lang.startswith("eng"):
-            model_name = "ai4bharat/indictrans2-indic-en-dist-200M" if self.use_distilled else "ai4bharat/indictrans2-indic-en-1B"
+            model_name = "ai4bharat/indictrans2-indic-en-dist-200M" if self.use_distilled else "ai4bharat/indictrans2-indic-en-1B"
         else:
             model_name = "ai4bharat/indictrans2-indic-indic-dist-320M" if self.use_distilled else "ai4bharat/indictrans2-indic-indic-1B"
 
@@ -292,6 +494,8 @@ def get_translate_manager(src_lang: str, tgt_lang: str) -> TranslateManager:
     return model_manager.get_model(src_lang, tgt_lang)
 
 # Lifespan Event Handler
+translation_configs = []
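+# Populated in __main__ below from the selected config; lifespan() schedules a
+# load_model task for each entry once the event loop is running.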
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     async def load_all_models():
@@ -303,6 +507,12 @@ async def lifespan(app: FastAPI):
             asyncio.create_task(model_manager.load_model('kan_Knda', 'eng_Latn', 'indic_eng')),
             asyncio.create_task(model_manager.load_model('kan_Knda', 'hin_Deva', 'indic_indic')),
         ]
+        for config in translation_configs:
+            src_lang = config["src_lang"]
+            tgt_lang = config["tgt_lang"]
+            key = model_manager._get_model_key(src_lang, tgt_lang)
+            tasks.append(asyncio.create_task(model_manager.load_model(src_lang, tgt_lang, key)))
+
         await asyncio.gather(*tasks)
         logger.info("All models loaded successfully")
 
@@ -616,6 +826,7 @@ async def transcribe_audio(file: UploadFile = File(...), language: str = Query(.
     if not asr_manager.model:
         raise HTTPException(status_code=503, detail="ASR model still loading, please try again later")
     try:
+        import torchaudio  # Added here for clarity
         wav, sr = torchaudio.load(file.file)
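+        # torch.mean over the channel dim folds stereo to mono; 16 kHz is the
+        # sample rate the ASR model expects.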
         wav = torch.mean(wav, dim=0, keepdim=True)
         target_sample_rate = 16000
@@ -688,10 +899,7 @@ if __name__ == "__main__":
     asr_manager.model_language[selected_config["language"]] = selected_config["components"]["ASR"]["language_code"]
 
     if selected_config["components"]["Translation"]:
-        for translation_config in selected_config["components"]["Translation"]:
-            src_lang = translation_config["src_lang"]
-            tgt_lang = translation_config["tgt_lang"]
-            asyncio.create_task(model_manager.load_model(src_lang, tgt_lang, model_manager._get_model_key(src_lang, tgt_lang)))
+        translation_configs.extend(selected_config["components"]["Translation"])
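+        # asyncio.create_task() here would fail because no event loop is running
+        # yet in __main__; recording the configs and loading them in lifespan()
+        # defers the work to startup, when a loop exists.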
 
     host = args.host if args.host != settings.host else settings.host
     port = args.port if args.port != settings.port else settings.port
 