CamiloVega committed
Commit c59e337 · verified · 1 parent: 951c395

Update app.py

Files changed (1):
  1. app.py +48 -27
app.py CHANGED
@@ -40,9 +40,9 @@ class ModelManager:
         self.whisper_model = None
         self._initialized = True

-    @spaces.GPU(duration=60)
+    @spaces.GPU(duration=120)
     def initialize_models(self):
-        """Initialize models with Zero GPU optimizations"""
+        """Initialize models with optimized settings"""
         try:
             # Get HuggingFace token
             HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
@@ -52,18 +52,19 @@ class ModelManager:
             logger.info("Starting model initialization...")
             model_name = "meta-llama/Llama-2-7b-chat-hf"

-            # Load tokenizer
+            # Load tokenizer with optimized settings
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 token=HUGGINGFACE_TOKEN,
-                use_fast=False
+                use_fast=True,
+                model_max_length=512
             )
             if self.tokenizer is None:
                 raise RuntimeError("Failed to initialize tokenizer")
             self.tokenizer.pad_token = self.tokenizer.eos_token

-            # Load model with specific GPU memory settings
+            # Load model with optimized memory settings
             logger.info("Loading model...")
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
@@ -71,12 +72,13 @@ class ModelManager:
                 torch_dtype=torch.float16,
                 device_map="auto",
                 low_cpu_mem_usage=True,
-                max_memory={0: "8GiB"}
+                max_memory={0: "6GiB"},
+                load_in_8bit=True
             )
             if self.model is None:
                 raise RuntimeError("Failed to initialize model")

-            # Create pipeline
+            # Create pipeline with optimized settings
             logger.info("Creating pipeline...")
             self.news_generator = pipeline(
                 "text-generation",
@@ -84,18 +86,24 @@ class ModelManager:
                 tokenizer=self.tokenizer,
                 device_map="auto",
                 torch_dtype=torch.float16,
-                max_length=2048,
+                max_new_tokens=512,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.95,
-                repetition_penalty=1.2
+                repetition_penalty=1.2,
+                num_return_sequences=1,
+                early_stopping=True
             )
             if self.news_generator is None:
                 raise RuntimeError("Failed to initialize news generator pipeline")

-            # Load Whisper model
+            # Load Whisper model with optimized settings
             logger.info("Loading Whisper model...")
-            self.whisper_model = whisper.load_model("base", device="cuda")
+            self.whisper_model = whisper.load_model(
+                "tiny",
+                device="cuda",
+                download_root="/tmp/whisper"
+            )
             if self.whisper_model is None:
                 raise RuntimeError("Failed to initialize Whisper model")

@@ -108,15 +116,25 @@ class ModelManager:
             raise

     def reset_models(self):
-        """Reset all models to None"""
-        self.tokenizer = None
-        self.model = None
-        self.news_generator = None
-        self.whisper_model = None
-
-        # Clear CUDA cache
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+        """Reset all models and clear GPU memory"""
+        try:
+            del self.tokenizer
+            del self.model
+            del self.news_generator
+            del self.whisper_model
+
+            self.tokenizer = None
+            self.model = None
+            self.news_generator = None
+            self.whisper_model = None
+
+            # Clear CUDA cache
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+
+        except Exception as e:
+            logger.error(f"Error during model reset: {str(e)}")

     def check_models_initialized(self):
         """Check if all models are properly initialized"""
@@ -184,7 +202,7 @@ def preprocess_audio(audio_file):
         logger.error(f"Error preprocessing audio: {str(e)}")
         raise

-@spaces.GPU(duration=60)
+@spaces.GPU(duration=120)
 def transcribe_audio(file):
     """Transcribe an audio or video file."""
     try:
@@ -262,7 +280,7 @@ def process_social_content(url):
         logger.error(f"Error processing social content: {str(e)}")
         return None

-@spaces.GPU(duration=60)
+@spaces.GPU(duration=120)
 def generate_news(instructions, facts, size, tone, *args):
     try:
         # Get initialized models
@@ -371,18 +389,21 @@ Follow these requirements:
 - Do not invent information
 - Be rigorous with the provided facts [/INST]"""

-        # Generate article with specific handling for Zero GPU
+        # Optimize size and max tokens
+        max_tokens = min(int(size * 1.5), 512)
+
+        # Generate article with optimized settings
         with torch.inference_mode():
             outputs = news_generator(
                 prompt,
-                max_new_tokens=min(int(size * 2), 1024),
-                return_full_text=False,
-                pad_token_id=tokenizer.eos_token_id,
+                max_new_tokens=max_tokens,
                 num_return_sequences=1,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.95,
-                repetition_penalty=1.2
+                repetition_penalty=1.2,
+                early_stopping=True,
+                pad_token_id=tokenizer.eos_token_id
             )

         news_article = outputs[0]['generated_text']
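
The @spaces.GPU decorator is how ZeroGPU Spaces borrow a GPU: the decorated function runs with CUDA attached for at most "duration" seconds, so raising the budget from 60 to 120 gives the heavier calls (quantized model loading, generation) more headroom. A minimal sketch of the pattern, assuming the spaces package that ships on ZeroGPU hardware; gpu_probe is a hypothetical function, not one from app.py:

    import spaces
    import torch

    @spaces.GPU(duration=120)  # GPU is attached for at most 120 s per call
    def gpu_probe() -> str:
        # CUDA is only guaranteed to be available inside a @spaces.GPU call
        return f"cuda available: {torch.cuda.is_available()}"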
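
load_in_8bit=True only works when the bitsandbytes package is installed, and once the weights are quantized the torch_dtype=torch.float16 argument largely stops applying to them. Recent transformers releases also deprecate the bare kwarg in favor of BitsAndBytesConfig. A hedged sketch of the equivalent call, not what the commit itself does; note that an 8-bit 7B checkpoint is on the order of 7 GB of weights, so the 6GiB cap may still push some layers to CPU via device_map:

    import os
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        token=os.environ.get("HUGGINGFACE_TOKEN"),  # gated repo, same env var as app.py
        device_map="auto",
        low_cpu_mem_usage=True,
        max_memory={0: "6GiB"},
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # requires bitsandbytes
    )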
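
Two of the new generation flags deserve a caveat. In transformers, early_stopping is a beam-search control; with do_sample=True and num_beams left at its default of 1 it has no effect, and newer releases warn that it is ignored. And because return_full_text=False was removed, outputs[0]['generated_text'] now includes the prompt as a prefix unless downstream code strips it. The sampling knobs that do matter can be exercised against any small checkpoint; a sketch using the public sshleifer/tiny-gpt2 model as a stand-in, not the app's Llama-2:

    from transformers import pipeline

    gen = pipeline("text-generation", model="sshleifer/tiny-gpt2")  # tiny stand-in model
    out = gen(
        "Headline:",
        max_new_tokens=16,
        do_sample=True,          # pure sampling; num_beams stays at 1
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.2,
        num_return_sequences=1,
        # early_stopping=True would only change behavior with num_beams > 1
    )
    print(out[0]["generated_text"])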
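
Swapping Whisper "base" for "tiny" trades transcription accuracy for a faster download and a smaller VRAM footprint, and download_root pins the checkpoint cache to /tmp/whisper, a path that is writable on Spaces. Usage is otherwise unchanged; a short sketch, with a hypothetical input file:

    import whisper

    model = whisper.load_model("tiny", device="cuda", download_root="/tmp/whisper")
    result = model.transcribe("sample.wav")  # hypothetical input path
    print(result["text"])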
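
The new reset_models drops every attribute with del before clearing the CUDA cache, which matters because empty_cache can only return memory whose tensors are already unreachable. A common extension, shown here as an assumption rather than something this commit does, is a gc.collect() in between to break reference cycles that would otherwise keep GPU tensors alive:

    import gc
    import torch

    class Holder:
        def reset(self):
            self.model = None              # drop the references first
            self.tokenizer = None
            gc.collect()                   # collect cycles still pinning GPU tensors (addition)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()   # release cached blocks back to the driver
                torch.cuda.synchronize()   # wait for in-flight kernels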
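
The new token budget in generate_news assumes roughly 1.5 generated tokens per requested word (a rough heuristic for Llama-2's tokenizer, not a property of the model) and caps the result at 512 to match the shortened pipeline and tokenizer limits. Worked out:

    def token_budget(size_words: int, cap: int = 512) -> int:
        # ~1.5 tokens per English word is a rough heuristic
        return min(int(size_words * 1.5), cap)

    assert token_budget(200) == 300   # below the cap
    assert token_budget(400) == 512   # int(600) is clipped to the cap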