CamiloVega committed
Commit c59e337 · verified · 1 parent: 951c395

Update app.py

Files changed (1):
  1. app.py +48 -27
app.py CHANGED
@@ -40,9 +40,9 @@ class ModelManager:
         self.whisper_model = None
         self._initialized = True

-    @spaces.GPU(duration=60)
+    @spaces.GPU(duration=120)
     def initialize_models(self):
-        """Initialize models with Zero GPU optimizations"""
+        """Initialize models with optimized settings"""
         try:
             # Get HuggingFace token
             HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
@@ -52,18 +52,19 @@ class ModelManager:
             logger.info("Starting model initialization...")
             model_name = "meta-llama/Llama-2-7b-chat-hf"

-            # Load tokenizer
+            # Load tokenizer with optimized settings
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 token=HUGGINGFACE_TOKEN,
-                use_fast=False
+                use_fast=True,
+                model_max_length=512
             )
             if self.tokenizer is None:
                 raise RuntimeError("Failed to initialize tokenizer")
             self.tokenizer.pad_token = self.tokenizer.eos_token

-            # Load model with specific GPU memory settings
+            # Load model with optimized memory settings
             logger.info("Loading model...")
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
@@ -71,12 +72,13 @@ class ModelManager:
                 torch_dtype=torch.float16,
                 device_map="auto",
                 low_cpu_mem_usage=True,
-                max_memory={0: "8GiB"}
+                max_memory={0: "6GiB"},
+                load_in_8bit=True
             )
             if self.model is None:
                 raise RuntimeError("Failed to initialize model")

-            # Create pipeline
+            # Create pipeline with optimized settings
             logger.info("Creating pipeline...")
             self.news_generator = pipeline(
                 "text-generation",
@@ -84,18 +86,24 @@ class ModelManager:
                 tokenizer=self.tokenizer,
                 device_map="auto",
                 torch_dtype=torch.float16,
-                max_length=2048,
+                max_new_tokens=512,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.95,
-                repetition_penalty=1.2
+                repetition_penalty=1.2,
+                num_return_sequences=1,
+                early_stopping=True
             )
             if self.news_generator is None:
                 raise RuntimeError("Failed to initialize news generator pipeline")

-            # Load Whisper model
+            # Load Whisper model with optimized settings
             logger.info("Loading Whisper model...")
-            self.whisper_model = whisper.load_model("base", device="cuda")
+            self.whisper_model = whisper.load_model(
+                "tiny",
+                device="cuda",
+                download_root="/tmp/whisper"
+            )
             if self.whisper_model is None:
                 raise RuntimeError("Failed to initialize Whisper model")

@@ -108,15 +116,25 @@ class ModelManager:
             raise

     def reset_models(self):
-        """Reset all models to None"""
-        self.tokenizer = None
-        self.model = None
-        self.news_generator = None
-        self.whisper_model = None
-
-        # Clear CUDA cache
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+        """Reset all models and clear GPU memory"""
+        try:
+            del self.tokenizer
+            del self.model
+            del self.news_generator
+            del self.whisper_model
+
+            self.tokenizer = None
+            self.model = None
+            self.news_generator = None
+            self.whisper_model = None
+
+            # Clear CUDA cache
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+
+        except Exception as e:
+            logger.error(f"Error during model reset: {str(e)}")

     def check_models_initialized(self):
         """Check if all models are properly initialized"""
@@ -184,7 +202,7 @@ def preprocess_audio(audio_file):
         logger.error(f"Error preprocessing audio: {str(e)}")
         raise

-@spaces.GPU(duration=60)
+@spaces.GPU(duration=120)
 def transcribe_audio(file):
     """Transcribe an audio or video file."""
     try:
@@ -262,7 +280,7 @@ def process_social_content(url):
         logger.error(f"Error processing social content: {str(e)}")
         return None

-@spaces.GPU(duration=60)
+@spaces.GPU(duration=120)
 def generate_news(instructions, facts, size, tone, *args):
     try:
         # Get initialized models
@@ -371,18 +389,21 @@ Follow these requirements:
 - Do not invent information
 - Be rigorous with the provided facts [/INST]"""

-        # Generate article with specific handling for Zero GPU
+        # Optimize size and max tokens
+        max_tokens = min(int(size * 1.5), 512)
+
+        # Generate article with optimized settings
         with torch.inference_mode():
             outputs = news_generator(
                 prompt,
-                max_new_tokens=min(int(size * 2), 1024),
-                return_full_text=False,
-                pad_token_id=tokenizer.eos_token_id,
+                max_new_tokens=max_tokens,
                 num_return_sequences=1,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.95,
-                repetition_penalty=1.2
+                repetition_penalty=1.2,
+                early_stopping=True,
+                pad_token_id=tokenizer.eos_token_id
             )

         news_article = outputs[0]['generated_text']
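
The @spaces.GPU decorator is how ZeroGPU Spaces borrow a GPU: the decorated function runs with CUDA attached for at most "duration" seconds, so raising the budget from 60 to 120 gives the heavier calls (quantized model loading, generation) more headroom. A minimal sketch of the pattern, assuming the spaces package that ships on ZeroGPU hardware; gpu_probe is a hypothetical function, not one from app.py:

    import spaces
    import torch

    @spaces.GPU(duration=120)  # GPU is attached for at most 120 s per call
    def gpu_probe() -> str:
        # CUDA is only guaranteed to be available inside a @spaces.GPU call
        return f"cuda available: {torch.cuda.is_available()}"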
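
load_in_8bit=True only works when the bitsandbytes package is installed, and once the weights are quantized the torch_dtype=torch.float16 argument largely stops applying to them. Recent transformers releases also deprecate the bare kwarg in favor of BitsAndBytesConfig. A hedged sketch of the equivalent call, not what the commit itself does; note that an 8-bit 7B checkpoint is on the order of 7 GB of weights, so the 6GiB cap may still push some layers to CPU via device_map:

    import os
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        token=os.environ.get("HUGGINGFACE_TOKEN"),  # gated repo, same env var as app.py
        device_map="auto",
        low_cpu_mem_usage=True,
        max_memory={0: "6GiB"},
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # requires bitsandbytes
    )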
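
Two of the new generation flags deserve a caveat. In transformers, early_stopping is a beam-search control; with do_sample=True and num_beams left at its default of 1 it has no effect, and newer releases warn that it is ignored. And because return_full_text=False was removed, outputs[0]['generated_text'] now includes the prompt as a prefix unless downstream code strips it. The sampling knobs that do matter can be exercised against any small checkpoint; a sketch using the public sshleifer/tiny-gpt2 model as a stand-in, not the app's Llama-2:

    from transformers import pipeline

    gen = pipeline("text-generation", model="sshleifer/tiny-gpt2")  # tiny stand-in model
    out = gen(
        "Headline:",
        max_new_tokens=16,
        do_sample=True,          # pure sampling; num_beams stays at 1
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.2,
        num_return_sequences=1,
        # early_stopping=True would only change behavior with num_beams > 1
    )
    print(out[0]["generated_text"])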
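
Swapping Whisper "base" for "tiny" trades transcription accuracy for a faster download and a smaller VRAM footprint, and download_root pins the checkpoint cache to /tmp/whisper, a path that is writable on Spaces. Usage is otherwise unchanged; a short sketch, with a hypothetical input file:

    import whisper

    model = whisper.load_model("tiny", device="cuda", download_root="/tmp/whisper")
    result = model.transcribe("sample.wav")  # hypothetical input path
    print(result["text"])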
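
The new reset_models drops every attribute with del before clearing the CUDA cache, which matters because empty_cache can only return memory whose tensors are already unreachable. A common extension, shown here as an assumption rather than something this commit does, is a gc.collect() in between to break reference cycles that would otherwise keep GPU tensors alive:

    import gc
    import torch

    class Holder:
        def reset(self):
            self.model = None              # drop the references first
            self.tokenizer = None
            gc.collect()                   # collect cycles still pinning GPU tensors (addition)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()   # release cached blocks back to the driver
                torch.cuda.synchronize()   # wait for in-flight kernels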
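
The new token budget in generate_news assumes roughly 1.5 generated tokens per requested word (a rough heuristic for Llama-2's tokenizer, not a property of the model) and caps the result at 512 to match the shortened pipeline and tokenizer limits. Worked out:

    def token_budget(size_words: int, cap: int = 512) -> int:
        # ~1.5 tokens per English word is a rough heuristic
        return min(int(size_words * 1.5), cap)

    assert token_budget(200) == 300   # below the cap
    assert token_budget(400) == 512   # int(600) is clipped to the cap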