rootxhacker committed on
Commit 2d8cff9 · verified · 1 Parent(s): cc6dd78

Update app.py

Files changed (1): app.py (+58, -33)
app.py CHANGED
@@ -13,7 +13,7 @@ import random
 import numpy as np
 import re
 import time
-from typing import List, Tuple, Generator
+from typing import List, Tuple
 import os
 import gc
 import spaces
@@ -21,14 +21,8 @@ import spaces
 # Global model variables for memory efficiency
 tokenizer = None
 model = None
-current_generator = None
 device = None

-def get_noising_schedule(i, max_it, sharpness=5.0):
-    """Exponential noise schedule for denoising"""
-    x = i / max_it
-    return (np.exp(-sharpness * x) - np.exp(-sharpness)) / (1 - np.exp(-sharpness))
-
 class ARDiffusionGenerator:
     """Base AR-Diffusion generator with shared functionality"""

@@ -58,7 +52,7 @@ class ARDiffusionGenerator:
     """

 class QualityGenerator(ARDiffusionGenerator):
-    """Quality-focused AR-Diffusion generator (from first script)"""
+    """Quality-focused AR-Diffusion generator"""

     def filter_logits(self, logits: torch.Tensor, top_k: int = 0, top_p: float = 1.0,
                       temperature: float = 1.0) -> torch.Tensor:
@@ -194,8 +188,6 @@ class QualityGenerator(ARDiffusionGenerator):
         start_time = time.time()

         for step in range(steps):
-            step_start = time.time()
-
             if progress_callback:
                 progress = 0.2 + (step / steps) * 0.7
                 elapsed = time.time() - start_time
@@ -222,7 +214,6 @@
             max_replacements = min(3, len(mask_positions))

             sorted_positions = sorted(mask_positions.tolist())
-            step_replacements = 0

             for pos in sorted_positions[:max_replacements]:
                 if pos < len(logits):
@@ -257,7 +248,6 @@
                         break

                     current_ids[pos] = new_token
-                    step_replacements += 1
                     total_replacements += 1

             if progress_callback:
@@ -307,7 +297,7 @@
         return response

 class SpeedGenerator(ARDiffusionGenerator):
-    """Speed-focused AR-Diffusion generator (from second script)"""
+    """Speed-focused AR-Diffusion generator"""

     def filter_logits(self, logits: torch.Tensor, top_k: int = 15, top_p: float = 0.8,
                       temperature: float = 1.0) -> torch.Tensor:
@@ -425,8 +415,6 @@
         # Use mixed precision for speed on GPU
         with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=self.device.type == 'cuda'):
             for step in range(steps):
-                step_start = time.time()
-
                 if progress_callback:
                     progress = 0.2 + (step / steps) * 0.7
                     elapsed = time.time() - start_time
@@ -448,7 +436,6 @@
                 max_replace = min(8, len(mask_pos))
                 positions = sorted(mask_pos.tolist())[:max_replace]

-                step_replacements = 0
                 for pos in positions:
                     if pos < len(logits):
                         token_logits = logits[pos].clone()
@@ -475,7 +462,6 @@
                         new_token = top_indices[1].item()

                     current_ids[pos] = new_token
-                    step_replacements += 1
                     total_replacements += 1

                 if progress_callback:
@@ -519,21 +505,61 @@

         return response

-{device}...")
-
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
-        device_map="auto" if device.type == "cuda" else None,
-        trust_remote_code=True,
-        low_cpu_mem_usage=True
-    )
-
-    return tokenizer, model, device
+@spaces.GPU
+def load_model():
+    """Load model with Zero GPU optimization using @spaces.GPU"""
+    global tokenizer, model, device
+
+    if tokenizer is not None and model is not None:
+        return tokenizer, model, device
+
+    try:
+        # This appears to be a LoRA adapter
+        adapter_path = "rootxhacker/llama-3B-diffusion-exp-fixed"
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        print(f"Loading AR-Diffusion model on {device}...")
+
+        # Load tokenizer from adapter
+        tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        # Load the adapter model
+        print("Loading adapter model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            adapter_path,
+            torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
+            device_map="auto" if device.type == "cuda" else None,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True
+        )
+
+        print("✅ AR-Diffusion model loaded successfully!")
+        return tokenizer, model, device
+
+    except Exception as e:
+        print(f"❌ Error loading {adapter_path}: {e}")
+
+        # Fallback to a working model for demonstration
+        print("🔄 Falling back to demonstration model...")
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        fallback_model = "gpt2-medium"
+
+        tokenizer = AutoTokenizer.from_pretrained(fallback_model)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        model = AutoModelForCausalLM.from_pretrained(
+            fallback_model,
+            torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
+            device_map="auto" if device.type == "cuda" else None,
+            low_cpu_mem_usage=True
+        )
+
+        print(f"✅ Fallback model {fallback_model} loaded successfully!")
+        print("⚠️ Note: Using fallback model - AR-Diffusion features may not work as expected")
+        return tokenizer, model, device

 def cleanup_memory():
     """Clean up GPU memory"""
@@ -563,7 +589,6 @@ def chat_function(message, history, mode, progress=gr.Progress()):
     # Generate response with progress callback
     def progress_callback(pct, status_msg):
         progress(pct)
-        # We'll show status in the performance display instead

     response, stats = generator.generate(message, progress_callback)

@@ -711,11 +736,11 @@ if __name__ == "__main__":
         show_error=True
     )

-# Updated requirements.txt should include:
+# Requirements.txt should include:
 # torch>=2.0.0
 # transformers>=4.30.0
 # gradio
 # numpy
 # accelerate
 # spaces
-# peft  # For LoRA adapter support
+# peft
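Note: the `get_noising_schedule` helper removed by this commit is a normalized exponential decay: it evaluates to 1.0 at step 0 and 0.0 at the final step, with `sharpness` controlling how front-loaded the decay is. A minimal standalone sketch (the function body is copied verbatim from the deleted lines; the demo print is illustrative only):

import numpy as np

def get_noising_schedule(i, max_it, sharpness=5.0):
    """Exponential noise schedule for denoising: 1.0 at i=0, 0.0 at i=max_it"""
    x = i / max_it
    return (np.exp(-sharpness * x) - np.exp(-sharpness)) / (1 - np.exp(-sharpness))

# Larger sharpness values concentrate the decay in the early steps.
print([round(get_noising_schedule(i, 10), 3) for i in range(11)])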
 
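The new `load_model` is written to be called lazily and repeatedly: the early-return guard means only the first call pays the load cost (falling back to gpt2-medium if the adapter fails to load), and later calls reuse the cached globals. A minimal usage sketch under that assumption:

# First call loads rootxhacker/llama-3B-diffusion-exp-fixed (or the fallback);
# subsequent calls hit the `tokenizer is not None and model is not None` guard
# and return the already-populated globals immediately.
tokenizer, model, device = load_model()
tok2, mdl2, _ = load_model()  # cheap: returns cached objects
assert mdl2 is model and tok2 is tokenizer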