mamkkl committed on
Commit f2a1799 · verified · 1 Parent(s): 5c3469e

Update app.py

Files changed (1): app.py (+22, -17)
app.py CHANGED
@@ -15,22 +15,6 @@ print(accelerate.__version__) # Should be >= 0.12.0
 num_gpus = torch.cuda.device_count()
 print(f"Number of available GPUs: {num_gpus}")
 
-# List details for each GPU
-for i in range(num_gpus):
-    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
-    print(f" Total Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
-    print(f" CUDA Capability: {torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}")
-for i in range(num_gpus):
-    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
-    print(f" Allocated Memory: {torch.cuda.memory_allocated(i) / 1e9:.2f} GB")
-    print(f" Cached Memory: {torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
-    print(f" Free Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9 - torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
-
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True, # Enable 8-bit quantization
-    llm_int8_enable_fp32_cpu_offload=True # Enable FP32 CPU offloading
-)
-
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
@@ -71,6 +55,24 @@ PROMPT_DICT = {
 }
 model = None
 tokenizer = None
+
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True, # Enable 8-bit quantization
+    llm_int8_enable_fp32_cpu_offload=True # Enable FP32 CPU offloading
+)
+
+def print_resources():
+    # List details for each GPU
+    for i in range(num_gpus):
+        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+        print(f" Total Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
+        print(f" CUDA Capability: {torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}")
+    for i in range(num_gpus):
+        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+        print(f" Allocated Memory: {torch.cuda.memory_allocated(i) / 1e9:.2f} GB")
+        print(f" Cached Memory: {torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
+        print(f" Free Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9 - torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
+
 def generate_prompt(instruction, input=None):
     if input:
         return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
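
Note on the new helper: the "Free Memory" figure is computed as total memory minus memory_reserved(), which only reflects PyTorch's caching allocator and can overstate what is actually free on a shared device. A minimal alternative sketch (not part of this commit) using torch.cuda.mem_get_info, which reports driver-level free/total bytes; the function name is illustrative:

import torch

def print_free_memory():
    # Driver-level view: free/total bytes per device, independent of
    # PyTorch's caching allocator.
    for i in range(torch.cuda.device_count()):
        free, total = torch.cuda.mem_get_info(i)
        print(f"GPU {i}: {free / 1e9:.2f} GB free of {total / 1e9:.2f} GB")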
@@ -118,6 +120,7 @@ def loadModel():
     # )
     tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
     tokenizer.pad_token = tokenizer.unk_token
+    print_resources()
     return model, tokenizer
 
 model, tokenizer = loadModel()
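
The relocated quantization_config is presumably consumed where loadModel() builds the model, which this diff shows only in part (the commented-out "# )" context line). A minimal sketch of how such a config is typically passed through the transformers API; base_model and cache_dir come from the surrounding file, everything else here is illustrative rather than the commit's actual code:

from transformers import AutoModelForCausalLM

# Sketch only: GPU layers load in 8-bit; llm_int8_enable_fp32_cpu_offload lets
# any layers that device_map places on the CPU remain in FP32.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quantization_config,
    device_map="auto",
    cache_dir=cache_dir,
)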
@@ -130,11 +133,13 @@ def respond(
     max_tokens,
     temperature,
     top_p,
-):
+):
     ins_f = generate_prompt(message,None)
     inputs = tokenizer(ins_f, return_tensors="pt")
+    print_resources()
    input_ids = inputs["input_ids"].cuda()
     max_new_tokens = 512
+    print_resources()
     generation_config = GenerationConfig(
         temperature=0.1,
         top_p=0.75,
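
The remainder of respond() falls outside this diff's context. A minimal sketch of how generation typically continues from the GenerationConfig above; input_ids, max_new_tokens, model and tokenizer are the names used in the hunk, the rest is illustrative:

with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        max_new_tokens=max_new_tokens,
    )
# Decode only the tokens generated after the prompt.
response = tokenizer.decode(
    generation_output[0][input_ids.shape[1]:], skip_special_tokens=True
)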
 