Update app.py
app.py CHANGED
@@ -15,22 +15,6 @@ print(accelerate.__version__) # Should be >= 0.12.0
 num_gpus = torch.cuda.device_count()
 print(f"Number of available GPUs: {num_gpus}")
 
-# List details for each GPU
-for i in range(num_gpus):
-    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
-    print(f" Total Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
-    print(f" CUDA Capability: {torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}")
-for i in range(num_gpus):
-    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
-    print(f" Allocated Memory: {torch.cuda.memory_allocated(i) / 1e9:.2f} GB")
-    print(f" Cached Memory: {torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
-    print(f" Free Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9 - torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
-
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True, # Enable 8-bit quantization
-    llm_int8_enable_fp32_cpu_offload=True # Enable FP32 CPU offloading
-)
-
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
@@ -71,6 +55,24 @@ PROMPT_DICT = {
 }
 model = None
 tokenizer = None
+
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True, # Enable 8-bit quantization
+    llm_int8_enable_fp32_cpu_offload=True # Enable FP32 CPU offloading
+)
+
+def print_resources():
+    # List details for each GPU
+    for i in range(num_gpus):
+        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+        print(f" Total Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
+        print(f" CUDA Capability: {torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}")
+    for i in range(num_gpus):
+        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+        print(f" Allocated Memory: {torch.cuda.memory_allocated(i) / 1e9:.2f} GB")
+        print(f" Cached Memory: {torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
+        print(f" Free Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9 - torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
+
 def generate_prompt(instruction, input=None):
     if input:
         return PROMPT_DICT["prompt_input"].format(instruction=instruction,input=input)
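Note: this commit only builds `quantization_config` at module level; the hunks shown here do not include the line that consumes it. In a typical transformers + bitsandbytes setup it would be passed to AutoModelForCausalLM.from_pretrained inside loadModel(), roughly as in the sketch below. The device_map="auto" placement and the exact keyword set are assumptions for illustration, not lines from this diff; base_model and cache_dir are the same names the tokenizer call uses.

# Hypothetical sketch (not part of this commit) of how loadModel() might
# consume quantization_config; the real model-loading call is not visible
# in these hunks.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    base_model,                               # same checkpoint id the tokenizer uses
    quantization_config=quantization_config,  # 8-bit weights via bitsandbytes
    device_map="auto",                        # assumption: let accelerate place layers across GPU/CPU
    cache_dir=cache_dir,
)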
@@ -118,6 +120,7 @@ def loadModel():
     # )
     tokenizer = AutoTokenizer.from_pretrained(base_model,use_fast=False,cache_dir=cache_dir)
     tokenizer.pad_token = tokenizer.unk_token
+    print_resources()
     return model, tokenizer
 
 model, tokenizer = loadModel()
@@ -130,11 +133,13 @@ def respond(
     max_tokens,
     temperature,
     top_p,
-):
+):
     ins_f = generate_prompt(message,None)
     inputs = tokenizer(ins_f, return_tensors="pt")
+    print_resources()
     input_ids = inputs["input_ids"].cuda()
     max_new_tokens = 512
+    print_resources()
     generation_config = GenerationConfig(
         temperature=0.1,
         top_p=0.75,
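The last hunk stops partway through respond(), so the rest of the generation path is not visible in this commit. With transformers, a typical continuation would feed the GenerationConfig being built here into model.generate, roughly as sketched below; the argument list and the decoding step are assumptions for illustration, not lines from app.py.

# Hypothetical continuation of respond(); not part of this commit.
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,                  # prompt tokens moved to GPU above
        generation_config=generation_config,  # temperature=0.1, top_p=0.75, ...
        max_new_tokens=max_new_tokens,        # 512, set a few lines earlier
    )
output = tokenizer.decode(generation_output[0], skip_special_tokens=True)
return output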