Wedyan2023 committed on
Commit b51ed4e · verified · 1 Parent(s): 0b8c1dd

Update app104.py

Files changed (1)
  1. app104.py +130 -60
app104.py CHANGED
@@ -31,73 +31,73 @@ client = OpenAI(
  # from transformers import AutoModelForCausalLM, AutoTokenizer
  # import torch

- # Model selection dropdown
- selected_model = st.selectbox(
- "Select Model",
- ["meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
- "meta-llama/Llama-3.3-70B-Instruct",
- "meta-llama/Llama-3.2-3B-Instruct",
- "meta-llama/Llama-4-Scout-17B-16E-Instruct",
- "meta-llama/Meta-Llama-3-8B-Instruct",
- "meta-llama/Llama-3.1-70B-Instruct"],
- key='model_select'
- )

- @st.cache_resource # Cache the model to prevent reloading
- def load_model(model_name):
- try:
- # Optimized model loading configuration
- model = AutoModelForCausalLM.from_pretrained(
- model_name,
- torch_dtype=torch.float16, # Use half precision
- device_map="auto", # Automatic device mapping
- load_in_8bit=True, # Enable 8-bit quantization
- low_cpu_mem_usage=True, # Optimize CPU memory usage
- max_memory={0: "10GB"} # Limit GPU memory usage
- )

- tokenizer = AutoTokenizer.from_pretrained(
- model_name,
- padding_side="left",
- truncation_side="left"
- )

- return model, tokenizer

- except Exception as e:
- st.error(f"Error loading model: {str(e)}")
- return None, None
-
- # Load the selected model with optimizations
- if selected_model:
- model, tokenizer = load_model(selected_model)
-
- # Check if model loaded successfully
- if model is not None:
- st.success(f"Successfully loaded {selected_model}")
- else:
- st.warning("Please select a different model or check your hardware capabilities")
-
- # Function to generate text
- def generate_response(prompt, model, tokenizer):
- try:
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

- with torch.no_grad():
- outputs = model.generate(
- inputs["input_ids"],
- max_length=256,
- num_return_sequences=1,
- temperature=0.7,
- do_sample=True,
- pad_token_id=tokenizer.pad_token_id
- )

- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
- return response

- except Exception as e:
- return f"Error generating response: {str(e)}"
+ # # Model selection dropdown
+ # selected_model = st.selectbox(
+ # "Select Model",
+ # ["meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+ # "meta-llama/Llama-3.3-70B-Instruct",
+ # "meta-llama/Llama-3.2-3B-Instruct",
+ # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+ # "meta-llama/Meta-Llama-3-8B-Instruct",
+ # "meta-llama/Llama-3.1-70B-Instruct"],
+ # key='model_select'
+ # )

+ # @st.cache_resource # Cache the model to prevent reloading
+ # def load_model(model_name):
+ # try:
+ # # Optimized model loading configuration
+ # model = AutoModelForCausalLM.from_pretrained(
+ # model_name,
+ # torch_dtype=torch.float16, # Use half precision
+ # device_map="auto", # Automatic device mapping
+ # load_in_8bit=True, # Enable 8-bit quantization
+ # low_cpu_mem_usage=True, # Optimize CPU memory usage
+ # max_memory={0: "10GB"} # Limit GPU memory usage
+ # )

+ # tokenizer = AutoTokenizer.from_pretrained(
+ # model_name,
+ # padding_side="left",
+ # truncation_side="left"
+ # )

+ # return model, tokenizer

+ # except Exception as e:
+ # st.error(f"Error loading model: {str(e)}")
+ # return None, None
+
+ # # Load the selected model with optimizations
+ # if selected_model:
+ # model, tokenizer = load_model(selected_model)
+
+ # # Check if model loaded successfully
+ # if model is not None:
+ # st.success(f"Successfully loaded {selected_model}")
+ # else:
+ # st.warning("Please select a different model or check your hardware capabilities")
+
+ # # Function to generate text
+ # def generate_response(prompt, model, tokenizer):
+ # try:
+ # inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

+ # with torch.no_grad():
+ # outputs = model.generate(
+ # inputs["input_ids"],
+ # max_length=256,
+ # num_return_sequences=1,
+ # temperature=0.7,
+ # do_sample=True,
+ # pad_token_id=tokenizer.pad_token_id
+ # )

+ # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ # return response

+ # except Exception as e:
+ # return f"Error generating response: {str(e)}"
  ############################################################

  ####new
@@ -251,7 +251,77 @@ with st.sidebar:
  # key='model_select'
  # )

+ #################new oooo
+
+ # Model selection dropdown
+ selected_model = st.selectbox(
+ "Select Model",
+ [#"meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+ "meta-llama/Llama-3.2-3B-Instruct",
+ "meta-llama/Llama-3.3-70B-Instruct",
+ "meta-llama/Llama-3.2-3B-Instruct",
+ "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+ "meta-llama/Meta-Llama-3-8B-Instruct",
+ "meta-llama/Llama-3.1-70B-Instruct"],
+ key='model_select'
+ )
+
+ @st.cache_resource # Cache the model to prevent reloading
+ def load_model(model_name):
+ try:
+ # Optimized model loading configuration
+ model = AutoModelForCausalLM.from_pretrained(
+ model_name,
+ torch_dtype=torch.float16, # Use half precision
+ device_map="auto", # Automatic device mapping
+ load_in_8bit=True, # Enable 8-bit quantization
+ low_cpu_mem_usage=True, # Optimize CPU memory usage
+ max_memory={0: "10GB"} # Limit GPU memory usage
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_name,
+ padding_side="left",
+ truncation_side="left"
+ )
+
+ return model, tokenizer
+
+ except Exception as e:
+ st.error(f"Error loading model: {str(e)}")
+ return None, None
+
+ # Load the selected model with optimizations
+ if selected_model:
+ model, tokenizer = load_model(selected_model)
+
+ # Check if model loaded successfully
+ if model is not None:
+ st.success(f"Successfully loaded {selected_model}")
+ else:
+ st.warning("Please select a different model or check your hardware capabilities")

+ # Function to generate text
+ def generate_response(prompt, model, tokenizer):
+ try:
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+
+ with torch.no_grad():
+ outputs = model.generate(
+ inputs["input_ids"],
+ max_length=256,
+ num_return_sequences=1,
+ temperature=0.7,
+ do_sample=True,
+ pad_token_id=tokenizer.pad_token_id
+ )
+
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ return response
+
+ except Exception as e:
+ return f"Error generating response: {str(e)}"
+ ################

  # model = AutoModelForCausalLM.from_pretrained(
  # "meta-llama/Meta-Llama-3-8B-Instruct",