ryanpdwyer committed on
Commit 7592fb5 · 1 Parent(s): 3618983

Switched back to running on GPU

Files changed (1)
  1. app.py  +2 -5
app.py CHANGED
@@ -6,8 +6,7 @@ import torch
 @st.cache_resource
 def load_model_and_tokenizer(model_name):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Load the model in 8-bit quantization
-    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", load_in_8bit=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name)
     return model, tokenizer
 
 model_8b, tokenizer_8b = load_model_and_tokenizer("huggyllama/llama-3.1-8b")
@@ -19,15 +18,13 @@ def generate_text(model, tokenizer, prompt, max_length=100):
     outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-st.title("LLaMA-3.1-8B vs LLaMA-3.1-8B-Instruct Comparison (CPU Version)")
+st.title("LLaMA-3.1-8B vs LLaMA-3.1-8B-Instruct Comparison")
 
 prompt = st.text_area("Enter your prompt:", height=100)
 max_length = st.slider("Max output length:", min_value=50, max_value=500, value=100)
 
 if st.button("Generate"):
     if prompt:
-        st.warning("Generation may take several minutes. Please be patient.")
-
         col1, col2 = st.columns(2)
 
         with col1:
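Note on the change: the updated from_pretrained call simply drops device_map="cpu" and load_in_8bit=True, so device placement is left to the Space's runtime rather than being pinned to the CPU. For reference, a minimal sketch of loading the same model onto a GPU explicitly; device_map="auto" and torch.float16 are illustrative assumptions and not part of this commit:

    # Sketch only: explicit GPU placement for the same model load.
    # device_map="auto" and torch.float16 are illustrative assumptions, not taken from this commit.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "huggyllama/llama-3.1-8b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",          # let accelerate spread layers across available GPUs
        torch_dtype=torch.float16,  # half precision so an 8B model fits in GPU memory
    )

With an explicit device map like this, tokenized inputs would still need to be moved to the model's device (e.g. inputs.to(model.device)) before calling generate.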