AIdeaText committed
Commit a16e1cf (verified)
1 Parent(s): 232b6e5

Update app.py

Files changed (1):
  1. app.py (+32 -34)
app.py CHANGED
@@ -1,35 +1,22 @@
 import streamlit as st
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-from huggingface_hub import login
-import os
 
-def init_huggingface():
-    """Initialize Hugging Face authentication either from secrets or user input"""
-    if 'HUGGING_FACE_TOKEN' not in st.session_state:
-        # First try to get from environment variable
-        token = os.getenv('HUGGINGFACE_TOKEN')
-
-        # If not in environment, check streamlit secrets
-        if not token and 'huggingface_token' in st.secrets:
-            token = st.secrets['huggingface_token']
-
-        # If still not found, ask user
-        if not token:
-            token = st.text_input('Enter your Hugging Face token:', type='password')
-            if not token:
-                st.warning('Please enter your Hugging Face token to proceed')
-                st.stop()
-
-        st.session_state['HUGGING_FACE_TOKEN'] = token
-
-    # Login to Hugging Face
-    login(st.session_state['HUGGING_FACE_TOKEN'])
-    return True
+# Check GPU availability at startup
+def check_gpu():
+    if torch.cuda.is_available():
+        gpu_info = {
+            "GPU Available": True,
+            "GPU Name": torch.cuda.get_device_name(0),
+            "Total Memory (GB)": round(torch.cuda.get_device_properties(0).total_memory / 1e9, 2),
+            "CUDA Version": torch.version.cuda
+        }
+        return gpu_info
+    return {"GPU Available": False}
 
 class LlamaDemo:
     def __init__(self):
-        self.model_name = "meta-llama/Llama-2-70b-chat-hf"
+        self.model_name = "meta-llama/Llama-2-7b-chat-hf"
         self._model = None
         self._tokenizer = None
 
@@ -38,10 +25,9 @@ class LlamaDemo:
         if self._model is None:
             self._model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
-                torch_dtype=torch.float16,
+                torch_dtype=torch.float16,  # Use float16 to save memory
                 device_map="auto",
-                trust_remote_code=True,
-                load_in_8bit=True  # To save memory
+                load_in_8bit=True  # 8-bit quantization to save memory
             )
         return self._model
 
@@ -55,7 +41,6 @@ class LlamaDemo:
         return self._tokenizer
 
     def generate_response(self, prompt: str, max_new_tokens: int = 512) -> str:
-        # Format prompt for Llama 2 chat
         formatted_prompt = f"[INST] {prompt} [/INST]"
 
         inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
@@ -71,21 +56,26 @@ class LlamaDemo:
             pad_token_id=self.tokenizer.eos_token_id
         )
 
+        # Free GPU memory after generation
+        torch.cuda.empty_cache()
+
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("[/INST]")[-1].strip()
 
 def main():
     st.set_page_config(
-        page_title="Llama 2 Demo",
+        page_title="Llama 2 Chat Demo",
         page_icon="🦙",
         layout="wide"
     )
 
     st.title("🦙 Llama 2 Chat Demo")
 
-    # Initialize Hugging Face authentication
-    if init_huggingface():
-        st.success("Successfully authenticated with Hugging Face!")
+    # Show GPU information
+    gpu_info = check_gpu()
+    with st.expander("💻 GPU Info", expanded=False):
+        for key, value in gpu_info.items():
+            st.write(f"{key}: {value}")
 
     # Initialize model
     if 'llama' not in st.session_state:
@@ -123,6 +113,14 @@ def main():
         st.error(f"Error: {str(e)}")
 
     with st.sidebar:
+        st.markdown("""
+        ### Memory Management
+        To optimize GPU usage and costs:
+        - Model runs in 8-bit precision
+        - Memory is cleared after each generation
+        - Space sleeps after inactivity
+        """)
+
         if st.button("Clear Chat History"):
             st.session_state.chat_history = []
             st.experimental_rerun()
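
Note on the 8-bit load path: passing load_in_8bit=True directly to from_pretrained relies on the bitsandbytes and accelerate packages being installed, and newer transformers releases expect the flag to be wrapped in a BitsAndBytesConfig passed as quantization_config. A minimal sketch of an equivalent load call, not part of this commit, with the model name taken from the diff:

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Equivalent 8-bit load using the explicit quantization config
    # (assumes bitsandbytes and accelerate are installed)
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )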