CamiloVega committed
Commit 2b8a955 · verified · 1 Parent(s): 89bb689

Update app.py

Files changed (1)
  1. app.py +32 -9
app.py CHANGED
@@ -12,7 +12,7 @@ logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
-logger = logging.getLogger(__name__)
+logger = logging.getLogger(_name_)
 
 # Get HuggingFace token from environment variable
 hf_token = os.environ.get('HUGGINGFACE_TOKEN')
@@ -94,9 +94,12 @@ def generate_response(user_input, chat_history):
         conversation_history = ""
         if chat_history:
             for message in chat_history:
-                conversation_history += f"{message[0]} {message[1]} "
+                # Remove any [INST] tags from the history
+                user_msg = message[0].replace("[INST]", "").replace("[/INST]", "").strip()
+                assistant_msg = message[1].replace("[INST]", "").replace("[/INST]", "").strip()
+                conversation_history += f"[INST] {user_msg} [/INST] {assistant_msg} "
 
-        prompt = f"[INST] {system_message}\n\n{conversation_history}{user_input}"
+        prompt = f"<s>[INST] {system_message}\n\n{conversation_history}[INST] {user_input} [/INST]"
 
         logger.info("Generating model response...")
         outputs = model_gen(
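Note on the prompt change in the hunk above: the new loop assembles a Llama 2 style instruction string from the stored history. As a rough standalone sketch (the history, system message, and user input below are hypothetical placeholders, not values from the app), the assembled prompt looks like this:

system_message = "You are a helpful assistant."   # hypothetical placeholder
chat_history = [["What is 2+2?", "4"]]            # hypothetical prior turn
user_input = "And 3+3?"

conversation_history = ""
for user_msg, assistant_msg in chat_history:
    # strip any [INST]/[/INST] tags that leaked into stored messages
    user_msg = user_msg.replace("[INST]", "").replace("[/INST]", "").strip()
    assistant_msg = assistant_msg.replace("[INST]", "").replace("[/INST]", "").strip()
    conversation_history += f"[INST] {user_msg} [/INST] {assistant_msg} "

prompt = f"<s>[INST] {system_message}\n\n{conversation_history}[INST] {user_input} [/INST]"
print(prompt)
# -> <s>[INST] You are a helpful assistant.
#
# [INST] What is 2+2? [/INST] 4 [INST] And 3+3? [/INST]

For comparison, Meta's reference Llama 2 chat template additionally wraps the system prompt in <<SYS>> ... <</SYS>> tags and closes each completed turn with </s>; the commit uses a simplified single-string variant.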
@@ -104,16 +107,38 @@ def generate_response(user_input, chat_history):
             max_new_tokens=256,
             return_full_text=False,
             pad_token_id=tokenizer.eos_token_id,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            repetition_penalty=1.1
         )
         logger.info("Model response generated successfully")
 
+        # Clean up the response by removing any [INST] tags and trimming
         assistant_response = outputs[0]['generated_text'].strip()
+        assistant_response = assistant_response.replace("[INST]", "").replace("[/INST]", "").strip()
+
+        # If the response is too short, try to generate a more detailed one
+        if len(assistant_response.split()) < 10:
+            prompt += "\nPlease provide a more detailed answer with context and explanation."
+            outputs = model_gen(
+                prompt,
+                max_new_tokens=256,
+                return_full_text=False,
+                pad_token_id=tokenizer.eos_token_id,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                repetition_penalty=1.1
+            )
+            assistant_response = outputs[0]['generated_text'].strip()
+            assistant_response = assistant_response.replace("[INST]", "").replace("[/INST]", "").strip()
 
         # Calculate water consumption for output
         output_water_consumption = calculate_water_consumption(assistant_response, False)
         total_water_consumption += output_water_consumption
 
-        # Update chat history with the new formatted messages
+        # Update chat history with the cleaned messages
         chat_history.append([user_input, assistant_response])
 
         # Prepare water consumption message
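Note on the added generation settings and retry: the same sampling parameters (do_sample, temperature, top_p, repetition_penalty) are passed to both calls, and the second call only fires when the first answer comes back shorter than 10 words. A minimal sketch of how this could be folded into one helper (illustrative only, not part of the commit; it assumes model_gen is the transformers text-generation pipeline and tokenizer is the matching Llama 2 tokenizer created earlier in app.py):

def generate_text(model_gen, tokenizer, prompt, min_words=10):
    gen_kwargs = dict(
        max_new_tokens=256,
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,          # sample instead of greedy decoding
        temperature=0.7,         # soften the next-token distribution
        top_p=0.9,               # nucleus sampling cutoff
        repetition_penalty=1.1,  # discourage verbatim repetition
    )
    text = model_gen(prompt, **gen_kwargs)[0]["generated_text"].strip()
    if len(text.split()) < min_words:
        # single retry with an explicit request for a longer answer
        retry_prompt = prompt + "\nPlease provide a more detailed answer with context and explanation."
        text = model_gen(retry_prompt, **gen_kwargs)[0]["generated_text"].strip()
    return text.replace("[INST]", "").replace("[/INST]", "").strip()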
@@ -138,7 +163,6 @@ def generate_response(user_input, chat_history):
         error_message = f"An error occurred: {str(e)}"
         chat_history.append([user_input, error_message])
         return chat_history, show_water
-
 
 # Constants for water consumption calculation
 WATER_PER_TOKEN = {
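Note on the water accounting: the diff calls calculate_water_consumption(assistant_response, False), with the boolean apparently distinguishing input text from output text, and the per-token figures live in the WATER_PER_TOKEN dictionary whose contents are not shown in this diff. A purely hypothetical sketch of that pattern (placeholder keys and zero values, not the app's actual numbers):

WATER_PER_TOKEN = {
    "input": 0.0,    # placeholder, water per input token
    "output": 0.0,   # placeholder, water per output token
}

def calculate_water_consumption(text, is_input=True):
    # crude token proxy; the real app may count tokens with the model tokenizer
    tokens = len(text.split())
    return tokens * WATER_PER_TOKEN["input" if is_input else "output"]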
@@ -275,10 +299,9 @@ try:
             </div>
             <div style="border-top: 1px solid #ddd; padding-top: 15px;">
                 <p style="color: #666; font-size: 14px;">
-                    <strong>Important note:</strong> This application uses Meta Llama-2-7b model
-                    instead of GPT-3 for availability and cost reasons. However,
-                    the water consumption calculations per token (input/output) are based on the
-                    conclusions from the cited paper.
+                    <strong>Important note:</strong> This application uses Meta's Llama 2 (7B parameters) model.
+                    The water consumption calculations per token (input/output) are based on the
+                    general conclusions from the cited paper about large language models.
                 </p>
             </div>
         </div>
 