Spaces:

YALCINKAYA
/

opsgenius3

Sleeping

App Files Files Community

YALCINKAYA committed on Oct 24, 2024

Commit

f4c3c98

1 Parent(s): cffec04

stop_sequences User: and Assistant:

Browse files

Files changed (1) hide show

app.py +31 -61

app.py CHANGED Viewed

@@ -31,47 +31,10 @@ def get_model_and_tokenizer(model_id):
     except Exception as e:
         print(f"Error loading model: {e}")
-def extract_relevant_text(response):
-    """
-    This function extracts the first complete 'user' and 'assistant' blocks
-    between <|im_start|> and <|im_end|> in the generated response.
-    If the tags are corrupted, it returns the text up to the first <|im_end|> tag.
-    """
-    # Regex to match content between <|im_start|> and <|im_end|> tags
-    pattern = re.compile(r"<\|im_start\|>(.*?)<\|im_end\|>", re.DOTALL)
-    matches = pattern.findall(response)
-    # Debugging: print the matches found
-    print("Matches found:", matches)
-    # If complete matches found, extract them
-    if len(matches) >= 2:
-        user_message = matches[0].strip()  # First <|im_start|> block
-        assistant_message = matches[1].strip()  # Second <|im_start|> block
-        return f"user: {user_message}\nassistant: {assistant_message}"
-    # If no complete blocks found, check for a partial extraction
-    if '<|im_end|>' in response:
-        # Extract everything before the first <|im_end|>
-        partial_response = response.split('<|im_end|>')[0].strip()
-        return f"{partial_response}"
     return "No complete blocks found. Please check the format of the response."
-def generate_response(user_input, model_id):
-    prompt = formatted_prompt(user_input)
-    global model, tokenizer
-    # Load the model and tokenizer if they are not already loaded or if the model_id has changed
-    if model is None or tokenizer is None or (model.config._name_or_path != model_id):
-        get_model_and_tokenizer(model_id)  # Load model and tokenizer
-    # Prepare the input tensors
-    inputs = tokenizer(prompt, return_tensors="pt")  # Move inputs to GPU if available
-    generation_config = GenerationConfig(
         # max_new_tokens=100,
         # min_length=5,
         # do_sample=False,
@@ -97,31 +60,38 @@ def generate_response(user_input, model_id):
         #pad_token_id=tokenizer.eos_token_id,
         #truncation=True,                       # Enable truncation for input sequences
-        penalty_alpha=0.6,           # Maintain this for balance
-        do_sample=True,               # Allow sampling for variability
-        top_k=3,                    # Reduce top_k to narrow down options
-        temperature=0.7,             # Keep this low for more deterministic responses
-        repetition_penalty=1.2,      # Keep this moderate to avoid repetitive responses
-        max_new_tokens=60,           # Maintain this limit
         pad_token_id=tokenizer.eos_token_id,
-        truncation=True,              # Enable truncation for longer prompts
-        )
-    try:
-        # Generate response
-        #outputs = model.generate(**inputs, generation_config=generation_config)
-        outputs = model.generate(**inputs, generation_config=generation_config)
-        #response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        #use the slicing method
-        response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True)
-        return extract_relevant_text(response)
-    except Exception as e:
-        print(f"Error generating response: {e}")
-        return "Error generating response."
 def formatted_prompt(question) -> str:
-    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"
 @app.route("/", methods=["GET"])
 def handle_get_request():

     except Exception as e:
         print(f"Error loading model: {e}")
     return "No complete blocks found. Please check the format of the response."
         # max_new_tokens=100,
         # min_length=5,
         # do_sample=False,
         #pad_token_id=tokenizer.eos_token_id,
         #truncation=True,                       # Enable truncation for input sequences
+        #penalty_alpha=0.6,           # Maintain this for balance
+        #do_sample=True,               # Allow sampling for variability
+        #top_k=3,                    # Reduce top_k to narrow down options
+        #temperature=0.7,             # Keep this low for more deterministic responses
+        #repetition_penalty=1.2,      # Keep this moderate to avoid repetitive responses
+        #max_new_tokens=60,           # Maintain this limit
+        #pad_token_id=tokenizer.eos_token_id,
+        #truncation=True,              # Enable truncation for longer prompts
+       #
+def generate_response(user_input):
+    prompt = formatted_prompt(user_input)
+    inputs = tokenizer([prompt], return_tensors="pt")
+    generation_config = GenerationConfig(
+        penalty_alpha=0.6,
+        do_sample=True,
+        top_k=5,
+        temperature=0.6,
+        repetition_penalty=1.2,
+        max_new_tokens=30,  # Adjust as necessary
         pad_token_id=tokenizer.eos_token_id,
+        stop_sequences=["User:", "Assistant:"],
+    )
+    outputs = model.generate(**inputs, generation_config=generation_config)
+    response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True)
+    return response.strip().split("Assistant:")[-1].strip()  # Get the part after 'Assistant:'
 def formatted_prompt(question) -> str:
+    return f"<|startoftext|>User: {question}\nAssistant:"
 @app.route("/", methods=["GET"])
 def handle_get_request():