yongdong committed on
Commit
9720765
·
1 Parent(s): 9ffc795

feat: streamline inference output to JSON-only

Browse files
Files changed (1) hide show
  1. app.py +6 -3
app.py CHANGED
@@ -109,7 +109,10 @@ def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
109
 
110
  try:
111
  # Format input
112
- formatted_prompt = f"### Human: {prompt.strip()}\n### Assistant:"
 
 
 
113
 
114
  # Encode input
115
  inputs = tokenizer(
@@ -138,8 +141,8 @@ def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
138
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
139
 
140
  # Extract generated part
141
- if "### Assistant:" in response:
142
- response = response.split("### Assistant:")[-1].strip()
143
  elif len(response) > len(formatted_prompt):
144
  response = response[len(formatted_prompt):].strip()
145
 
 
109
 
110
  try:
111
  # Format input
112
+ formatted_prompt = (
113
+ f"### Instruction:\n{prompt.strip()}\n\n"
114
+ "### Response:\n"
115
+ )
116
 
117
  # Encode input
118
  inputs = tokenizer(
 
141
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
142
 
143
  # Extract generated part
144
+ if "### Response:" in response:
145
+ response = response.split("### Response:")[-1].strip()
146
  elif len(response) > len(formatted_prompt):
147
  response = response[len(formatted_prompt):].strip()
148