Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -145,9 +145,31 @@ client = InferenceClient(
|
|
145 |
#demo.launch(show_api=True, share=True)
|
146 |
#demo.queue(concurrency_count=100, api_open=False).launch(show_api=True)
|
147 |
|
148 |
-
def query(
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
iface = gr.Interface(
|
153 |
query,
|
|
|
145 |
#demo.launch(show_api=True, share=True)
|
146 |
#demo.queue(concurrency_count=100, api_open=False).launch(show_api=True)
|
147 |
|
148 |
+
def query(system_prompt, user_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
    """Stream a text completion from the inference client.

    Builds a ``System: .../User: ...`` prompt, streams tokens from
    ``client.text_generation``, and yields the accumulated output after each
    token. If the accumulated output ends with one of the configured stop
    sequences, the stop sequence is stripped, the final text is yielded once,
    and the generator terminates.

    Args:
        system_prompt: Text placed in the ``System:`` slot of the prompt.
        user_prompt: Text placed in the ``User:`` slot of the prompt.
        temperature: Sampling temperature forwarded to the backend.
        max_new_tokens: Cap on generated tokens, forwarded to the backend.
        top_p: Nucleus-sampling cutoff forwarded to the backend.
        repetition_penalty: Repetition penalty forwarded to the backend.

    Yields:
        The generated text accumulated so far (trailing stop sequence removed).

    Returns:
        The final generated text (available as the ``StopIteration`` value).
    """
    seed = 42  # fixed seed -> identical inputs reproduce identical generations
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        # STOP_SEQUENCES is defined elsewhere in app.py; the backend stops on
        # these server-side, but may still include them in the streamed text.
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )
    prompt = f"System: {system_prompt}\nUser: {user_prompt}\n"
    stream = client.text_generation(
        prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )

    output = ""
    for response in stream:
        output += response.token.text
        # If a stop sequence has appeared, strip it and finish: previously the
        # code fell through to the unconditional yield below, emitting the same
        # value twice and allowing later tokens to be appended after trimming.
        for stop_str in STOP_SEQUENCES:
            if output.endswith(stop_str):
                output = output[: -len(stop_str)].rstrip()
                yield output
                return output
        yield output
    return output
|
173 |
|
174 |
iface = gr.Interface(
|
175 |
query,
|