Spaces:

id2223-SE
/

id2223-lab2

Sleeping

EPark25 committed on Nov 27, 2024

Commit

ccadd27

1 Parent(s): 7c47827

Committing until it works

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,10 +1,17 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 def respond(
@@ -27,16 +34,24 @@ def respond(
     response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
     ):
-        token = message.choices[0].delta.content
-        response += token
         yield response

 import gradio as gr
+from transformers import TextStreamer
+# Load model directly
+from transformers import AutoModel, AutoTokenizer
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
+model_name_or_path = "samlama111/lora_model"
+model = AutoModel.from_pretrained(model_name_or_path)
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
 def respond(
     response = ""
+    inputs = tokenizer.apply_chat_template(
+        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+    )
+    text_streamer = TextStreamer(tokenizer)
+    # TODO: The response does not stream at the moment
+    for message in model.generate(
+        input_ids=inputs, streamer=text_streamer, max_new_tokens=1024, use_cache=True
     ):
+        # Decode the tensor to a string
+        decoded_message = tokenizer.decode(message, skip_special_tokens=True)
+        # Manually getting the response
+        response = decoded_message.split("assistant")[
+            -1
+        ].strip()  # Extract only the assistant's response
+        print(response)
         yield response