genaforvena committed
Commit 1307336 · 1 Parent(s): 6384c62
Files changed (1)
  1. app.py +125 -20
app.py CHANGED
@@ -1,29 +1,134 @@
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from peft import PeftModel
  import torch

- base_model_name = "unsloth/Llama-3.2-1B-Instruct"
- base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

- tokenizer = AutoTokenizer.from_pretrained(base_model_name)

- peft_model_path = "genaforvena/huivam_finnegan_llama3.2-1b"
- model = PeftModel.from_pretrained(base_model, peft_model_path)

  device = "cuda" if torch.cuda.is_available() else "cpu"
- model.to(device)

  def reply(prompt):
-     print("prompt: " + prompt)
-     inputs = tokenizer.encode(prompt, return_tensors="pt")
-     print("tokenized")
-     output = model.generate(inputs, max_new_tokens=100, do_sample=True, top_p=0.95, top_k=50)
-     print("generated")
-     text = tokenizer.decode(output[0], skip_special_tokens=True)
-
-     print("text: " + text)
-     return text
-
- demo = gr.Interface(fn=reply, inputs="text", outputs="text")
- demo.launch()
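Note: the removed code above loaded the unsloth/Llama-3.2-1B-Instruct base model and attached genaforvena/huivam_finnegan_llama3.2-1b as a PEFT adapter, while the new code below loads that repo directly with AutoModelForCausalLM. Purely as a sketch of how such a setup can be collapsed into one loadable checkpoint (an assumption about the workflow, not part of this commit, and assuming a LoRA-style adapter), peft's merge_and_unload() bakes the adapter into the base weights:

# Hypothetical, not part of this commit: merge the adapter into the base model so the
# result can be loaded with AutoModelForCausalLM alone. (When peft is installed,
# transformers can also load an adapter repo directly, so merging is only one option.)
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
merged = PeftModel.from_pretrained(base, "genaforvena/huivam_finnegan_llama3.2-1b").merge_and_unload()
merged.save_pretrained("merged-checkpoint")  # hypothetical local output directory
AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct").save_pretrained("merged-checkpoint")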
  import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, GenerationConfig
  import torch
+ import threading
+ from queue import Queue

+ # Custom Streamer Class
+ class MyStreamer(TextStreamer):
+     def __init__(self, tokenizer, skip_prompt=True, **decode_kwargs):
+         super().__init__(tokenizer, skip_prompt, **decode_kwargs)
+         self.text_queue = Queue()
+         self.stop_signal = None
+         self.skip_special_tokens = decode_kwargs.pop("skip_special_tokens", True)  # Default to True
+         self.token_cache = []  # Add a token cache

+     def on_finalized_text(self, text, stream_end=False):
+         """Put the new text in the queue."""
+         self.text_queue.put(text)

+     def put(self, value):
+         """Decode the token and add to buffer."""
+         if len(value.shape) > 1 and value.shape[0] > 1:
+             raise ValueError("put() only supports a single sequence of tokens at a time.")
+         elif len(value.shape) > 1:
+             value = value[0]

+         if self.skip_prompt and self.next_tokens_are_prompt:
+             self.next_tokens_are_prompt = False
+             return
+
+         # Add the token to the cache
+         self.token_cache.extend(value.tolist())
+
+         # Decode the entire cache
+         text = self.tokenizer.decode(
+             self.token_cache,
+             skip_special_tokens=self.skip_special_tokens,
+             **self.decode_kwargs,
+         )
+
+         # Check for stop signal (e.g., end of text)
+         if self.stop_signal and text.endswith(self.stop_signal):
+             text = text[: -len(self.stop_signal)]
+             self.on_finalized_text(text, stream_end=True)
+             self.token_cache = []  # Clear the cache
+         else:
+             self.on_finalized_text(text, stream_end=False)
+
+     def end(self):
+         """Flush the buffer."""
+         if self.token_cache:
+             text = self.tokenizer.decode(
+                 self.token_cache,
+                 skip_special_tokens=self.skip_special_tokens,
+                 **self.decode_kwargs,
+             )
+             self.on_finalized_text(text, stream_end=True)
+             self.token_cache = []  # Clear the cache
+         else:
+             self.on_finalized_text("", stream_end=True)
+
+ # Load the model and tokenizer
+ model_name = "genaforvena/huivam_finnegan_llama3.2-1b"
+ model = None
+ tokenizer = None
+ try:
+     model = AutoModelForCausalLM.from_pretrained(model_name)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     print("Model and tokenizer loaded successfully.")
+ except Exception as e:
+     print(f"Error loading model/tokenizer: {e}")
+     exit()
+
+ # Move the model to the appropriate device
  device = "cuda" if torch.cuda.is_available() else "cpu"
+ if model:
+     model.to(device)
+     print(f"Model moved to {device}.")

+ # Function to generate a streaming response
  def reply(prompt):
+     messages = [{"role": "user", "content": prompt}]
+     try:
+         inputs = tokenizer.apply_chat_template(
+             messages,
+             tokenize=True,
+             add_generation_prompt=True,
+             return_tensors="pt",
+         ).to(device)
+
+         # Create a custom streamer
+         streamer = MyStreamer(tokenizer, skip_prompt=True)
+
+         generation_config = GenerationConfig(
+             pad_token_id=tokenizer.pad_token_id,
+         )
+
+         def generate():
+             model.generate(
+                 inputs,
+                 generation_config=generation_config,
+                 streamer=streamer,
+                 max_new_tokens=512,  # Adjust as needed
+             )
+
+         thread = threading.Thread(target=generate)
+         thread.start()
+
+         # Yield only the new tokens as they come in
+         while thread.is_alive():
+             try:
+                 next_token = streamer.text_queue.get(timeout=0.1)
+                 yield next_token  # Yield only the new token
+             except:
+                 pass
+
+         # Yield any remaining text after generation finishes
+         while not streamer.text_queue.empty():
+             next_token = streamer.text_queue.get()
+             yield next_token  # Yield only the new token
+
+     except Exception as e:
+         print(f"Error during inference: {e}")
+         yield f"Error processing your request: {e}"
+
+ # Gradio interface
+ demo = gr.Interface(
+     fn=reply,
+     inputs="text",
+     outputs="text",
+ )
+
+ # Launch the Gradio app
+ demo.launch(share=True)
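
For reference, reply() streams by having a background thread push decoded text onto the queue while the generator drains it; because MyStreamer decodes its whole token cache on every put(), each yielded chunk is the full text generated so far rather than a single new token, which lets the Gradio textbox simply replace its contents on each update. A minimal sketch of exercising it outside the UI (assumptions: the code above is importable as a module named app, and demo.launch(share=True) is guarded so importing does not start the server):

# Hypothetical smoke test, not part of this commit.
from app import reply  # assumes app.py's launch call is guarded for import

latest = ""
for chunk in reply("Who is Anna Livia Plurabelle?"):  # any prompt string works
    latest = chunk  # each chunk is the cumulative decoded reply so far
print(latest)  # the final, complete reply

# Against the running app, the same endpoint could also be queried over HTTP, e.g. with
# gradio_client using gr.Interface's default /predict route (also an assumption):
# from gradio_client import Client
# print(Client("http://127.0.0.1:7860/").predict("Hello", api_name="/predict"))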