Braszczynski committed
Commit e1c82eb · verified · 1 Parent(s): a9938e0

Update app.py

Files changed (1)
  1. app.py +29 -40
app.py CHANGED
@@ -11,22 +11,16 @@ max_seq_length = 512 # Adjust as needed
 dtype = None # Example dtype, adjust based on your setup
 load_in_4bit = True # Set to True if you want to use 4-bit quantization
 
-# Load the model and tokenizer using FastLanguageModel
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=model_name,
-    max_seq_length=max_seq_length,
-    dtype=dtype,
-    load_in_4bit=load_in_4bit,
-)
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
 
-# Load the adapter
+# Load the base model with adapters
+model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True)
 model.load_adapter(lora_adapter)
 
-# Enable native 2x faster inference
-FastLanguageModel.for_inference(model)
-
-# Optional: Initialize TextStreamer if you plan to use streaming
-# text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+# Move the model to CPU
+device = torch.device("cpu")
+model.to(device)
 
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -34,34 +28,29 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     for user_msg, bot_reply in history:
         chat_history += f"User: {user_msg}\nAssistant: {bot_reply}\n"
     chat_history += f"User: {message}\nAssistant:"
-
-    # Apply chat template and tokenize the input
-    inputs = tokenizer.apply_chat_template(
-        [{"role": "user", "content": message}] if not history else [
-            {"role": "system", "content": system_message}] + [
-            {"role": "user", "content": msg} for msg, _ in history
-        ] + [{"role": "assistant", "content": reply} for _, reply in history] + [
-            {"role": "user", "content": message}
-        ],
-        tokenize=True,
-        add_generation_prompt=True, # Must add for generation
+
+    # Prepare the input for the model
+    inputs = tokenizer(
+        chat_history,
         return_tensors="pt",
-    ).to("cuda")
-
-    # Generate response
-    outputs = model.generate(
-        input_ids=inputs["input_ids"],
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        pad_token_id=tokenizer.eos_token_id,
-        use_cache=True
-        # streamer=text_streamer # Uncomment if using streaming
-    )
-
-    # Decode and format the output
+        truncation=True,
+        max_length=max_seq_length,
+    ).to(device)
+
+    # Generate the response
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids=inputs["input_ids"],
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            pad_token_id=tokenizer.eos_token_id,
+            use_cache=True
+        )
+
+    # Decode and format the response
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    response = response[len(chat_history):].strip() # Remove input context from output
+    response = response[len(chat_history):].strip() # Remove the input context
     return response
 
 # Define the Gradio interface
@@ -76,4 +65,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
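For readers who want to run the updated Space locally: the hunks start at line 11 of app.py, so the import/configuration block and the gr.ChatInterface argument list never appear in the diff. The sketch below is a hedged reconstruction of the whole file after this commit; the import of AutoAdapterModel from the adapters package, the model_name and lora_adapter placeholders, the chat_history initialization, and the ChatInterface inputs are illustrative guesses, not part of the commit.

# app.py -- hedged reconstruction of the file after this commit.
# Everything marked "assumed" below is a guess filled in around the diff.
import torch
import gradio as gr
from transformers import AutoTokenizer
from adapters import AutoAdapterModel  # assumed origin of AutoAdapterModel

model_name = "your-base-model-id"      # assumed placeholder
lora_adapter = "your-lora-adapter-id"  # assumed placeholder
max_seq_length = 512  # Adjust as needed
dtype = None  # Example dtype, adjust based on your setup
load_in_4bit = True  # Set to True if you want to use 4-bit quantization

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load the base model with adapters
model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True)
model.load_adapter(lora_adapter)

# Move the model to CPU
device = torch.device("cpu")
model.to(device)

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Combine system message and chat history
    chat_history = f"{system_message}\n"  # assumed: this line sits between the two hunks
    for user_msg, bot_reply in history:
        chat_history += f"User: {user_msg}\nAssistant: {bot_reply}\n"
    chat_history += f"User: {message}\nAssistant:"

    # Tokenize the flattened prompt, truncating to the configured context length
    inputs = tokenizer(
        chat_history,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to(device)

    # Generate on CPU without tracking gradients
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Decode the full sequence and strip the prompt text from the front
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response[len(chat_history):].strip()
    return response

# Define the Gradio interface (argument list is assumed; only the opening line
# and the closing parenthesis appear as context in the diff)
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()

Compared with the previous Unsloth/FastLanguageModel setup, this version trades 4-bit GPU inference for a plain CPU path, which is why the dtype and load_in_4bit settings remain defined but are no longer passed anywhere in the shown code.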