Spaces:

raccoote
/

angry-birds-LLM-level-generator

Runtime error

App Files Files Community

raccoote committed on Aug 27, 2024

Commit

feec422

verified ·

1 Parent(s): 3605c25

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -14

app.py CHANGED Viewed

@@ -1,21 +1,48 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-model_name = "raccoote/angry-birds-v1"
-# Load model and tokenizer
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-# Ensure to use the model in evaluation mode to save memory
-model.eval()
-def generate_text(prompt):
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    with torch.no_grad():  # Disable gradient calculation for inference
-        outputs = model.generate(**inputs)
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-# Example usage
-response = generate_text("Hello, world!")
-print(response)

 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# Checkpoint identifier passed to both from_pretrained() calls below
+model_name = "raccoote/angry-birds-v1"
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the tokenizer from the same checkpoint as the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Load the weights as float16 on GPU (float32 on CPU) with low_cpu_mem_usage enabled, then move the model to the selected device
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+    low_cpu_mem_usage=True
+).to(device)
+def generate_response(prompt):
+    """Generate one sampled continuation of ``prompt`` with the loaded model.
+
+    Args:
+        prompt: Text to feed to the model.
+
+    Returns:
+        The decoded first output sequence with special tokens removed.
+        For a decoder-only model this is expected to begin with the prompt
+        itself, because the prompt tokens are part of the output sequence.
+    """
+    # Tokenize the prompt and move the tensors to the model's device
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+    # Some causal-LM tokenizers define no pad token; use EOS explicitly so
+    # generate() does not have to guess one
+    pad_token_id = tokenizer.pad_token_id
+    if pad_token_id is None:
+        pad_token_id = tokenizer.eos_token_id
+    # Inference only: skip gradient tracking to save memory
+    with torch.no_grad():
+        outputs = model.generate(
+            inputs["input_ids"],
+            # Pass the mask explicitly instead of letting generate() infer it
+            attention_mask=inputs.get("attention_mask"),
+            # Budget for newly generated tokens only; max_length would also
+            # count the prompt and leave long prompts with no room to generate
+            max_new_tokens=150,
+            num_return_sequences=1,
+            do_sample=True,  # Sample instead of greedy decoding for more varied output
+            top_k=50,  # Sample only from the 50 most likely next tokens
+            top_p=0.95,  # Nucleus sampling: keep the smallest token set whose cumulative probability reaches 0.95
+            pad_token_id=pad_token_id,
+        )
+    # Decode the single returned sequence back to text
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+# Interactive terminal loop: read a message, print the model's reply, repeat
+if __name__ == "__main__":
+    print("Chatbot is ready! Type your message below (type 'exit' to quit):")
+    while True:
+        try:
+            user_input = input("You: ").strip()
+        except (EOFError, KeyboardInterrupt):
+            # stdin is closed (e.g. no terminal attached) or the user pressed
+            # Ctrl-C: end the session instead of crashing with a traceback
+            print()
+            break
+        if user_input.lower() == "exit":
+            break
+        if not user_input:
+            # Nothing to send to the model; ask again
+            continue
+        response = generate_response(user_input)
+        print(f"Bot: {response}")