Add updated app.py
app.py CHANGED
@@ -3,6 +3,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
 import torch
 import bitsandbytes as bnb  # Required for 4-bit quantization
 
+# Check if CUDA is available, and decide on the device
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 # Load the tokenizer and the quantized LLaMA model
 model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -11,12 +14,9 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     load_in_4bit=True,  # Enable 4-bit quantization
-    device_map="auto"
+    device_map="auto" if device == "cuda" else {"": "cpu"}  # Use auto if CUDA is available, else fallback to CPU
 )
 
-# Enable native 2x faster inference (if applicable, ensure this feature works)
-# FastLanguageModel.for_inference(model)  # Uncomment this if FastLanguageModel is available for your model
-
 # Streamlit interface
 st.title("Keyword Extractor using LLaMA 4-bit Model")
 
@@ -32,7 +32,7 @@ if user_input:
     alpaca_prompt = prompt_template.format(user_input)
 
     # Tokenize the input text
-    inputs = tokenizer([alpaca_prompt], return_tensors="pt").to("cuda")
+    inputs = tokenizer([alpaca_prompt], return_tensors="pt").to(device)
 
     # Set up the text streamer to display the generated text as it streams
     text_streamer = TextStreamer(tokenizer)
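
For context, here is a minimal sketch of how the changed lines fit together at generation time. The diff only covers model loading and tokenization; the prompt text, the max_new_tokens value, and the model.generate() call below are illustrative assumptions, not part of this commit.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

# Pick the device up front so every later step can share it
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,  # 4-bit weights via bitsandbytes, as in the app
    device_map="auto" if device == "cuda" else {"": "cpu"},
)

# Hypothetical prompt; the app builds alpaca_prompt from Streamlit user input
prompt = "Extract keywords from the following text:\nLLMs can run in 4-bit precision."

inputs = tokenizer([prompt], return_tensors="pt").to(device)
streamer = TextStreamer(tokenizer)  # prints tokens to stdout as they are generated
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=64)

One caveat on the CPU branch: bitsandbytes 4-bit loading has historically required a CUDA GPU, so on a CPU-only Space the load_in_4bit=True path may still fail at load time; dropping quantization when no GPU is present is a common fallback.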