mannadamay12 committed (verified)
Commit ca64dfe · 1 Parent(s): e294c88

Update app.py

Files changed (1): app.py (+36 -19)
app.py CHANGED
@@ -1,12 +1,15 @@
 import os
+import spaces
 import torch
-from transformers import (
-    AutoTokenizer,
-    TextStreamer,
-    pipeline,
-    BitsAndBytesConfig,
-    AutoModelForCausalLM
-)
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# import torch
+# from transformers import (
+#     AutoTokenizer,
+#     TextStreamer,
+#     pipeline,
+#     BitsAndBytesConfig,
+#     AutoModelForCausalLM
+# )
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
@@ -18,25 +21,39 @@ DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
 model_id = "meta-llama/Llama-3.2-3B-Instruct"
 
 # Remove the spaces.GPU decorator since we'll handle GPU directly
+# def initialize_model():
+#     bnb_config = BitsAndBytesConfig(
+#         load_in_4bit=True,
+#         bnb_4bit_use_double_quant=True,
+#         bnb_4bit_quant_type="nf4",
+#         bnb_4bit_compute_dtype=torch.bfloat16
+#     )
+
+#     tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
+#     model = AutoModelForCausalLM.from_pretrained(
+#         model_id,
+#         token=os.environ.get("HF_TOKEN"),
+#         quantization_config=bnb_config if torch.cuda.is_available() else None,
+#         device_map="auto" if torch.cuda.is_available() else "cpu",
+#         torch_dtype=torch.float32 if not torch.cuda.is_available() else None
+#     )
+
+#     return model, tokenizer
+
 def initialize_model():
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16
-    )
+    model_id = "meta-llama/Llama-3.2-3B-Instruct"
+    token = os.environ.get("HF_TOKEN")
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
-        token=os.environ.get("HF_TOKEN"),
-        quantization_config=bnb_config if torch.cuda.is_available() else None,
-        device_map="auto" if torch.cuda.is_available() else "cpu",
-        torch_dtype=torch.float32 if not torch.cuda.is_available() else None
+        token=token,
+        device_map="auto"  # This works better with ZeroGPU
     )
 
     return model, tokenizer
-
+
+@spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
         model, tokenizer = initialize_model()
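
For context: this commit drops the BitsAndBytesConfig 4-bit path and the CPU fallbacks, relying on device_map="auto" together with the ZeroGPU @spaces.GPU decorator. The diff truncates respond() right after initialize_model(), so here is a minimal sketch of how the new pieces fit together; the generation code and sampling arguments below are illustrative assumptions, not the app's actual implementation, and the LangChain retrieval layer is omitted.

import os

import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer


def initialize_model():
    model_id = "meta-llama/Llama-3.2-3B-Instruct"
    token = os.environ.get("HF_TOKEN")

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=token,
        device_map="auto",  # inside a @spaces.GPU call, ZeroGPU exposes the GPU
    )
    return model, tokenizer


@spaces.GPU
def respond(message, history, system_message, max_tokens, temperature, top_p):
    try:
        model, tokenizer = initialize_model()
        # Plain chat generation for illustration; the real app layers retrieval on top.
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": message},
        ]
        input_ids = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        output = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )
        return tokenizer.decode(
            output[0][input_ids.shape[-1]:], skip_special_tokens=True
        )
    except Exception as e:
        return f"Error: {e}"

Note that with this structure the 3B checkpoint is reloaded on every request, since respond() calls initialize_model() each time; memoizing initialize_model() or caching the (model, tokenizer) pair is a common follow-up on Spaces.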