mannadamay12 committed (verified)
Commit ca64dfe · 1 Parent(s): e294c88

Update app.py

Files changed (1): app.py (+36 -19)
app.py CHANGED
@@ -1,12 +1,15 @@
 import os
+import spaces
 import torch
-from transformers import (
-    AutoTokenizer,
-    TextStreamer,
-    pipeline,
-    BitsAndBytesConfig,
-    AutoModelForCausalLM
-)
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# import torch
+# from transformers import (
+#     AutoTokenizer,
+#     TextStreamer,
+#     pipeline,
+#     BitsAndBytesConfig,
+#     AutoModelForCausalLM
+# )
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
@@ -18,25 +21,39 @@ DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
 model_id = "meta-llama/Llama-3.2-3B-Instruct"
 
 # Remove the spaces.GPU decorator since we'll handle GPU directly
+# def initialize_model():
+#     bnb_config = BitsAndBytesConfig(
+#         load_in_4bit=True,
+#         bnb_4bit_use_double_quant=True,
+#         bnb_4bit_quant_type="nf4",
+#         bnb_4bit_compute_dtype=torch.bfloat16
+#     )
+
+#     tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
+#     model = AutoModelForCausalLM.from_pretrained(
+#         model_id,
+#         token=os.environ.get("HF_TOKEN"),
+#         quantization_config=bnb_config if torch.cuda.is_available() else None,
+#         device_map="auto" if torch.cuda.is_available() else "cpu",
+#         torch_dtype=torch.float32 if not torch.cuda.is_available() else None
+#     )
+
+#     return model, tokenizer
+
 def initialize_model():
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16
-    )
+    model_id = "meta-llama/Llama-3.2-3B-Instruct"
+    token = os.environ.get("HF_TOKEN")
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
-        token=os.environ.get("HF_TOKEN"),
-        quantization_config=bnb_config if torch.cuda.is_available() else None,
-        device_map="auto" if torch.cuda.is_available() else "cpu",
-        torch_dtype=torch.float32 if not torch.cuda.is_available() else None
+        token=token,
+        device_map="auto"  # This works better with ZeroGPU
     )
 
     return model, tokenizer
-
+
+@spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
         model, tokenizer = initialize_model()
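
For context: this commit drops the BitsAndBytesConfig 4-bit path and the CPU fallbacks, relying on device_map="auto" together with the ZeroGPU @spaces.GPU decorator. The diff truncates respond() right after initialize_model(), so here is a minimal sketch of how the new pieces fit together; the generation code and sampling arguments below are illustrative assumptions, not the app's actual implementation, and the LangChain retrieval layer is omitted.

import os

import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer


def initialize_model():
    model_id = "meta-llama/Llama-3.2-3B-Instruct"
    token = os.environ.get("HF_TOKEN")

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=token,
        device_map="auto",  # inside a @spaces.GPU call, ZeroGPU exposes the GPU
    )
    return model, tokenizer


@spaces.GPU
def respond(message, history, system_message, max_tokens, temperature, top_p):
    try:
        model, tokenizer = initialize_model()
        # Plain chat generation for illustration; the real app layers retrieval on top.
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": message},
        ]
        input_ids = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        output = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )
        return tokenizer.decode(
            output[0][input_ids.shape[-1]:], skip_special_tokens=True
        )
    except Exception as e:
        return f"Error: {e}"

Note that with this structure the 3B checkpoint is reloaded on every request, since respond() calls initialize_model() each time; memoizing initialize_model() or caching the (model, tokenizer) pair is a common follow-up on Spaces.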