mannadamay12 committed
Commit 55bd66e · verified · 1 Parent(s): 3438e5b

Update app.py

Files changed (1)
app.py +54 -35
app.py CHANGED
@@ -1,61 +1,80 @@
-import gradio as gr
-import spaces
 import os
+import spaces
+import gradio as gr
+import torch
+torch.jit.script = lambda f: f  # Avoid script error in lambda
+
+# Initialize non-GPU components first
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
-from langchain.llms import HuggingFacePipeline
-from huggingface_hub import InferenceClient
 
-# GPU initialization moved into a function
+# System prompts
+DEFAULT_SYSTEM_PROMPT = """
+Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
+respond with "I don't know" or a similar acknowledgment that the answer is not available.
+""".strip()
+
+SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"
+
+def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
+    return f"""
+[INST] <<SYS>>
+{system_prompt}
+<</SYS>>
+
+{prompt} [/INST]
+""".strip()
+
+template = generate_prompt(
+    """
+{context}
+
+Question: {question}
+""",
+    system_prompt=SYSTEM_PROMPT,
+)
+
+prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
+
+# Initialize database and embeddings
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name="hkunlp/instructor-base",
+    model_kwargs={"device": "cpu"}
+)
+
+db = Chroma(
+    persist_directory="db",
+    embedding_function=embeddings
+)
+
 def initialize_model():
-    import torch
-    from transformers import (
-        AutoTokenizer,
-        TextStreamer,
-        pipeline,
-        BitsAndBytesConfig,
-        AutoModelForCausalLM
-    )
+    from transformers import AutoTokenizer, TextStreamer, pipeline, AutoModelForCausalLM
+    from langchain.llms import HuggingFacePipeline
 
     model_id = "meta-llama/Llama-3.2-3B-Instruct"
     token = os.environ.get("HF_TOKEN")
 
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16
-    )
-
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         token=token,
-        quantization_config=bnb_config
     )
 
+    if torch.cuda.is_available():
+        model = model.to("cuda")
+
     return model, tokenizer
 
-# Initialize non-GPU components
-embeddings = HuggingFaceInstructEmbeddings(
-    model_name="hkunlp/instructor-base",
-    model_kwargs={"device": "cpu"}
-)
-
-db = Chroma(
-    persist_directory="db",
-    embedding_function=embeddings
-)
-
-@spaces.GPU(duration=30)
+@spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
-        # Initialize model components inside the GPU scope
+        # Initialize model components inside GPU context
         model, tokenizer = initialize_model()
-        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        from transformers import TextStreamer, pipeline
 
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         text_pipeline = pipeline(
             "text-generation",
             model=model,
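
Note: the hunk above ends partway through the text_pipeline = pipeline(...) call. As orientation only, the sketch below shows one way the pieces introduced in this commit (the transformers pipeline, HuggingFacePipeline, the Chroma db, and prompt_template) are commonly wired into a RetrievalQA chain. The generation parameters, the retriever's k value, and the final qa_chain.run(message) call are illustrative assumptions, not necessarily what the rest of app.py actually does.

# Illustrative continuation only; reuses model, tokenizer, streamer, db and
# prompt_template defined earlier in app.py. Values below are assumptions.
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=max_tokens,   # assumed: forwarded from the respond() arguments
    temperature=temperature,     # assumed
    top_p=top_p,                 # assumed
    streamer=streamer,
)

# Wrap the transformers pipeline so LangChain can call it as an LLM.
llm = HuggingFacePipeline(pipeline=text_pipeline)

# Combine the Chroma retriever with the prompt template defined in this commit.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),  # k is an assumption
    chain_type_kwargs={"prompt": prompt_template},
)

response = qa_chain.run(message)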