mannadamay12 committed · verified
Commit e294c88 · Parent(s): 55bd66e

Update app.py

Files changed (1): app.py (+30, −65)
app.py CHANGED
@@ -1,78 +1,45 @@
 import os
-import spaces
-import gradio as gr
 import torch
-torch.jit.script = lambda f: f # Avoid script error in lambda
-
-# Initialize non-GPU components first
+from transformers import (
+    AutoTokenizer,
+    TextStreamer,
+    pipeline,
+    BitsAndBytesConfig,
+    AutoModelForCausalLM
+)
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
+from langchain.llms import HuggingFacePipeline
+import gradio as gr
 
-# System prompts
-DEFAULT_SYSTEM_PROMPT = """
-Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
-respond with "I don't know" or a similar acknowledgment that the answer is not available.
-""".strip()
-
-SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"
-
-def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
-    return f"""
-[INST] <<SYS>>
-{system_prompt}
-<</SYS>>
-
-{prompt} [/INST]
-""".strip()
-
-template = generate_prompt(
-    """
-{context}
-
-Question: {question}
-""",
-    system_prompt=SYSTEM_PROMPT,
-)
-
-prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
-
-# Initialize database and embeddings
-embeddings = HuggingFaceInstructEmbeddings(
-    model_name="hkunlp/instructor-base",
-    model_kwargs={"device": "cpu"}
-)
-
-db = Chroma(
-    persist_directory="db",
-    embedding_function=embeddings
-)
+DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
+model_id = "meta-llama/Llama-3.2-3B-Instruct"
 
+# Remove the spaces.GPU decorator since we'll handle GPU directly
 def initialize_model():
-    from transformers import AutoTokenizer, TextStreamer, pipeline, AutoModelForCausalLM
-    from langchain.llms import HuggingFacePipeline
-
-    model_id = "meta-llama/Llama-3.2-3B-Instruct"
-    token = os.environ.get("HF_TOKEN")
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16
+    )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
-        token=token,
+        token=os.environ.get("HF_TOKEN"),
+        quantization_config=bnb_config if torch.cuda.is_available() else None,
+        device_map="auto" if torch.cuda.is_available() else "cpu",
+        torch_dtype=torch.float32 if not torch.cuda.is_available() else None
    )
 
-    if torch.cuda.is_available():
-        model = model.to("cuda")
-
     return model, tokenizer
 
-@spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
-        # Initialize model components inside GPU context
         model, tokenizer = initialize_model()
-        from transformers import TextStreamer, pipeline
 
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         text_pipeline = pipeline(
@@ -87,6 +54,7 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         )
 
         llm = HuggingFacePipeline(pipeline=text_pipeline)
+
         qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
@@ -96,12 +64,12 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         )
 
         response = qa_chain.invoke({"query": message})
-        yield response["result"]
+        return response["result"]
 
     except Exception as e:
-        yield f"An error occurred: {str(e)}"
+        return f"An error occurred: {str(e)}"
+
 
-# Create Gradio interface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -120,7 +88,7 @@ demo = gr.ChatInterface(
         ),
         gr.Slider(
             minimum=0.1,
-            maximum=4.0,
+            maximum=1.0,
             value=0.1,
             step=0.1,
             label="Temperature"
@@ -130,12 +98,9 @@ demo = gr.ChatInterface(
             maximum=1.0,
             value=0.95,
             step=0.05,
-            label="Top-p (nucleus sampling)"
+            label="Top-p"
         ),
     ],
     title="ROS2 Expert Assistant",
     description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
-)
-
-if __name__ == "__main__":
-    demo.launch()
+)
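For reference, a minimal sketch of the 4-bit loading path this commit introduces, runnable outside the Space. It assumes a CUDA GPU with the bitsandbytes and accelerate packages installed and HF_TOKEN set; the test prompt is illustrative and not part of the commit.

# Hedged smoke test for the quantized loading path added above.
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "meta-llama/Llama-3.2-3B-Instruct"
token = os.environ.get("HF_TOKEN")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights as 4-bit NF4
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the matmuls in bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token,
    quantization_config=bnb_config if torch.cuda.is_available() else None,
    device_map="auto" if torch.cuda.is_available() else "cpu",
)

# Illustrative prompt; any short question works for a smoke test.
inputs = tokenizer("What is a ROS2 node?", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))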
 
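Side note: the yield → return change means gr.ChatInterface now receives the whole answer at once rather than a token stream. If streaming were wanted back, one possible sketch (not part of this commit; it bypasses the RetrievalQA chain for brevity and reuses initialize_model from the new file) is a generator built on transformers' TextIteratorStreamer, which ChatInterface treats as a streaming bot:

# Hypothetical streaming variant of respond(); not in this commit.
from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    model, tokenizer = initialize_model()
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    # generate() blocks, so it runs in a worker thread while we drain the streamer.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # ChatInterface renders each progressively longer string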