Spaces:

mannadamay12
/

rag-ros2

Running

App Files Community

mannadamay12 committed on Dec 7, 2024

Commit

e294c88

verified ·

1 Parent(s): 55bd66e

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -65

app.py CHANGED Viewed

@@ -1,78 +1,45 @@
 import os
-import spaces
-import gradio as gr
 import torch
-torch.jit.script = lambda f: f  # Avoid script error in lambda
-# Initialize non-GPU components first
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
-# System prompts
-DEFAULT_SYSTEM_PROMPT = """
-Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
-respond with "I don't know" or a similar acknowledgment that the answer is not available.
-""".strip()
-SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"
-def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
-    return f"""
-[INST] <<SYS>>
-{system_prompt}
-<</SYS>>
-{prompt} [/INST]
-""".strip()
-template = generate_prompt(
-    """
-{context}
-Question: {question}
-""",
-    system_prompt=SYSTEM_PROMPT,
-)
-prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
-# Initialize database and embeddings
-embeddings = HuggingFaceInstructEmbeddings(
-    model_name="hkunlp/instructor-base",
-    model_kwargs={"device": "cpu"}
-)
-db = Chroma(
-    persist_directory="db",
-    embedding_function=embeddings
-)
 def initialize_model():
-    from transformers import AutoTokenizer, TextStreamer, pipeline, AutoModelForCausalLM
-    from langchain.llms import HuggingFacePipeline
-    model_id = "meta-llama/Llama-3.2-3B-Instruct"
-    token = os.environ.get("HF_TOKEN")
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
-        token=token,
     )
-    if torch.cuda.is_available():
-        model = model.to("cuda")
     return model, tokenizer
-@spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
-        # Initialize model components inside GPU context
         model, tokenizer = initialize_model()
-        from transformers import TextStreamer, pipeline
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         text_pipeline = pipeline(
@@ -87,6 +54,7 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         )
         llm = HuggingFacePipeline(pipeline=text_pipeline)
         qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
@@ -96,12 +64,12 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         )
         response = qa_chain.invoke({"query": message})
-        yield response["result"]
     except Exception as e:
-        yield f"An error occurred: {str(e)}"
-# Create Gradio interface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -120,7 +88,7 @@ demo = gr.ChatInterface(
         ),
         gr.Slider(
             minimum=0.1,
-            maximum=4.0,
             value=0.1,
             step=0.1,
             label="Temperature"
@@ -130,12 +98,9 @@ demo = gr.ChatInterface(
             maximum=1.0,
             value=0.95,
             step=0.05,
-            label="Top-p (nucleus sampling)"
         ),
     ],
     title="ROS2 Expert Assistant",
     description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
-)
-if __name__ == "__main__":
-    demo.launch()

 import os
 import torch
+from transformers import (
+    AutoTokenizer,
+    TextStreamer,
+    pipeline,
+    BitsAndBytesConfig,
+    AutoModelForCausalLM
+)
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
+from langchain.llms import HuggingFacePipeline
+import gradio as gr
+DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
+model_id = "meta-llama/Llama-3.2-3B-Instruct"
+# Remove the spaces.GPU decorator since we'll handle GPU directly
 def initialize_model():
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
+        token=os.environ.get("HF_TOKEN"),
+        quantization_config=bnb_config if torch.cuda.is_available() else None,
+        device_map="auto" if torch.cuda.is_available() else "cpu",
+        torch_dtype=torch.float32 if not torch.cuda.is_available() else None
     )
     return model, tokenizer
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
         model, tokenizer = initialize_model()
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         text_pipeline = pipeline(
         )
         llm = HuggingFacePipeline(pipeline=text_pipeline)
         qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
         )
         response = qa_chain.invoke({"query": message})
+        return response["result"]
     except Exception as e:
+        return f"An error occurred: {str(e)}"
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         ),
         gr.Slider(
             minimum=0.1,
+            maximum=1.0,
             value=0.1,
             step=0.1,
             label="Temperature"
             maximum=1.0,
             value=0.95,
             step=0.05,
+            label="Top-p"
         ),
     ],
     title="ROS2 Expert Assistant",
     description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
+)