Spaces:

mannadamay12
/

rag-ros2

Sleeping

App Files Community

mannadamay12 committed on Dec 7, 2024

Commit

85b8a02

verified ·

1 Parent(s): b005487

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -32

app.py CHANGED Viewed

@@ -3,20 +3,25 @@ import torch
 import gradio as gr
 import spaces
 from huggingface_hub import InferenceClient
-from langchain_community.embeddings import HuggingFaceInstructEmbeddings
-from langchain_community.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
-# Configure ZeroGPU client
-client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct")
-# Initialize embeddings
-embeddings = HuggingFaceInstructEmbeddings(
-    model_name="hkunlp/instructor-base",
-    model_kwargs={"device": "cpu"}  # Use CPU for Spaces
 )
-# Load the persisted database
 db = Chroma(
     persist_directory="db",
     embedding_function=embeddings
@@ -24,11 +29,31 @@ db = Chroma(
 # Prompt templates
 DEFAULT_SYSTEM_PROMPT = """
-You are a ROS2 expert assistant. Based on the information provided in the context, answer questions
-accurately and concisely. If the information is not in the context, acknowledge that you don't know.
 """.strip()
-@spaces.GPU(duration=60)
 def respond(
     message,
     history,
@@ -37,34 +62,28 @@ def respond(
     temperature,
     top_p,
 ):
     try:
-        # Retrieve relevant context
         docs = db.similarity_search(message, k=2)
         context = "\n".join([doc.page_content for doc in docs])
-        # Build messages
-        messages = [{"role": "system", "content": system_message}]
-        for val in history:
-            if val[0]:
-                messages.append({"role": "user", "content": val[0]})
-            if val[1]:
-                messages.append({"role": "assistant", "content": val[1]})
-        # Add context to the user message
-        augmented_message = f"Context: {context}\n\nQuestion: {message}"
-        messages.append({"role": "user", "content": augmented_message})
-        # Stream the response
         response = ""
-        for message in client.chat_completion(
-            messages,
-            max_tokens=max_tokens,
             stream=True,
             temperature=temperature,
             top_p=top_p,
         ):
-            token = message.choices[0].delta.content
-            response += token
             yield response
     except Exception as e:
@@ -76,7 +95,9 @@ demo = gr.ChatInterface(
     additional_inputs=[
         gr.Textbox(
             value=DEFAULT_SYSTEM_PROMPT,
-            label="System message"
         ),
         gr.Slider(
             minimum=1,
@@ -101,7 +122,7 @@ demo = gr.ChatInterface(
         ),
     ],
     title="ROS2 Expert Assistant",
-    description="Ask questions about ROS2, navigation, and robotics. I'll answer based on my knowledge base.",
 )
 if __name__ == "__main__":

 import gradio as gr
 import spaces
 from huggingface_hub import InferenceClient
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
+# Verify PyTorch version compatibility
+TORCH_VERSION = torch.__version__
+SUPPORTED_TORCH_VERSIONS = ['2.0.1', '2.1.2', '2.2.2', '2.4.0']
+if TORCH_VERSION.rsplit('+')[0] not in SUPPORTED_TORCH_VERSIONS:
+    print(f"Warning: Current PyTorch version {TORCH_VERSION} may not be compatible with ZeroGPU. "
+          f"Supported versions are: {', '.join(SUPPORTED_TORCH_VERSIONS)}")
+# Initialize components outside of GPU scope
+client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct")
+embeddings = HuggingFaceEmbeddings(
+    model_name="sentence-transformers/all-MiniLM-L6-v2",
+    model_kwargs={"device": "cpu"}  # Keep embeddings on CPU
 )
+# Load database
 db = Chroma(
     persist_directory="db",
     embedding_function=embeddings
 # Prompt templates
 DEFAULT_SYSTEM_PROMPT = """
+Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
+respond with "I don't know" or a similar acknowledgment that the answer is not available.
+""".strip()
+def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
+    return f"""
+[INST] <<SYS>>
+{system_prompt}
+<</SYS>>
+{prompt} [/INST]
 """.strip()
+template = generate_prompt(
+    """
+{context}
+Question: {question}
+""",
+    system_prompt="Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"
+)
+prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
+@spaces.GPU(duration=30)  # Reduced duration for faster queue priority
 def respond(
     message,
     history,
     temperature,
     top_p,
 ):
+    """GPU-accelerated response generation"""
     try:
+        # Retrieve context (CPU operation)
         docs = db.similarity_search(message, k=2)
         context = "\n".join([doc.page_content for doc in docs])
+        # Format prompt
+        formatted_prompt = prompt_template.format(
+            context=context,
+            question=message
+        )
+        # Stream response (GPU operation)
         response = ""
+        for message in client.text_generation(
+            prompt=formatted_prompt,
+            max_new_tokens=max_tokens,
             stream=True,
             temperature=temperature,
             top_p=top_p,
         ):
+            response += message
             yield response
     except Exception as e:
     additional_inputs=[
         gr.Textbox(
             value=DEFAULT_SYSTEM_PROMPT,
+            label="System Message",
+            lines=3,
+            visible=False
         ),
         gr.Slider(
             minimum=1,
         ),
     ],
     title="ROS2 Expert Assistant",
+    description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
 )
 if __name__ == "__main__":