# Hugging Face Space application file (Space status: Running; file size: 3,838 bytes).
# Extracted from the web file viewer; contributing commits shown in the blame gutter:
# d30c02a, 96d766a, 85b8a02, 17a9f49. The viewer's line-number gutter (1-131) is omitted.
import os
import torch
import gradio as gr
import spaces
from huggingface_hub import InferenceClient
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
# Verify PyTorch version compatibility
# The installed version is compared against the releases listed as supported
# for ZeroGPU; a mismatch only prints a warning and never aborts start-up.
TORCH_VERSION = torch.__version__
SUPPORTED_TORCH_VERSIONS = ['2.0.1', '2.1.2', '2.2.2', '2.4.0']

# Compare only the part before any "+" local-build suffix (e.g. "+cu121").
if TORCH_VERSION.partition('+')[0] not in SUPPORTED_TORCH_VERSIONS:
    print(
        f"Warning: Current PyTorch version {TORCH_VERSION} may not be compatible with ZeroGPU. "
        f"Supported versions are: {', '.join(SUPPORTED_TORCH_VERSIONS)}"
    )
# Initialize components outside of GPU scope
# Hugging Face Hub inference client bound to the instruct model used for answers.
client = InferenceClient(model="meta-llama/Llama-3.2-3B-Instruct")

# Embedding model used to query the vector store.
embeddings = HuggingFaceEmbeddings(
    model_kwargs=dict(device="cpu"),  # Keep embeddings on CPU
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Load database
# Chroma store read from the local "db" directory, queried with the embeddings above.
db = Chroma(embedding_function=embeddings, persist_directory="db")
# Prompt templates
# Default system instruction: answer briefly from the supplied context, or
# acknowledge that the answer is not available. The text spans two lines.
DEFAULT_SYSTEM_PROMPT = (
    "Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,\n"
    "respond with \"I don't know\" or a similar acknowledgment that the answer is not available."
)
def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Wrap a prompt and a system prompt in ``[INST]`` / ``<<SYS>>`` markup.

    Args:
        prompt: Text placed between the system section and the closing
            ``[/INST]`` tag.
        system_prompt: Text placed inside the ``<<SYS>>`` section; defaults to
            ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        The assembled prompt with no leading or trailing whitespace.
    """
    # NOTE(review): this markup looks like the Llama-2 chat convention, while
    # the client in this file targets a Llama 3.2 model - confirm it is intended.
    # The pieces start with "[" and end with "]", so no trimming is required.
    return (
        "[INST] <<SYS>>\n"
        f"{system_prompt}\n"
        "<</SYS>>\n"
        f"{prompt} [/INST]"
    )
# Retrieval prompt: the retrieved context followed by the user's question,
# wrapped by generate_prompt with a brevity-focused system prompt.
# NOTE(review): the system prompt ends with ".?" - looks like a typo, kept as-is.
template = generate_prompt(
    "\n{context}\nQuestion: {question}\n",
    system_prompt=(
        "Use the following pieces of context to answer the question at the end. "
        "Do not provide commentary or elaboration more than 1 or 2 lines.?"
    ),
)

# Template object with the two placeholders filled in per request.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
@spaces.GPU(duration=30)  # Reduced duration for faster queue priority
def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream an answer to ``message`` grounded in documents retrieved from ``db``.

    The two most similar documents are fetched from the vector store, merged
    with the question through ``prompt_template``, and sent to the inference
    client. Generation is streamed: after every received chunk the text
    accumulated so far is yielded, so the chat UI can update incrementally.

    Args:
        message: The user's question.
        history: Prior chat turns supplied by the chat interface; not used.
        system_message: System prompt supplied by the UI; not used here,
            because ``prompt_template`` already embeds its own system prompt.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature forwarded to the client.
        top_p: Nucleus-sampling threshold forwarded to the client.

    Yields:
        The response text accumulated so far. If any step raises, a final
        ``"An error occurred: ..."`` string is yielded instead of propagating
        the exception.
    """
    try:
        # Retrieve context from the vector store
        docs = db.similarity_search(message, k=2)
        context = "\n".join(doc.page_content for doc in docs)
        print(f"Retrieved context: {context[:200]}...")

        # Format prompt
        formatted_prompt = prompt_template.format(
            context=context,
            question=message,
        )
        print(f"Full prompt: {formatted_prompt}")

        # Stream response through the inference client. The loop variable is
        # deliberately not named `message`, so the user's question is never
        # overwritten by streamed chunks.
        response = ""
        for token in client.text_generation(
            prompt=formatted_prompt,
            # Slider values may arrive as floats; the token budget is an integer.
            max_new_tokens=int(max_tokens),
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            response += token
            yield response
    except Exception as e:
        # UI boundary: report the failure in the chat instead of crashing it.
        yield f"An error occurred: {e!s}"
# Create Gradio interface
# The extra controls are listed in the same order as the trailing parameters of
# `respond` (system_message, max_tokens, temperature, top_p).
demo = gr.ChatInterface(
    fn=respond,
    title="ROS2 Expert Assistant",
    description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
    additional_inputs=[
        # Hidden system-prompt field, pre-filled with the default prompt.
        gr.Textbox(
            label="System Message",
            value=DEFAULT_SYSTEM_PROMPT,
            lines=3,
            visible=False,
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=2048,
            step=1,
            value=500,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.1,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.1,
            maximum=1.0,
            step=0.05,
            value=0.95,
        ),
    ],
)
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()