Spaces:

mannadamay12
/

rag-ros2

Running

App Files Files Community

rag-ros2 / app.py

mannadamay12

Update app.py

55bd66e verified 5 months ago

raw

history blame

4.1 kB

	import os
	import spaces
	import gradio as gr
	import torch
	# Monkeypatch: replace torch.jit.script with an identity function, so any
	# callable passed to it is returned unchanged. This is a process-wide patch
	# applied at import time, before the langchain imports below run.
	torch.jit.script = lambda f: f # Avoid script error in lambda

	# Initialize non-GPU components first
	from langchain.embeddings import HuggingFaceInstructEmbeddings
	from langchain.vectorstores import Chroma
	from langchain.prompts import PromptTemplate
	from langchain.chains import RetrievalQA

	# System prompts

	# Default system prompt for generate_prompt(); also the initial value of the
	# hidden "System Message" textbox in the UI.
	DEFAULT_SYSTEM_PROMPT = """
	Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
	respond with "I don't know" or a similar acknowledgment that the answer is not available.
	""".strip()

	# System prompt embedded in the retrieval-QA template. It is an instruction,
	# not a question, so it ends with a plain period.
	SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines."

	def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
	    """Wrap a prompt and a system prompt in ``[INST]`` / ``<<SYS>>`` markers.

	    Args:
	        prompt: Text placed after the system section, just before ``[/INST]``.
	        system_prompt: Text placed between ``<<SYS>>`` and ``<</SYS>>``.

	    Returns:
	        The assembled prompt with leading and trailing whitespace stripped.
	    """
	    wrapped = f"""
	[INST] <<SYS>>
	{system_prompt}
	<</SYS>>

	{prompt} [/INST]
	"""
	    return wrapped.strip()

	# Build the QA prompt text once at import time. The {context} and {question}
	# placeholders are left unformatted here; PromptTemplate fills them later.
	template = generate_prompt(
	    prompt="""
	{context}

	Question: {question}
	""",
	    system_prompt=SYSTEM_PROMPT,
	)

	# Prompt object handed to the retrieval chain via chain_type_kwargs.
	prompt_template = PromptTemplate(
	    input_variables=["context", "question"],
	    template=template,
	)

	# Initialize database and embeddings

	# Instructor embedding model, explicitly placed on the CPU.
	embeddings = HuggingFaceInstructEmbeddings(
	    model_kwargs={"device": "cpu"},
	    model_name="hkunlp/instructor-base",
	)

	# Chroma vector store persisted in the local "db" directory; it uses the
	# embeddings object above as its embedding function.
	db = Chroma(
	    embedding_function=embeddings,
	    persist_directory="db",
	)

	def initialize_model():
	    """Load the Llama 3.2 3B Instruct model and its tokenizer.

	    The Hugging Face access token is read from the ``HF_TOKEN`` environment
	    variable (``None`` when unset). The model is moved to CUDA when
	    ``torch.cuda.is_available()`` reports a device; otherwise it stays where
	    ``from_pretrained`` placed it.

	    Returns:
	        A ``(model, tokenizer)`` tuple.
	    """
	    # Local import: transformers is only imported when this function runs.
	    # Only the two classes used below are imported.
	    from transformers import AutoModelForCausalLM, AutoTokenizer

	    model_id = "meta-llama/Llama-3.2-3B-Instruct"
	    token = os.environ.get("HF_TOKEN")

	    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
	    model = AutoModelForCausalLM.from_pretrained(
	        model_id,
	        token=token,
	    )

	    if torch.cuda.is_available():
	        model = model.to("cuda")

	    return model, tokenizer

	@spaces.GPU
	def respond(message, history, system_message, max_tokens, temperature, top_p):
	    """Answer one chat message using retrieval-augmented generation.

	    Generator callback for ``gr.ChatInterface``. On each call it loads the
	    model, builds a text-generation pipeline and a "stuff" RetrievalQA chain
	    over the module-level ``db`` retriever (top 2 documents), and runs the
	    chain on ``message``.

	    Args:
	        message: The user's question; passed to the chain as ``query``.
	        history: Chat history supplied by the interface; not used here.
	        system_message: Value of the system-message input; not used here,
	            the chain uses the module-level ``prompt_template`` instead.
	        max_tokens: Forwarded to the pipeline as ``max_new_tokens``.
	        temperature: Forwarded to the pipeline as ``temperature``.
	        top_p: Forwarded to the pipeline as ``top_p``.

	    Yields:
	        The chain's ``"result"`` text, or ``"An error occurred: ..."`` if any
	        exception is raised while building or running the chain.
	    """
	    try:
	        # Local imports. HuggingFacePipeline is not imported at module level,
	        # so it must be brought into scope here; without this the call below
	        # raises NameError on every request.
	        from langchain.llms import HuggingFacePipeline
	        from transformers import TextStreamer, pipeline

	        # Initialize model components inside GPU context
	        model, tokenizer = initialize_model()

	        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
	        text_pipeline = pipeline(
	            "text-generation",
	            model=model,
	            tokenizer=tokenizer,
	            max_new_tokens=max_tokens,
	            temperature=temperature,
	            top_p=top_p,
	            repetition_penalty=1.15,
	            streamer=streamer,
	        )

	        llm = HuggingFacePipeline(pipeline=text_pipeline)
	        qa_chain = RetrievalQA.from_chain_type(
	            llm=llm,
	            chain_type="stuff",
	            retriever=db.as_retriever(search_kwargs={"k": 2}),
	            return_source_documents=False,
	            chain_type_kwargs={"prompt": prompt_template},
	        )

	        response = qa_chain.invoke({"query": message})
	        yield response["result"]

	    except Exception as e:
	        # UI boundary: report the failure in the chat instead of crashing.
	        yield f"An error occurred: {str(e)}"

	# Create Gradio interface
	# The additional inputs are listed in the same order as respond()'s extra
	# parameters: system_message, max_tokens, temperature, top_p.
	demo = gr.ChatInterface(
	    fn=respond,
	    title="ROS2 Expert Assistant",
	    description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
	    additional_inputs=[
	        # Hidden textbox pre-filled with the default system prompt.
	        gr.Textbox(
	            label="System Message",
	            value=DEFAULT_SYSTEM_PROMPT,
	            lines=3,
	            visible=False,
	        ),
	        gr.Slider(
	            label="Max new tokens",
	            minimum=1,
	            maximum=2048,
	            step=1,
	            value=500,
	        ),
	        gr.Slider(
	            label="Temperature",
	            minimum=0.1,
	            maximum=4.0,
	            step=0.1,
	            value=0.1,
	        ),
	        gr.Slider(
	            label="Top-p (nucleus sampling)",
	            minimum=0.1,
	            maximum=1.0,
	            step=0.05,
	            value=0.95,
	        ),
	    ],
	)

	# Start the app only when the file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()