Spaces:

mannadamay12
/

rag-ros2

Running

App Files Files Community

mannadamay12 committed on Dec 7, 2024

Commit

55bd66e

verified ·

1 Parent(s): 3438e5b

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -35

app.py CHANGED Viewed

@@ -1,61 +1,80 @@
-import gradio as gr
-import spaces
 import os
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
-from langchain.llms import HuggingFacePipeline
-from huggingface_hub import InferenceClient
-# GPU initialization moved into a function
 def initialize_model():
     # Pre-commit version of the loader. Imports torch/transformers lazily
     # inside the function, then loads the tokenizer and model for `model_id`
     # and returns them as a (model, tokenizer) tuple.
-    import torch
-    from transformers import (
-        AutoTokenizer,
-        TextStreamer,
-        pipeline,
-        BitsAndBytesConfig,
-        AutoModelForCausalLM
-    )
     model_id = "meta-llama/Llama-3.2-3B-Instruct"
     # Access token is read from the HF_TOKEN environment variable; it is None
     # when the variable is unset.
     token = os.environ.get("HF_TOKEN")
     # 4-bit NF4 quantization with double quantization and bfloat16 compute.
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16
-    )
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         token=token,
-        quantization_config=bnb_config
     )
     return model, tokenizer
-# Initialize non-GPU components: instructor-base embeddings on CPU and the Chroma store persisted in "db"
-embeddings = HuggingFaceInstructEmbeddings(
-    model_name="hkunlp/instructor-base",
-    model_kwargs={"device": "cpu"}
-)
-db = Chroma(
-    persist_directory="db",
-    embedding_function=embeddings
-)
-@spaces.GPU(duration=30)
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
-        # Initialize model components inside the GPU scope
         model, tokenizer = initialize_model()
-        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         text_pipeline = pipeline(
             "text-generation",
             model=model,

 import os
+import spaces
+import gradio as gr
+import torch
+torch.jit.script = lambda f: f  # Avoid script error in lambda
+# Initialize non-GPU components first
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
+# System prompts used when building the LLM prompt text
 # DEFAULT_SYSTEM_PROMPT is the default `system_prompt` of generate_prompt().
+DEFAULT_SYSTEM_PROMPT = """
+Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
+respond with "I don't know" or a similar acknowledgment that the answer is not available.
+""".strip()
 # SYSTEM_PROMPT is the one actually passed when building `template` below.
 # NOTE(review): the string ends with ".?" -- looks like stray punctuation; confirm.
+SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"
+def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
     # Wrap `system_prompt` in <<SYS>> ... <</SYS>> and place it together with
     # `prompt` inside an [INST] ... [/INST] pair; the surrounding whitespace of
     # the resulting string is stripped before it is returned.
     # NOTE(review): these markers look like the Llama-2 chat format, while
     # initialize_model() in this file loads "meta-llama/Llama-3.2-3B-Instruct".
     # Confirm this layout matches what that model expects.
+    return f"""
+[INST] <<SYS>>
+{system_prompt}
+<</SYS>>
+{prompt} [/INST]
+""".strip()
+template = generate_prompt(
     # The argument below is a plain (non-f) string, so {context} and {question}
     # stay as literal placeholders in the generated template text.
+    """
+{context}
+Question: {question}
+""",
+    system_prompt=SYSTEM_PROMPT,
+)
 # Declare the two placeholders left in `template` as the prompt's input variables.
+prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
+# Initialize embeddings (instructor-base, pinned to CPU) and the Chroma store persisted under "db"
 # Both are created at module import time, outside the @spaces.GPU-decorated respond().
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name="hkunlp/instructor-base",
+    model_kwargs={"device": "cpu"}
+)
+db = Chroma(
+    persist_directory="db",
+    embedding_function=embeddings
+)
 def initialize_model():
     # Load the tokenizer and causal-LM weights for `model_id`, move the model
     # to CUDA when a device is available, and return (model, tokenizer).
     # NOTE(review): respond() calls this on every invocation, so the model is
     # loaded again per request as far as this view shows -- consider caching.
     # NOTE(review): TextStreamer, pipeline and HuggingFacePipeline are imported
     # here but not used in this function; function-local imports are not
     # visible to the caller (respond() re-imports TextStreamer and pipeline).
+    from transformers import AutoTokenizer, TextStreamer, pipeline, AutoModelForCausalLM
+    from langchain.llms import HuggingFacePipeline
     model_id = "meta-llama/Llama-3.2-3B-Instruct"
     # Access token is read from the HF_TOKEN environment variable; it is None
     # when the variable is unset.
     token = os.environ.get("HF_TOKEN")
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         token=token,
     )
     # Without CUDA the model is returned on whatever device it was loaded on.
+    if torch.cuda.is_available():
+        model = model.to("cuda")
     return model, tokenizer
+@spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
+        # Initialize model components inside GPU context
         model, tokenizer = initialize_model()
+        from transformers import TextStreamer, pipeline
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         text_pipeline = pipeline(
             "text-generation",
             model=model,