gufett0 committed
Commit: baf000f
Parent: ed51056

added async

Files changed (1):
  1. backend.py +21 -32
backend.py CHANGED
@@ -24,11 +24,14 @@ model_id = "google/gemma-2-2b-it"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    device_map="auto",
+    device_map="auto", ## change this back to auto!!!
     torch_dtype= torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    token=True
-    )
+    token=True)
 model.eval()
+
+#from accelerate import disk_offload
+#disk_offload(model=model, offload_dir="offload")
+
 # what models will be used by LlamaIndex:
 Settings.embed_model = InstructorEmbedding(model_name="hkunlp/instructor-base")
 Settings.llm = GemmaLLMInterface(model=model, tokenizer=tokenizer)
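
The hunk above keeps device_map="auto" (with a reminder comment) and leaves an accelerate disk-offload path commented out as a fallback. For reference, a minimal sketch of how that commented-out path could be wired in when GPU memory is tight; it assumes the accelerate package is installed and uses the "offload" directory named in the comment:

```python
# Sketch only: disk offload as an alternative to device_map="auto".
# Mirrors the commented-out lines in backend.py; assumes accelerate is installed.
import torch
from accelerate import disk_offload
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    token=True,
)
# Move the weights to ./offload; they are loaded back on demand at inference time.
disk_offload(model=model, offload_dir="offload")
model.eval()
```

Disk offload trades speed for memory: weights are paged back in from the offload directory during each forward pass, so it is usually a last resort compared with keeping the model resident via device_map="auto".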
 
@@ -54,8 +57,7 @@ def build_index():
 
 
 @spaces.GPU(duration=20)
-def handle_query(query_str, chathistory):
-
+async def handle_query(query_str, chathistory):
     index = build_index()
 
     qa_prompt_str = (

@@ -71,45 +73,32 @@ def handle_query(query_str, chathistory)
     chat_text_qa_msgs = [
         (
             "system",
-            "Sei un assistente italiano di nome Ossy che risponde solo alle domande o richieste pertinenti. ",
+            "Sei un assistente italiano di nome Ossy che risponde solo alle domande o richieste pertinenti.",
         ),
         ("user", qa_prompt_str),
     ]
     text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)
 
     try:
-        # Create a streaming query engine
-        """query_engine = index.as_query_engine(text_qa_template=text_qa_template, streaming=False, similarity_top_k=1)
-
-        # Execute the query
-        streaming_response = query_engine.query(query_str)
-
-        r = streaming_response.response
-        cleaned_result = r.replace("<end_of_turn>", "").strip()
-        yield cleaned_result"""
-
-        # Stream the response
-        """outputs = []
-        for text in streaming_response.response_gen:
-
-            outputs.append(str(text))
-            yield "".join(outputs)"""
-
         memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
         chat_engine = index.as_chat_engine(
-            chat_mode="context",
-            memory=memory,
-            system_prompt=(
-                "Sei un assistente italiano di nome Ossy che risponde solo alle domande o richieste pertinenti. "
-            ),
+            chat_mode="context",
+            memory=memory,
+            system_prompt=(
+                "Sei un assistente italiano di nome Ossy che risponde solo alle domande o richieste pertinenti."
+            ),
         )
 
+        # Stream the response
         response = chat_engine.stream_chat(query_str)
-        #response = chat_engine.chat(query_str)
-        for token in response.response_gen:
-            yield token
-
+        outputs = []
+
+        async for token in response.response_gen:
+            outputs.append(token)
+            yield "".join(outputs)
 
+    except StopAsyncIteration:
+        yield "No more responses to stream."
     except Exception as e:
         yield f"Error processing query: {str(e)}"
 
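
A note on the new streaming loop: stream_chat returns a response whose response_gen is a plain synchronous generator in current llama_index releases, so driving it with async for will typically raise a TypeError even though handle_query is now declared async. A minimal sketch of a fully asynchronous variant, assuming a recent llama_index.core that exposes astream_chat and async_response_gen(), and reusing build_index and ChatMemoryBuffer from backend.py:

```python
# Sketch only: a fully asynchronous variant of handle_query, assuming a recent
# llama_index.core that provides astream_chat() and async_response_gen().
from llama_index.core.memory import ChatMemoryBuffer


async def handle_query(query_str, chathistory):
    # build_index() comes from backend.py and returns the LlamaIndex index.
    index = build_index()

    memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
    chat_engine = index.as_chat_engine(
        chat_mode="context",
        memory=memory,
        system_prompt=(
            "Sei un assistente italiano di nome Ossy che risponde solo alle domande o richieste pertinenti."
        ),
    )
    try:
        # astream_chat is awaited and hands back an async token generator,
        # so async for is valid here.
        response = await chat_engine.astream_chat(query_str)
        outputs = []
        async for token in response.async_response_gen():
            outputs.append(token)
            yield "".join(outputs)
    except Exception as e:
        yield f"Error processing query: {str(e)}"
```

If the synchronous stream_chat call is kept instead, a plain for loop over response.response_gen inside the async handler also works, as the pre-async version of this function did.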