cnmoro committed · Commit 22f2310 · verified · 1 Parent(s): d387356

Update app.py

Files changed (1): app.py (+36 -48)
app.py CHANGED
@@ -8,8 +8,11 @@ import gradio as gr
 
 torch.set_num_threads(2)
 
-openrouter_key = os.environ.get("OPENROUTER_KEY")
 model = EmbeddingModel(use_quantized_onnx_model=True)
+tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
+llm = AutoModelForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5")
+
+prompt_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: $PROMPT ASSISTANT: "
 
 def fetch_links(query, max_results=5):
     with DDGS() as ddgs:
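Note on the new model setup: AutoModelForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5") loads the weights in float32, roughly 28 GB for a 7B-parameter model. Since ask_open_llm (next hunk) moves the model to CUDA anyway, a half-precision load is the usual memory saver. The torch_dtype argument is standard transformers API, but using it here is a suggestion, not part of this commit:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
    # fp16 halves the footprint (~14 GB instead of ~28 GB for 7B params) and
    # matches the dtype most GPU kernels expect; the later llm.to(device) call
    # inside the @spaces.GPU function can stay as-is.
    llm = AutoModelForCausalLM.from_pretrained(
        "lmsys/vicuna-7b-v1.5",
        torch_dtype=torch.float16,
    )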
@@ -53,7 +56,34 @@ def retrieval_pipeline(query)
 
     return context, websearch_time, webcrawl_time, embedding_time, retrieval_time, links
 
-async def predict(message, history):
+@spaces.GPU(enable_queue=True)
+def ask_open_llm(prompt):
+    device = torch.device('cuda')
+
+    llm.to(device)
+    model_inputs = tokenizer([
+        prompt
+    ], return_tensors="pt").to(device)
+
+    streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
+
+    generate_kwargs = dict(
+        model_inputs,
+        streamer=streamer,
+        max_new_tokens=512,
+        top_p=0.2,
+        top_k=20,
+        temperature=0.4,
+        repetition_penalty=1.1
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()  # Starting the generation in a separate thread.
+    partial_message = ""
+    for new_token in streamer:
+        partial_message += new_token
+        yield partial_message
+
+def predict(message, history):
     context, websearch_time, webcrawl_time, embedding_time, retrieval_time, links = retrieval_pipeline(message)
 
     if detect_language(message) == Language.ptbr:
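Two things in this hunk look worth flagging. First, Thread(target=model.generate, kwargs=generate_kwargs) points at model, which is the EmbeddingModel; the text generator loaded above is bound to llm, and the embedding model has no generate method. Second, top_p, top_k, and temperature only take effect when do_sample=True is passed; otherwise transformers falls back to greedy decoding and logs a warning. A corrected sketch of the same threaded-streaming pattern (these fixes are mine, not part of the commit):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def ask_open_llm(prompt):
        device = torch.device("cuda")
        llm.to(device)
        model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

        # skip_prompt=True keeps the echoed prompt out of the streamed text.
        streamer = TextIteratorStreamer(
            tokenizer, timeout=120.0, skip_prompt=True, skip_special_tokens=True
        )
        generate_kwargs = dict(
            model_inputs,              # expands to input_ids and attention_mask
            streamer=streamer,
            max_new_tokens=512,
            do_sample=True,            # required for top_p/top_k/temperature to apply
            top_p=0.2,
            top_k=20,
            temperature=0.4,
            repetition_penalty=1.1,
        )
        # Generation runs in a background thread; the streamer yields decoded
        # tokens as they arrive. Target llm.generate, not model.generate.
        Thread(target=llm.generate, kwargs=generate_kwargs).start()

        partial_message = ""
        for new_token in streamer:
            partial_message += new_token
            yield partial_message  # cumulative text so far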
@@ -61,56 +91,14 @@ async def predict(message, history):
     else:
         prompt = f"Context:\n\n{context}\n\nBased on the context, answer: {message}"
 
-    print(prompt)
-
-    url = "https://openrouter.ai/api/v1/chat/completions"
-    headers = { "Content-Type": "application/json",
-                "Authorization": f"Bearer {openrouter_key}" }
-    body = { "stream": True,
-             "models": [
-                 "huggingfaceh4/zephyr-7b-beta:free",
-                 "mistralai/mistral-7b-instruct:free",
-                 "nousresearch/nous-capybara-7b:free",
-                 "openchat/openchat-7b:free"
-             ],
-             "route": "fallback",
-             "max_tokens": 512,
-             "messages": [
-                 {"role": "user", "content": prompt}
-             ] }
+    prompt = prompt_template.replace("$PROMPT", prompt)
 
     full_response = ""
-    async with aiohttp.ClientSession() as session:
-        async with session.post(url, headers=headers, json=body) as response:
-
-            buffer = ""  # A buffer to hold incomplete lines of data
-            async for chunk in response.content.iter_any():
-                buffer += chunk.decode()
-                while "\n" in buffer:  # Process as long as there are complete lines in the buffer
-                    line, buffer = buffer.split("\n", 1)
-
-                    print(line)
-
-                    if line.startswith("data: "):
-                        event_data = line[len("data: "):]
-                        if event_data != '[DONE]':
-                            try:
-                                current_text = json.loads(event_data)['choices'][0]['delta']['content']
-                                full_response += current_text
-                                yield full_response
-                                await asyncio.sleep(0.01)
-                            except Exception as e:
-                                print("Error event 1", e)
-                                try:
-                                    current_text = json.loads(event_data)['choices'][0]['text']
-                                    full_response += current_text
-                                    yield full_response
-                                    await asyncio.sleep(0.01)
-                                except Exception as e:
-                                    print("Error event 2", e)
+    for partial_message in ask_open_llm(prompt):
+        full_response += partial_message
+        yield full_response
 
     final_metadata_block = ""
-
     final_metadata_block += f"Links visited:\n"
     for link in links:
         final_metadata_block += f"{link}\n"
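One consumer-side detail: ask_open_llm already yields the cumulative text on every step, so full_response += partial_message in predict re-appends each snapshot and the displayed answer repeats itself (tokens "a", "b", "c" stream as "a", "ab", "abc", but full_response becomes "a", "aab", "aababc"). If the generator keeps its cumulative contract, assignment is enough; a minimal fix of the changed lines (my correction, not part of the commit):

    full_response = ""
    # ask_open_llm yields the full text-so-far on each iteration, so assign
    # rather than accumulate to avoid duplicating earlier chunks.
    for partial_message in ask_open_llm(prompt):
        full_response = partial_message
        yield full_response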