ffreemt committed on
Commit
bc41479
·
1 Parent(s): 1774daa

Update memory=None, prelude

Browse files
Files changed (1) hide show
  1. app.py +24 -12
app.py CHANGED
@@ -166,6 +166,8 @@ if "forindo" in platform.node().lower():
166
  url = "https://huggingface.co/TheBloke/llama-2-70b-Guanaco-QLoRA-GGML/blob/main/llama-2-70b-guanaco-qlora.ggmlv3.q3_K_S.bin" # 29.7G
167
  else:
168
  url = "https://huggingface.co/TheBloke/llama-2-13B-Guanaco-QLoRA-GGML/blob/main/llama-2-13b-guanaco-qlora.ggmlv3.q4_K_S.bin" # 8.14G
 
 
169
  logger.debug(f"{url=}")
170
  try:
171
  model_loc, file_size = dl_hf_model(url)
@@ -209,7 +211,7 @@ memory = ConversationBufferWindowMemory(
209
  conversation = ConversationChain(
210
  llm=LLM,
211
  prompt=prompt,
212
- memory=memory,
213
  verbose=True,
214
  )
215
  logger.debug(f"{conversation.prompt.template=}") # type: ignore
@@ -221,6 +223,7 @@ config.stop = stop
221
  config.threads = cpu_count
222
 
223
  try:
 
224
  LLM_api = CTransformers(
225
  model=model_loc,
226
  model_type="llama",
@@ -282,11 +285,13 @@ def bot(history):
282
  flag = 1
283
  then = time.time()
284
  prefix = "" # to please pyright
 
285
  with about_time() as atime: # type: ignore
286
  while True:
287
  if deq:
288
  if flag:
289
- prefix = f"({time.time() - then:.2f}s) "
 
290
  flag = 0
291
  _ = deq.popleft()
292
  if _ is sig_end:
@@ -299,7 +304,7 @@ def bot(history):
299
  time.sleep(0.01)
300
  _ = (
301
  f"(time elapsed: {atime.duration_human}, " # type: ignore
302
- f"{atime.duration/len(''.join(response)):.2f}s/char)" # type: ignore
303
  )
304
 
305
  history[-1][1] = "".join(response) + f"\n{_}"
@@ -343,8 +348,8 @@ css = """
343
  """
344
  etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
345
  examples_list = [
346
- ["Hello I am mike."],
347
- ["What's my name?"],
348
  ["What NFL team won the Super Bowl in the year Justin Bieber was born?"],
349
  [
350
  "What NFL team won the Super Bowl in the year Justin Bieber was born? Think step by step."
@@ -388,10 +393,16 @@ examples_list = [
388
 
389
  logger.info("start block")
390
 
 
 
 
 
 
391
  with gr.Blocks(
392
  title=f"{Path(model_loc).name}",
393
  theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
394
  css=css,
 
395
  ) as block:
396
  # buff_var = gr.State("")
397
  with gr.Accordion("🎈 Info", open=False):
@@ -399,13 +410,14 @@ with gr.Blocks(
399
  # """<center><a href="https://huggingface.co/spaces/mikeee/mpt-30b-chat?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate"></a> and spin a CPU UPGRADE to avoid the queue</center>"""
400
  # )
401
  gr.Markdown(
402
- f"""<h5><center>{Path(model_loc).name}</center></h4>
403
- The bot can conduct multi-turn conversations, i.e. it remembers past dialogs. The process time is longer.
404
- It typically takes about xxx seconds for the first response to appear.
405
-
406
- Most examples are meant for another model.
407
- You probably should try to test
408
- some related prompts.""",
 
409
  elem_classes="xsmall",
410
  )
411
 
 
166
  url = "https://huggingface.co/TheBloke/llama-2-70b-Guanaco-QLoRA-GGML/blob/main/llama-2-70b-guanaco-qlora.ggmlv3.q3_K_S.bin" # 29.7G
167
  else:
168
  url = "https://huggingface.co/TheBloke/llama-2-13B-Guanaco-QLoRA-GGML/blob/main/llama-2-13b-guanaco-qlora.ggmlv3.q4_K_S.bin" # 8.14G
169
+
170
+ # url = "https://huggingface.co/TheBloke/llama-2-13B-Guanaco-QLoRA-GGML/blob/main/llama-2-13b-guanaco-qlora.ggmlv3.q4_K_S.bin" # 8.14G
171
  logger.debug(f"{url=}")
172
  try:
173
  model_loc, file_size = dl_hf_model(url)
 
211
  conversation = ConversationChain(
212
  llm=LLM,
213
  prompt=prompt,
214
+ # memory=memory, # default memory=None
215
  verbose=True,
216
  )
217
  logger.debug(f"{conversation.prompt.template=}") # type: ignore
 
223
  config.threads = cpu_count
224
 
225
  try:
226
+ # raise Exception # disable api
227
  LLM_api = CTransformers(
228
  model=model_loc,
229
  model_type="llama",
 
285
  flag = 1
286
  then = time.time()
287
  prefix = "" # to please pyright
288
+ prelude = 0.0
289
  with about_time() as atime: # type: ignore
290
  while True:
291
  if deq:
292
  if flag:
293
+ prelude = time.time() - then
294
+ prefix = f"({prelude:.2f}s) "
295
  flag = 0
296
  _ = deq.popleft()
297
  if _ is sig_end:
 
304
  time.sleep(0.01)
305
  _ = (
306
  f"(time elapsed: {atime.duration_human}, " # type: ignore
307
+ f"{(atime.duration - prelude)/len(''.join(response)):.2f}s/char)" # type: ignore
308
  )
309
 
310
  history[-1][1] = "".join(response) + f"\n{_}"
 
348
  """
349
  etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
350
  examples_list = [
351
+ # ["Hello I am mike."],
352
+ # ["What's my name?"],
353
  ["What NFL team won the Super Bowl in the year Justin Bieber was born?"],
354
  [
355
  "What NFL team won the Super Bowl in the year Justin Bieber was born? Think step by step."
 
393
 
394
  logger.info("start block")
395
 
396
+
397
+ port = 7860
398
+ if "forindo" in platform.node():
399
+ port = 7861
400
+
401
  with gr.Blocks(
402
  title=f"{Path(model_loc).name}",
403
  theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
404
  css=css,
405
+ port=port,
406
  ) as block:
407
  # buff_var = gr.State("")
408
  with gr.Accordion("🎈 Info", open=False):
 
410
  # """<center><a href="https://huggingface.co/spaces/mikeee/mpt-30b-chat?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate"></a> and spin a CPU UPGRADE to avoid the queue</center>"""
411
  # )
412
  gr.Markdown(
413
+ (
414
+ f"""<h5><center>{Path(model_loc).name}</center></h4>"""
415
+ # The bot can conduct multi-turn conversations, i.e. it remembers past dialogs. The process time is longer.
416
+ # It typically takes about xxx seconds for the first response to appear.
417
+ "Most examples are meant for another model. "
418
+ "You probably should try to test "
419
+ "some related prompts. "
420
+ ),
421
  elem_classes="xsmall",
422
  )
423