Update app.py
app.py CHANGED
@@ -8,13 +8,7 @@ import json
 import pandas as pd
 from llama_cpp import Llama
 from langchain_community.llms import LlamaCpp
-
-from transformers import AutoTokenizer, GenerationConfig #, AutoModelForCausalLM
-#from transformers import AutoModelForCausalLM, AutoModel
-from transformers import TextIteratorStreamer
 from threading import Thread
-from ctransformers import AutoModelForCausalLM, AutoConfig, Config #, AutoTokenizer
-
 from huggingface_hub import Repository, upload_file
 import os
 
@@ -50,27 +44,6 @@ If you don't know the answer, just say "I do not know." Don't make up an answer.
 llm_model = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
 # TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF and tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf not working, TinyLlama/TinyLlama-1.1B-Chat-v0.6, andrijdavid/TinyLlama-1.1B-Chat-v1.0-GGUF"
 
-#tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-#initiate model and tokenizer
-
-#generation_config = AutoConfig.from_pretrained(
-#    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-#    max_new_tokens= 300,
-#    do_sample=True,
-#    stream = streamer,
-#    top_p=0.95,
-#    temperature=0.4,
-#    stream = True
-#    eos_token_id=terminators
-#)
-# send additional parameters to model for generation
-#terminators = [
-#    tokenizer.eos_token_id, # End-of-Sequence Token that indicates where the model should consider the text sequence to be complete
-#    tokenizer.convert_tokens_to_ids("<|eot_id|>") # Converts a token strings in a single/ sequence of integer id using the vocabulary
-#]
-# indicates the end of a sequence
-
-#model = llama_cpp.Llama(model_path = tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf AutoModelForCausalLM.from_pretrained(llm_model, model_file = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", model_type="llama", gpu_layers=0, config = generation_config)
 model = Llama(
     model_path="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
     # chat_format="llama-2",
@@ -82,6 +55,7 @@ model = Llama(
     # callback_manager=callback_manager,
     # verbose=True, # Verbose is required to pass to the callback manager
 )
+#initiate model and tokenizer
 
 def search(query: str, k: int = 2 ):
     """a function that embeds a new query and returns the most probable results"""
@@ -136,15 +110,11 @@ def talk(prompt, history):
     # formatted_prompt_with_history = formatted_prompt_with_history[:600] # to avoid memory issue
     # print(formatted_prompt_with_history)
     messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
-    # messages = "role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
-    print(messages)
     # binding the system context and new prompt for LLM
     # the chat template structure should be based on text generation model format
-    print("check6")
 
     # indicates the end of a sequence
     stream = model.create_chat_completion(messages = messages, max_tokens=1000, stop=["</s>"], stream=False)
-    # stream = model(prompt = messages, max_tokens=1000, stop=["</s>"],echo=True, stream=False)
     print(f"{stream}")
     print("check 7")
     print(stream['choices'][0]['message']['content'])
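
For reference, a minimal standalone sketch of the llama-cpp-python path that app.py now relies on: loading the local GGUF file with Llama and calling create_chat_completion with the same max_tokens, stop, and stream=False arguments shown in the diff. The system/user messages and the n_ctx value are illustrative placeholders, not taken from the repository.

from llama_cpp import Llama

# Sketch only: loads the same local GGUF file referenced in the diff.
llm = Llama(
    model_path="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    n_ctx=2048,      # assumed context size; not shown in the diff
    verbose=False,
)

# Placeholder messages; app.py builds these from SYS_PROMPT and the formatted prompt.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarise what retrieval-augmented generation is."},
]

# Non-streaming chat completion, matching stream=False in the diff; the return
# value follows the OpenAI-style schema used by llama-cpp-python.
result = llm.create_chat_completion(
    messages=messages,
    max_tokens=1000,
    stop=["</s>"],
    stream=False,
)
print(result["choices"][0]["message"]["content"])

Because stream=False, the whole completion comes back as a single dict; switching to stream=True would instead yield incremental chunks that have to be iterated and concatenated.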