Update app.py
app.py CHANGED
@@ -1,19 +1,17 @@
 from datasets import load_dataset
 from datasets import Dataset
-#from langchain.docstore.document import Document as LangchainDocument
-# from langchain.memory import ConversationBufferMemory
 from sentence_transformers import SentenceTransformer
 import faiss
 import time
 #import torch
 import pandas as pd
 
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, GenerationConfig #, AutoModelForCausalLM
+#from transformers import AutoModelForCausalLM, AutoModel
 from transformers import TextIteratorStreamer
 from threading import Thread
-
+from ctransformers import AutoModelForCausalLM, AutoConfig, Config #, AutoTokenizer
 
-#from huggingface_hub import InferenceClient
 from huggingface_hub import Repository, upload_file
 import os
 
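The import changes swap the transformers model class for the ctransformers one (a GGUF/GGML backend) while keeping the transformers tokenizer and streaming utilities. As a reference only, not part of this commit, the minimal combination these imports set up looks roughly like this (repo and file names are the ones used later in this diff):

    # Reference sketch only; not code from the commit.
    from ctransformers import AutoModelForCausalLM
    from transformers import AutoTokenizer

    llm = AutoModelForCausalLM.from_pretrained(
        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",           # GGUF weights repo
        model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # quantized weights file
        model_type="llama",
        gpu_layers=0,                                       # CPU-only inference
    )
    tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")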
@@ -26,16 +24,6 @@ historylog = [{
 "Output": ''
 }]
 
-llm_model = "TinyLlama/TinyLlama-1.1B-Chat-v0.6"
-
-
-# TheBloke/Llama-2-7B-Chat-GGML , TinyLlama/TinyLlama-1.1B-Chat-v1.0 , microsoft/Phi-3-mini-4k-instruct, health360/Healix-1.1B-V1-Chat-dDPO
-# TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF and tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf not working
-
-model = AutoModelForCausalLM.from_pretrained(llm_model)
-tokenizer = AutoTokenizer.from_pretrained(llm_model)
-#initiate model and tokenizer
-
 data = load_dataset("Namitg02/Test", split='train', streaming=False)
 #Returns a list of dictionaries, each representing a row in the dataset.
 length = len(data)
@@ -49,7 +37,7 @@ index = faiss.IndexFlatL2(embedding_dim)
 data.add_faiss_index("embeddings", custom_index=index)
 # adds an index column for the embeddings
 
-print("
+print("check1")
 #question = "How can I reverse Diabetes?"
 
 SYS_PROMPT = """You are an assistant for answering questions.
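For context, the FAISS index attached by data.add_faiss_index("embeddings", custom_index=index) is built over an embeddings column produced with the SentenceTransformer. The app's encoder name and text column do not appear in this diff, so they are placeholders in this sketch:

    # Sketch only; encoder and text-column names are placeholders, while
    # "embeddings" and IndexFlatL2 match the surrounding code.
    from sentence_transformers import SentenceTransformer
    import faiss

    ST = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder encoder
    data = data.map(lambda row: {"embeddings": ST.encode(row["text"])})  # placeholder column
    embedding_dim = ST.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(embedding_dim)      # exact L2 (Euclidean) index
    data.add_faiss_index("embeddings", custom_index=index)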
@@ -57,18 +45,25 @@ You are given the extracted parts of documents and a question. Provide a convers
 If you don't know the answer, just say "I do not know." Don't make up an answer."""
 # Provides context of how to answer the question
 
+llm_model = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+# TheBloke/Llama-2-7B-Chat-GGML , TinyLlama/TinyLlama-1.1B-Chat-v1.0 , microsoft/Phi-3-mini-4k-instruct, health360/Healix-1.1B-V1-Chat-dDPO
+# TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF and tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf not working, TinyLlama/TinyLlama-1.1B-Chat-v0.6, andrijdavid/TinyLlama-1.1B-Chat-v1.0-GGUF"
 
-
-
-# memory = ConversationBufferMemory(return_messages=True)
-
+tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+#initiate model and tokenizer
 
-
-
-
-
-#
+generation_config = AutoConfig.from_pretrained(
+"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+max_new_tokens= 300,
+# do_sample=True,
+# stream = streamer,
+top_p=0.95,
+temperature=0.4
+# eos_token_id=terminators
+)
+# send additional parameters to model for generation
 
+model = AutoModelForCausalLM.from_pretrained(llm_model, model_file = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", model_type="llama", gpu_layers=0, config = generation_config)
 
 def search(query: str, k: int = 2 ):
 """a function that embeds a new query and returns the most probable results"""
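A ctransformers model takes a plain prompt string rather than token tensors, which is why the transformers tokenizer is kept mainly for its chat template. A hedged sketch of how the two pieces added above can be combined (illustrative, not code from this commit):

    # Sketch only: build a chat-formatted prompt with the transformers tokenizer,
    # then pass the string to the ctransformers model loaded above.
    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": "What is Diabetes?"},
    ]
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    reply = model(prompt_text, max_new_tokens=300, temperature=0.4, top_p=0.95)
    print(reply)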
@@ -82,8 +77,6 @@ def search(query: str, k: int = 2 ):
 # called by talk function that passes prompt
 
 #print(scores, retrieved_examples)
-print("check2A")
-
 
 def format_prompt(prompt,retrieved_documents,k):
 """using the retrieved documents we will prompt the model to generate our responses"""
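The body of search() is unchanged by this commit, so it does not appear in the diff. With the datasets FAISS index added above, such a function typically looks like the following sketch (encoder name is a placeholder; data is the indexed dataset from app.py):

    # Illustrative sketch, not the commit's code.
    from sentence_transformers import SentenceTransformer

    ST = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder encoder

    def search(query: str, k: int = 2):
        """Embed a new query and return the k nearest rows from the indexed dataset."""
        embedded_query = ST.encode(query)
        scores, retrieved_examples = data.get_nearest_examples("embeddings", embedded_query, k=k)
        return scores, retrieved_examples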
@@ -94,109 +87,114 @@ def format_prompt(prompt,retrieved_documents,k):
 
 # Called by talk function to add retrieved documents to the prompt. Keeps adding text of retrieved documents to string taht are retreived
 
-print("check3")
-
 def talk(prompt, history):
 k = 2 # number of retrieved documents
 scores , retrieved_documents = search(prompt, k) # get retrival scores and examples in dictionary format based on the prompt passed
 print(retrieved_documents.keys())
+print("check4")
 formatted_prompt = format_prompt(prompt,retrieved_documents,k) # create a new prompt using the retrieved documents
+print("check5")
 print(retrieved_documents['0'])
 print(formatted_prompt)
 formatted_prompt = formatted_prompt[:600] # to avoid memory issue
-# print(retrieved_documents['0'][1]
-# print(retrieved_documents['0'][2]
 print(formatted_prompt)
 messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
 # binding the system context and new prompt for LLM
 # the chat template structure should be based on text generation model format
-print("
-
-
-
-return_tensors="pt"
-).to(model.device)
-# tell the model to generate
-# add_generation_prompt argument tells the template to add tokens that indicate the start of a bot response
-print("check3C")
-outputs = model.generate(
-input_ids,
-max_new_tokens=300,
-eos_token_id=terminators,
-do_sample=True,
-temperature=0.4,
-top_p=0.95,
+print("check6")
+
+streamer = TextIteratorStreamer(
+tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
 )
+# stores print-ready text in a queue, to be used by a downstream application as an iterator. removes special tokens in generated text.
+# timeout for text queue. tokenizer for decoding tokens
+# called by generate_kwargs
+
+terminators = [
+tokenizer.eos_token_id, # End-of-Sequence Token that indicates where the model should consider the text sequence to be complete
+tokenizer.convert_tokens_to_ids("<|eot_id|>") # Converts a token strings in a single/ sequence of integer id using the vocabulary
+]
+# indicates the end of a sequence
+
+# input_ids = tokenizer.apply_chat_template(
+# "hello",
+# add_generation_prompt=True,
+# return_tensors="pt"
+# )
+# preparing tokens for model input
+# add_generation_prompt argument tells the template to add tokens that indicate the start of a bot response
+# print(input_ids)
+# print("check7")
+# print(input_ids.dtype)
+
+# generate_kwargs = dict(
+# tokens= input_ids) #,
+# streamer=streamer,
+# do_sample=True,
+# eos_token_id=terminators,
+# )
+
+# outputs = model.generate(
+# )
+# print(outputs)
 # calling the model to generate response based on message/ input
 # do_sample if set to True uses strategies to select the next token from the probability distribution over the entire vocabulary
 # temperature controls randomness. more renadomness with higher temperature
 # only the tokens comprising the top_p probability mass are considered for responses
 # This output is a data structure containing all the information returned by generate(), but that can also be used as tuple or dictionary.
-
-
-
-
-# stores print-ready text in a queue, to be used by a downstream application as an iterator. removes specail tokens in generated text.
-# timeout for text queue. tokenizer for decoding tokens
-# called by generate_kwargs
-print("check3E")
-generate_kwargs = dict(
-input_ids= input_ids,
-streamer=streamer,
-max_new_tokens= 200,
-do_sample=True,
-top_p=0.95,
-temperature=0.4,
-eos_token_id=terminators,
-)
-# send additional parameters to model for generation
-print("check3F")
-t = Thread(target=model.generate, kwargs=generate_kwargs)
+#
+
+# print("check10")
+# t = Thread(target=model.generate, kwargs=generate_kwargs)
 # to process multiple instances
-
+# t.start()
+# print("check11")
 # start a thread
-
-
-
-
-
-
-
+outputs = []
+input_ids = llm.tokenize(*messages)
+
+start = time.time()
+NUM_TOKENS=0
+print('-'*4+'Start Generation'+'-'*4)
+for token in model.generate(input_ids):
+print(model.detokenize(input_ids), end='', flush=True)
+NUM_TOKENS+=1
+time_generate = time.time() - start
+print('\n')
+print('-'*4+'End Generation'+'-'*4)
+print(f'Num of generated tokens: {NUM_TOKENS}')
+print(f'Time for complete generation: {time_generate}s')
+print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
+print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
+
+
+#outputtokens = model.generate(input_ids)
+print("check9")
+#print(outputtokens)
+
+#outputs = model.detokenize(outputtokens, decode = True)
+#print(outputs)
+# for token in model.generate(input_ids):
+# print(model.detokenize(token))
+# outputs.append(model.detokenize(token))
+# output = model.detokenize(token)
+# print(outputs)
+# yield "".join(outputs)
+# print("check12")
 
 pd.options.display.max_colwidth = 800
-
-outputstring = ''.join(outputs)
-
-global historylog
-historynew = {
-"Prompt": prompt,
-"Output": outputstring
-}
-historylog.append(historynew)
-return historylog
-print(historylog)
-
-# history.update({prompt: outputstring})
-# print(history)
-#print(memory_string2)
-#with open(logfile, 'a', encoding='utf-8') as f:
-# f.write(memory_string2)
-# f.write('\n')
-#f.close()
-#print(logfile)
-#logfile.push_to_hub("Namitg02/",token = HF_TOKEN)
-#memory_panda = pd.DataFrame()
-#if len(memory_panda) == 0:
-# memory_panda = pd.DataFrame(memory_string)
-#else:
-# memory_panda = memory_panda.append(memory_string, ignore_index=True)
-#print(memory_panda.iloc[[0]])
+print("check13")
+# outputstring = ''.join(outputs)
+
+# global historylog
+# historynew = {
+# "Prompt": prompt,
+# "Output": outputstring
+# }
+# historylog.append(historynew)
+# return historylog
+# print(historylog)
 
-#memory_panda.loc[len(memory_panda.index)] = ['prompt', outputstring]
-#print(memory_panda.iloc[[1]])
-#Logfile = Dataset.from_pandas(memory_panda)
-#Logfile.push_to_hub("Namitg02/Logfile",token = HF_TOKEN)
-
 
 TITLE = "AI Copilot for Diabetes Patients"
 
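The new generation code in talk() builds on ctransformers' low-level tokenize/generate/detokenize calls. For reference, the usual form of that streaming pattern decodes each newly yielded token id against the GGUF model loaded earlier; this is a sketch, not the commit's code:

    # Sketch of the tokenize/generate/detokenize streaming pattern.
    prompt_text = "What is Diabetes?"
    tokens = model.tokenize(prompt_text)          # str -> list of token ids
    pieces = []
    for token in model.generate(tokens):          # yields one token id at a time
        piece = model.detokenize(token)           # decode only the new token
        pieces.append(piece)
        print(piece, end="", flush=True)
    answer = "".join(pieces)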
@@ -218,12 +216,11 @@ demo = gr.ChatInterface(
 examples=[["what is Diabetes? "]],
 title=TITLE,
 description=DESCRIPTION,
-
 )
 # launch chatbot and calls the talk function which in turn calls other functions
-print("
-print(historylog)
-memory_panda = pd.DataFrame(historylog)
-Logfile = Dataset.from_pandas(memory_panda)
-Logfile.push_to_hub("Namitg02/Logfile",token = HF_TOKEN)
+print("check14")
+#print(historylog)
+#memory_panda = pd.DataFrame(historylog)
+#Logfile = Dataset.from_pandas(memory_panda)
+#Logfile.push_to_hub("Namitg02/Logfile",token = HF_TOKEN)
 demo.launch()
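gr.ChatInterface calls talk(message, history) and also accepts a generator that yields partial strings, which is how streamed output reaches the UI; the commented-out lines above show the intended Hub logging. A hedged sketch of that wiring (placeholder stream; HF_TOKEN assumed to be configured elsewhere):

    # Sketch only, not the commit's code.
    import gradio as gr
    import pandas as pd
    from datasets import Dataset

    def talk(prompt, history):
        partial = ""
        for piece in ("I ", "do ", "not ", "know."):  # placeholder token stream
            partial += piece
            yield partial                             # ChatInterface streams yielded strings

    demo = gr.ChatInterface(talk, title="AI Copilot for Diabetes Patients")
    demo.launch()

    # Hub logging along the lines of the commented-out code above:
    # Dataset.from_pandas(pd.DataFrame(historylog)).push_to_hub("Namitg02/Logfile", token=HF_TOKEN)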