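# Q&A chatbot over Lambeth Census data: loads a FAISS vector store of document
# embeddings, builds a question-plus-context prompt from retrieved passages and
# chat history, and streams answers from a flan-alpaca model via Gradio callbacks.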
import os
import datetime
from typing import Dict, List, Tuple
from itertools import compress
import pandas as pd

from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.prompts import PromptTemplate
from langchain.retrievers import TFIDFRetriever, SVMRetriever
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline

from pydantic import BaseModel

import nltk
from nltk.tokenize import word_tokenize

import torch

from optimum.pipelines import pipeline
from transformers import AutoTokenizer, TextStreamer, AutoModelForSeq2SeqLM, TextIteratorStreamer
from threading import Thread

import gradio as gr

def get_faiss_store(faiss_vstore_folder, embeddings):
    """Unzip the saved FAISS index, load it into memory, then tidy up the extracted files."""
    import zipfile
    with zipfile.ZipFile(faiss_vstore_folder + '/faiss_lambeth_census_embedding.zip', 'r') as zip_ref:
        zip_ref.extractall(faiss_vstore_folder)

    faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings)

    # The index is now held in memory, so the extracted files can be removed.
    os.remove(faiss_vstore_folder + "/index.faiss")
    os.remove(faiss_vstore_folder + "/index.pkl")

    return faiss_vstore

def create_hf_model(model_name="declare-lab/flan-alpaca-large"):
    """Load a seq2seq model and tokenizer, in 8-bit on GPU if one is available."""
    model_id = model_name
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Running on device:", torch_device)
    print("CPU threads:", torch.get_num_threads())

    if torch_device == "cuda":
        # 8-bit loading needs the bitsandbytes and accelerate packages.
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    return model, tokenizer, torch_device

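# Stop words (standard English plus question/report phrasing) are stripped from user
# questions so that only content-bearing keywords are passed to the retrievers.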
nltk.download('stopwords')
nltk.download('punkt')  # required by word_tokenize in remove_q_stopwords
stopwords = nltk.corpus.stopwords.words('english')

newStopWords = ['what', 'how', 'when', 'which', 'who', 'change', 'changed', 'do', 'did', 'increase', 'decrease', 'increased',
                'decreased', 'proportion', 'percentage', 'report', 'reporting', 'say', 'said']
stopwords.extend(newStopWords)

# Embeddings and vector store
embed_model_name = "hkunlp/instructor-large"
embeddings = HuggingFaceInstructEmbeddings(model_name=embed_model_name)
vectorstore = get_faiss_store(faiss_vstore_folder="faiss_lambeth_census_embedding", embeddings=embeddings)

# Generation model
checkpoint = 'declare-lab/flan-alpaca-large'

model, tokenizer, torch_device = create_hf_model(model_name=checkpoint)

streamer = TextStreamer(tokenizer, skip_prompt=True)

pipe = pipeline('text2text-generation',
                model=checkpoint,
                max_length=512,
                temperature=0.000001,  # near-zero temperature for effectively deterministic output
                accelerator="bettertransformer",
                streamer=streamer
                )

# Keyphrase-generation model used to condense earlier chat turns into keywords
checkpoint_keywords = 'ml6team/keyphrase-generation-t5-small-inspec'

keyword_model = pipeline('text2text-generation',
                         model=checkpoint_keywords,
                         accelerator="bettertransformer"
                         )

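# Illustrative only (not part of the original flow): the keyphrase pipeline can be
# invoked directly, e.g. keyword_model("What proportion of households own their home?"),
# which returns a list of dicts of the form [{"generated_text": "..."}]; the generated
# keyphrases themselves depend on the model.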
def clear_chat(chat_history_state, sources, chat_message):
    chat_history_state = []
    sources = ''
    chat_message = ''
    return chat_history_state, sources, chat_message

def _get_chat_history(chat_history: List[Tuple[str, str]]):
    """Return the recent conversation as a string, plus the first question in that window."""
    max_chat_length = 3

    if len(chat_history) > max_chat_length:
        chat_history = chat_history[-max_chat_length:]

    print(chat_history)

    first_q = ""
    for human_s, ai_s in chat_history:
        first_q = human_s
        break

    conversation = ""
    for human_s, ai_s in chat_history:
        human = "Human: " + human_s
        ai = "Assistant: " + ai_s
        conversation += "\n" + "\n".join([human, ai])

    return conversation, first_q

def adapt_q_from_chat_history(keyword_model, new_question_keywords, question, chat_history):
    """Prefix the new question with keyphrases extracted from the first question in the chat history."""
    t5_small_keyphrase = HuggingFacePipeline(pipeline=keyword_model)
    memory_llm = t5_small_keyphrase
    new_q_memory_llm = t5_small_keyphrase

    memory_prompt = PromptTemplate(
        template="{chat_history_first_q}",
        input_variables=["chat_history_first_q"]
    )

    memory_extractor = LLMChain(llm=memory_llm, prompt=memory_prompt)

    print("new_question_keywords:")
    print(new_question_keywords)

    chat_history_str, chat_history_first_q = _get_chat_history(chat_history)
    if chat_history_str:
        # Extract keyphrases from the first question and prepend them to the new question.
        extracted_memory = memory_extractor.run(
            chat_history_first_q=chat_history_first_q
        )

        new_question_kworded = extracted_memory + " " + new_question_keywords
        new_question = extracted_memory + " " + question

    else:
        new_question = question
        new_question_kworded = new_question_keywords

    return new_question, new_question_kworded

def remove_q_stopwords(question):
    """Lower-case the question and drop stop words, returning the keywords and the unchanged question."""
    text = question.lower()
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if word not in stopwords]
    new_question_keywords = ' '.join(tokens_without_sw)
    return new_question_keywords, question

def create_final_prompt(inputs: Dict[str, str], vectorstore, instruction_prompt, content_prompt):
    """Build the instruction prompt from the question, chat history and retrieved passages."""
    question = inputs["question"]
    chat_history = inputs["chat_history"]

    new_question_keywords, question = remove_q_stopwords(question)

    new_question, new_question_kworded = adapt_q_from_chat_history(keyword_model, new_question_keywords, question, chat_history)

    print("The question passed to the vector search is:")
    print(new_question_kworded)

    docs_keep_as_doc, docs_content, docs_url = find_relevant_passages(new_question_kworded, embeddings, k_val=3, out_passages=2,
                                                                      vec_score_cut_off=1.3, vec_weight=1, tfidf_weight=0.5, svm_weight=1)

    if docs_keep_as_doc == []:
        # No passage passed the relevance cut-off, so return fallback messages instead of a prompt.
        return ("I'm sorry, I couldn't find a relevant answer to this question.",
                "I'm sorry, I couldn't find a relevant source for this question.")

    string_docs_content = '\n\n\n'.join(docs_content)

    instruction_prompt_out = instruction_prompt.format(question=new_question, summaries=string_docs_content)

    return instruction_prompt_out, string_docs_content

def create_prompt_templates():

    CONTENT_PROMPT = PromptTemplate(
        template="{page_content}\n\n",
        input_variables=["page_content"]
    )

    instruction_prompt_template = """
{summaries}

QUESTION: {question}

Quote relevant text above."""

    INSTRUCTION_PROMPT = PromptTemplate(template=instruction_prompt_template, input_variables=['question', 'summaries'])

    return INSTRUCTION_PROMPT, CONTENT_PROMPT

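# For reference, INSTRUCTION_PROMPT.format(summaries="<retrieved passages>", question="<user question>")
# produces text of the form:
#
#   <retrieved passages>
#
#   QUESTION: <user question>
#
#   Quote relevant text above.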
def get_history_sources_final_input_prompt(user_input, history):
    """Gradio callback: build the final prompt and sources text for the latest user input."""
    print("\n==== date/time: " + str(datetime.datetime.now()) + " ====")
    print("User input: " + user_input)

    history = history or []

    instruction_prompt, content_prompt = create_prompt_templates()
    instruction_prompt_out, string_docs_content =\
        create_final_prompt({"question": user_input, "chat_history": history}, vectorstore,
                            instruction_prompt, content_prompt)

    sources_txt = string_docs_content

    # Record the turn as a (question, answer placeholder) pair so that
    # _get_chat_history can unpack it on the next turn.
    history.append((user_input, ""))

    print("Output history is:")
    print(history)

    print("The output prompt is:")
    print(instruction_prompt_out)

    return history, sources_txt, instruction_prompt_out

def produce_streaming_answer_chatbot(history, full_prompt):
    """Generate the answer in a background thread and stream tokens into the chat history."""
    print("The question is: ")
    print(full_prompt)

    model_inputs = tokenizer(text=full_prompt, return_tensors="pt").to(torch_device)

    # TextIteratorStreamer yields decoded tokens as they are produced, so the
    # chatbot can be updated incrementally while model.generate runs in a thread.
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=float(0.00001)
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    history[-1][1] = ""
    for new_text in streamer:
        history[-1][1] += new_text
        yield history

def user(user_message, history):
    # Clear and lock the textbox, and add the new user turn to the chatbot history.
    return gr.update(value="", interactive=False), history + [[user_message, None]]


def add_inputs_answer_to_history(user_message, history):

    print("History after appending is:")
    print(history)

    return history

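# find_relevant_passages combines three retrievers: FAISS vector similarity (filtered by a
# score cut-off), then TF-IDF and SVM retrieval over the surviving passages. Each method
# scores a passage as (number of kept passages / rank) * weight, the three scores are summed,
# and the top `out_passages` passages by combined score are returned.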
def find_relevant_passages(new_question_kworded, embeddings, k_val, out_passages, vec_score_cut_off, vec_weight, tfidf_weight, svm_weight, vectorstore=vectorstore):

    # Vector search, keeping only passages whose distance is below the cut-off.
    docs = vectorstore.similarity_search_with_score(new_question_kworded, k=k_val)

    docs_scores = [x[1] for x in docs]

    score_more_limit = pd.Series(docs_scores) < vec_score_cut_off
    docs_keep = list(compress(docs, score_more_limit))

    if docs_keep == []:
        docs_keep_as_doc = []
        docs_content = []
        docs_url = []
        return docs_keep_as_doc, docs_content, docs_url

    docs_keep_as_doc = [x[0] for x in docs_keep]
    docs_keep_length = len(docs_keep_as_doc)

    # Reciprocal-rank style score for the vector search results.
    vec_rank = [*range(1, docs_keep_length+1)]
    vec_score = [(docs_keep_length/x)*vec_weight for x in vec_rank]

    content_keep = []
    for item in docs_keep:
        content_keep.append(item[0].page_content)

    # TF-IDF retrieval over the kept passages.
    tfidf_retriever = TFIDFRetriever.from_texts(content_keep, k=k_val)
    tfidf_result = tfidf_retriever.get_relevant_documents(new_question_kworded)

    tfidf_rank = []
    tfidf_score = []

    for vec_item in docs_keep:
        x = 0
        for tfidf_item in tfidf_result:
            x = x + 1
            if tfidf_item.page_content == vec_item[0].page_content:
                tfidf_rank.append(x)
                tfidf_score.append((docs_keep_length/x)*tfidf_weight)

    # SVM retrieval over the kept passages.
    svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k=k_val)
    svm_result = svm_retriever.get_relevant_documents(new_question_kworded)

    svm_rank = []
    svm_score = []

    for vec_item in docs_keep:
        x = 0
        for svm_item in svm_result:
            x = x + 1
            if svm_item.page_content == vec_item[0].page_content:
                svm_rank.append(x)
                svm_score.append((docs_keep_length/x)*svm_weight)

    # Combine the three scores and rank the passages.
    final_score = [a + b + c for a, b, c in zip(vec_score, tfidf_score, svm_score)]
    final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]

    # Positions of the top-ranked passages; ties can leave a rank unused, hence the ValueError guard.
    best_rank_index_pos = []

    for x in range(1, out_passages+1):
        try:
            best_rank_index_pos.append(final_rank.index(x))
        except ValueError:
            pass

    best_rank_pos_series = pd.Series(best_rank_index_pos)

    docs_keep_out = [docs_keep[i] for i in best_rank_index_pos]

    docs_keep_as_doc = [x[0] for x in docs_keep_out]

    content = []
    meta_url = []
    score = []

    for item in docs_keep_out:
        content.append(item[0].page_content)
        meta_url.append(item[0].metadata['source'])
        score.append(item[1])

    doc_df = pd.DataFrame(list(zip(content, meta_url, score)),
                          columns=['page_content', 'meta_url', 'score'])

    docs_content = doc_df['page_content'].astype(str)
    docs_url = "https://" + doc_df['meta_url']

    return docs_keep_as_doc, docs_content, docs_url

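# The section above defines the Gradio callbacks but not the UI itself. Below is a
# minimal sketch of how they could be wired into a gr.Blocks app; the component names,
# layout and event chain are assumptions for illustration, not the original app.
if __name__ == "__main__":
    with gr.Blocks() as block:
        chat_history_state = gr.State()       # (question, answer) pairs used for memory
        instruction_prompt_state = gr.State() # final prompt passed to the model

        chatbot = gr.Chatbot()
        sources = gr.Textbox(label="Sources", interactive=False)
        message = gr.Textbox(label="Ask a question about the Lambeth Census data")
        clear = gr.Button("Clear chat")

        # Build the prompt while the textbox still holds the question, then add the
        # turn to the chatbot display, then stream the model's answer into it.
        submit_event = message.submit(get_history_sources_final_input_prompt,
                                      inputs=[message, chat_history_state],
                                      outputs=[chat_history_state, sources, instruction_prompt_state])
        submit_event.then(user, inputs=[message, chatbot], outputs=[message, chatbot]) \
                    .then(produce_streaming_answer_chatbot,
                          inputs=[chatbot, instruction_prompt_state],
                          outputs=chatbot) \
                    .then(lambda: gr.update(interactive=True), None, message)

        clear.click(clear_chat,
                    inputs=[chat_history_state, sources, message],
                    outputs=[chat_history_state, sources, message])

    # Queueing is needed so the streaming (generator) callback can update the chatbot.
    block.queue().launch()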