import torch
from typing import Any, Dict

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"

MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Build the vector store from a fixed set of source pages.
        urls = [
            "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2).html",
            "https://scholars.cityu.edu.hk/en/persons/man-hon-michael-cheung(0f913a96-a28d-47ea-848c-f444804c16f2)/publications.html",
            "https://www.cityu.edu.hk/media/press-release/2022/05/17/cityu-council-announces-appointment-professor-freddy-boey-next-president",
            "https://www.cityu.edu.hk/media/press-release/2023/05/18/professor-freddy-boey-installed-5th-president-cityu",
        ]
        loader = WebBaseLoader(urls)
        docs = loader.load()

        # Split pages into chunks measured in embedding-model tokens, with ~10% overlap.
        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
            chunk_size=512,
            chunk_overlap=int(512 / 10),
            add_start_index=True,
            strip_whitespace=True,
            separators=MARKDOWN_SEPARATORS,
        )
        docs_processed = []
        for doc in docs:
            docs_processed += text_splitter.split_documents([doc])

        # Keep only chunks with unique text so the index holds no duplicate passages.
        unique_texts = {}
        docs_processed_unique = []
        for doc in docs_processed:
            if doc.page_content not in unique_texts:
                unique_texts[doc.page_content] = True
                docs_processed_unique.append(doc)

        embedding_model = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME,
            multi_process=True,
            model_kwargs={"device": "cuda"},
            encode_kwargs={"normalize_embeddings": True},  # required for cosine similarity
        )
        self.vectorstore = FAISS.from_documents(
            docs_processed_unique,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
        )

        # Load the reader LLM, quantized to 4-bit NF4 to reduce GPU memory use.
        READER_MODEL_NAME = path
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            READER_MODEL_NAME, quantization_config=bnb_config
        )
        tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
        # tokenizer.pad_token = tokenizer.eos_token  # only needed if the model has no pad token

        self.READER_LLM = pipeline(
            model=model,
            tokenizer=tokenizer,
            task="text-generation",
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            return_full_text=False,
            max_new_tokens=256,
        )

        # Render the RAG prompt once through the model's own chat template;
        # {context} and {question} are filled in per request.
        prompt_in_chat_format = [
            {
                "role": "system",
                "content": """Using the information contained in the context, respond only to the question asked. The response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not give an answer.""",
            },
            {
                "role": "user",
                "content": """Context:
{context}

Now here is the question you need to answer.

Question: {question}""",
            },
        ]
        self.RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
            prompt_in_chat_format, tokenize=False, add_generation_prompt=True
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        # Get inputs
        inputs = data.pop("inputs", data)
        date = data.pop("date", None)  # accepted but currently unused

        # Retrieve the two most similar chunks; only their text is needed.
        retrieved_docs = self.vectorstore.similarity_search(query=inputs, k=2)
        retrieved_docs_text = [doc.page_content for doc in retrieved_docs]
        context = "\nExtracted documents:\n"
        context += "".join(
            f"Document {i}:::\n{doc}" for i, doc in enumerate(retrieved_docs_text)
        )

        final_prompt = self.RAG_PROMPT_TEMPLATE.format(question=inputs, context=context)

        # Generate an answer; the pipeline returns one dict per prompt.
        answer = self.READER_LLM(final_prompt)[0]
        return answer
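

# --- Usage sketch (illustrative only, not part of the handler contract) ---
# A minimal local smoke test, assuming the reader model weights live at the
# hypothetical path "./model" and that a CUDA device is available; the path
# and the question below are placeholder examples, not values from the source.
if __name__ == "__main__":
    handler = EndpointHandler(path="./model")
    result = handler({"inputs": "Who is the current president of CityU?"})
    # The text-generation pipeline yields a dict like {"generated_text": "..."}.
    print(result.get("generated_text", result))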