import gradio as gr
import time
import os
import glob
import textwrap

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


# Configuration class
class CFG:
    # LLMs
    model_name = 'llama2-13b-chat'  # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    k = 6

    # paths
    PDFs_path = './'  # Set to your PDF path
    Embeddings_path = './faiss-hp-sentence-transformers'
    Output_folder = './rag-vectordb'


# Set preferred encoding to UTF-8 (for non-ASCII characters)
import locale
locale.getpreferredencoding = lambda: "UTF-8"


# Function to download and quantize the chosen model
def get_model(model=CFG.model_name):
    print('\nDownloading model: ', model, '\n\n')

    if model == 'wizardlm':
        model_repo = 'TheBloke/wizardLM-7B-HF'

        tokenizer = AutoTokenizer.from_pretrained(model_repo)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config=bnb_config,
            device_map='auto',
            low_cpu_mem_usage=True
        )

        max_len = 1024

    elif model == 'llama2-7b-chat':
        model_repo = 'daryl149/llama-2-7b-chat-hf'

        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config=bnb_config,
            device_map='auto',
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )

        max_len = 2048

    elif model == 'llama2-13b-chat':
        model_repo = 'daryl149/llama-2-13b-chat-hf'

        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config=bnb_config,
            device_map='auto',
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )

        max_len = 2048

    else:
        raise ValueError(f"Model not implemented: {model}")

    return tokenizer, model, max_len


# Get the model
tokenizer, model, max_len = get_model(CFG.model_name)

# Set up Hugging Face pipeline
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    max_length=max_len,
    temperature=CFG.temperature,
    top_p=CFG.top_p,
    repetition_penalty=CFG.repetition_penalty
)

# LangChain wrapper around the Hugging Face pipeline
llm = HuggingFacePipeline(pipeline=pipe)

# Load the documents
loader = DirectoryLoader(
    CFG.PDFs_path,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True
)
documents = loader.load()

# Split the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CFG.split_chunk_size,
    chunk_overlap=CFG.split_overlap
)
texts = text_splitter.split_documents(documents)

# Set up vector store
vectordb = FAISS.from_documents(
    texts,
    HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo)
)
# Save the vector store
vectordb.save_local(f"{CFG.Output_folder}/faiss_index_rag")

# Define the prompt template
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Set up retriever
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": CFG.k}
)

# Create the retrieval-based QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # other options: "map_reduce", "map_rerank", "refine"
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False
)


# Function to wrap text for proper display
def wrap_text_preserve_newlines(text, width=700):
    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    wrapped_text = '\n'.join(wrapped_lines)
    return wrapped_text


# Function to process model response and append the sources used
def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])

    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )

    ans = ans + '\n\nSources: \n' + sources_used
    return ans


# Function to get the answer from the model
def llm_ans(query):
    start = time.time()

    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)

    end = time.time()
    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str


# Function for Gradio chat interface
def predict(message, history):
    # Convert newlines to HTML line breaks so the answer renders correctly in the chat window
    output = str(llm_ans(message)).replace("\n", "<br/>")
    return output
") return output # Set up Gradio interface demo = gr.ChatInterface( fn=predict, title=f'Open-Source LLM ({CFG.model_name}) Question Answering' ) # Start the Gradio interface demo.queue() demo.launch()