Spaces:

HemaMeena
/

TextTrail

Runtime error

App Files Files Community

TextTrail / app.py

HemaMeena

Update app.py

e2d159d verified 6 months ago

raw

history blame

6.5 kB

	import gradio as gr
	import time
	import os
	import glob
	import textwrap
	import torch
	from transformers import (
	AutoTokenizer, AutoModelForCausalLM,
	BitsAndBytesConfig,
	pipeline
	)
	from langchain.document_loaders import PyPDFLoader, DirectoryLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.vectorstores import FAISS
	from langchain.llms import HuggingFacePipeline
	from langchain.embeddings import HuggingFaceInstructEmbeddings
	from langchain.chains import RetrievalQA
	from langchain.prompts import PromptTemplate

	# Central configuration for the whole script
	class CFG:
	    """Static settings read by the model loader, splitter, retriever and paths."""

	    # --- LLM selection and generation settings ---
	    # Keys with a branch in get_model(): wizardlm, llama2-7b-chat, llama2-13b-chat
	    model_name: str = "llama2-13b-chat"
	    temperature: float = 0
	    top_p: float = 0.95
	    repetition_penalty: float = 1.15

	    # --- document splitting ---
	    split_chunk_size: int = 800
	    split_overlap: int = 0

	    # --- embeddings model repository ---
	    embeddings_model_repo: str = "sentence-transformers/all-MiniLM-L6-v2"

	    # --- number of similar passages requested from the retriever ---
	    k: int = 6

	    # --- filesystem locations ---
	    PDFs_path: str = "./"  # directory handed to the PDF loader; set to your PDF path
	    Embeddings_path: str = "./faiss-hp-sentence-transformers"
	    Output_folder: str = "./rag-vectordb"  # parent folder of the saved FAISS index

	# Set preferred encoding to UTF-8 (for non-ASCII characters).
	# The replacement keeps the optional ``do_setlocale`` parameter of the real
	# locale.getpreferredencoding, so callers that pass an argument, e.g.
	# locale.getpreferredencoding(False), keep working instead of raising TypeError.
	import locale
	locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

	# Function to get model
	def get_model(model = CFG.model_name):
	    """Load the tokenizer and 4-bit quantized causal LM for a supported model key.

	    Args:
	        model: One of 'wizardlm', 'llama2-7b-chat' or 'llama2-13b-chat'.

	    Returns:
	        A ``(tokenizer, model, max_len)`` tuple, where ``max_len`` is the
	        maximum sequence length configured for that model (1024 or 2048).

	    Raises:
	        ValueError: If ``model`` is not one of the supported keys. Previously
	            this case only printed a message and then failed at the return
	            statement with UnboundLocalError.
	    """
	    print('\nDownloading model: ', model, '\n\n')

	    # key -> (Hub repo, tokenizer kwargs, extra model kwargs, max sequence length)
	    # NOTE(review): the 13B recipe does not pass device_map, unlike the other
	    # two; kept exactly as before - confirm whether that omission is intentional.
	    recipes = {
	        'wizardlm': (
	            'TheBloke/wizardLM-7B-HF',
	            {},
	            {'device_map': 'auto'},
	            1024,
	        ),
	        'llama2-7b-chat': (
	            'daryl149/llama-2-7b-chat-hf',
	            {'use_fast': True},
	            {'device_map': 'auto', 'trust_remote_code': True},
	            2048,
	        ),
	        'llama2-13b-chat': (
	            'daryl149/llama-2-13b-chat-hf',
	            {'use_fast': True},
	            {'trust_remote_code': True},
	            2048,
	        ),
	    }

	    if model not in recipes:
	        raise ValueError(
	            f"Model not implemented: {model!r}. "
	            f"Supported models: {', '.join(recipes)}"
	        )

	    model_repo, tokenizer_kwargs, model_kwargs, max_len = recipes[model]

	    tokenizer = AutoTokenizer.from_pretrained(model_repo, **tokenizer_kwargs)

	    # Same quantization settings for every model: 4-bit NF4 weights,
	    # float16 compute dtype, double quantization enabled.
	    bnb_config = BitsAndBytesConfig(
	        load_in_4bit=True,
	        bnb_4bit_quant_type="nf4",
	        bnb_4bit_compute_dtype=torch.float16,
	        bnb_4bit_use_double_quant=True,
	    )

	    # Use a separate name so the ``model`` key string is not shadowed.
	    loaded_model = AutoModelForCausalLM.from_pretrained(
	        model_repo,
	        quantization_config=bnb_config,
	        low_cpu_mem_usage=True,
	        **model_kwargs
	    )

	    return tokenizer, loaded_model, max_len

	# Load tokenizer, model and sequence-length limit for the configured LLM
	tokenizer, model, max_len = get_model(CFG.model_name)

	# Generation settings forwarded to the Hugging Face text-generation pipeline
	_generation_settings = dict(
	    pad_token_id=tokenizer.eos_token_id,
	    max_length=max_len,
	    temperature=CFG.temperature,
	    top_p=CFG.top_p,
	    repetition_penalty=CFG.repetition_penalty,
	)
	pipe = pipeline(
	    task="text-generation",
	    model=model,
	    tokenizer=tokenizer,
	    **_generation_settings
	)

	# Wrap the pipeline so LangChain chains can use it as their LLM
	llm = HuggingFacePipeline(pipeline=pipe)

	# Load the PDFs matching the glob pattern under CFG.PDFs_path
	loader = DirectoryLoader(
	    CFG.PDFs_path,
	    glob="./*.pdf",
	    loader_cls=PyPDFLoader,
	    show_progress=True,
	    use_multithreading=True
	)
	documents = loader.load()

	# Split the documents into chunks of CFG.split_chunk_size characters
	text_splitter = RecursiveCharacterTextSplitter(
	    chunk_size=CFG.split_chunk_size,
	    chunk_overlap=CFG.split_overlap
	)
	texts = text_splitter.split_documents(documents)

	# Fail early with an actionable message when nothing was loaded (no PDF
	# matched, or no text could be extracted) instead of letting the index
	# construction below fail on an empty list of chunks.
	if not texts:
	    raise RuntimeError(
	        f"No text chunks were produced from PDFs in {CFG.PDFs_path!r}; "
	        "cannot build the vector store."
	    )

	# Set up vector store: embed every chunk and index it with FAISS
	vectordb = FAISS.from_documents(
	    texts,
	    HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo)
	)

	# Save the vector store under CFG.Output_folder
	vectordb.save_local(f"{CFG.Output_folder}/faiss_index_rag")

	# Define the prompt template.
	# The text is sent to the LLM verbatim; {context} and {question} are the two
	# placeholders declared as input variables on the PromptTemplate below.
	prompt_template = """
	Don't try to make up an answer, if you don't know just say that you don't know.
	Answer in the same language the question was asked.
	Use only the following pieces of context to answer the question at the end.

	{context}

	Question: {question}
	Answer:"""

	# Wrap the raw template so the QA chain can use it as its prompt
	PROMPT = PromptTemplate(
	template=prompt_template,
	input_variables=["context", "question"]
	)

	# Set up retriever: similarity search returning the CFG.k closest chunks.
	# ``search_type`` is an argument of as_retriever itself; ``search_kwargs``
	# only carries the options forwarded to the vector-store search call, so the
	# search type must not be placed inside it.
	retriever = vectordb.as_retriever(
	    search_type="similarity",
	    search_kwargs={"k": CFG.k}
	)

	# Create the retrieval-based QA chain; source documents are returned with
	# each result so the answer can list the passages it was given.
	qa_chain = RetrievalQA.from_chain_type(
	    llm=llm,
	    chain_type="stuff", # other options: "map_reduce", "map_rerank", "refine"
	    retriever=retriever,
	    chain_type_kwargs={"prompt": PROMPT},
	    return_source_documents=True,
	    verbose=False
	)

	# Wrap long lines for display while keeping the existing line breaks
	def wrap_text_preserve_newlines(text, width=700):
	    """Wrap each newline-separated line of ``text`` to at most ``width`` columns.

	    Every original line is wrapped independently with ``textwrap.fill`` and
	    the results are joined back together with newlines.
	    """
	    return '\n'.join(
	        textwrap.fill(segment, width=width)
	        for segment in text.split('\n')
	    )

	# Function to process model response
	def process_llm_response(llm_response):
	    """Format a QA-chain response as wrapped answer text plus a source list.

	    Args:
	        llm_response: Mapping with a 'result' string and a 'source_documents'
	            iterable whose items expose ``metadata['source']`` (a file path)
	            and ``metadata['page']``.

	    Returns:
	        The wrapped answer followed by a "Sources:" section with one
	        "<file name without extension> - page: <page>" entry per document.
	    """
	    ans = wrap_text_preserve_newlines(llm_response['result'])

	    # Derive the display name with os.path rather than slicing off the last
	    # four characters, so the result does not depend on a '/' separator or
	    # on the extension being exactly four characters long.
	    sources_used = ' \n'.join(
	        os.path.splitext(os.path.basename(source.metadata['source']))[0]
	        + ' - page: '
	        + str(source.metadata['page'])
	        for source in llm_response['source_documents']
	    )

	    return ans + '\n\nSources: \n' + sources_used

	# Run one query through the QA chain and report how long it took
	def llm_ans(query):
	    """Return the formatted answer for ``query`` with the elapsed time appended.

	    The timer covers both the chain invocation and the response formatting;
	    the elapsed time is rounded to whole seconds.
	    """
	    started_at = time.time()
	    answer = process_llm_response(qa_chain.invoke(query))
	    elapsed_seconds = round(time.time() - started_at)
	    return f'{answer}\n\nTime elapsed: {elapsed_seconds} s'

	# Callback used by the Gradio chat interface
	def predict(message, history):
	    """Answer ``message`` and convert newlines to ``<br/>`` for display.

	    ``history`` is accepted as part of the callback signature but is not used.
	    """
	    answer_text = str(llm_ans(message))
	    return answer_text.replace("\n", "<br/>")

	# Build the chat UI around predict(); the title shows the configured model
	chat_title = f'Open-Source LLM ({CFG.model_name}) Question Answering'
	demo = gr.ChatInterface(fn=predict, title=chat_title)

	# Turn on the queue, then start serving the interface
	demo.queue()
	demo.launch()