# Spaces: Sleeping
from dotenv import load_dotenv | |
from PyPDF2 import PdfReader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.vectorstores import FAISS | |
from langchain.llms import HuggingFaceHub | |
from langchain.memory import ConversationBufferMemory | |
from langchain.chains import ConversationalRetrievalChain | |
load_dotenv() | |
## Read multiple PDF files
def read_multiple_pdf(files):
    """
    Extract and concatenate the text of one or more PDF files.

    Args:
        files (str | iterable): Either a single PDF path given as a string,
            or an iterable of items that ``PdfReader`` accepts.

    Returns:
        str: The extracted text of every page of every file, concatenated
        in order with no separator. An empty iterable yields "".
    """
    # A lone string is one path: wrap it so the loop below opens that file
    # instead of iterating over the string's characters.
    if isinstance(files, str):
        files = [files]

    page_texts = []
    for file in files:
        reader = PdfReader(file)
        for page in reader.pages:
            # `or ""` keeps the join safe should a page yield no text.
            page_texts.append(page.extract_text() or "")

    # Join once instead of growing a string with repeated `+=`.
    return "".join(page_texts)
## Split the PDF text into chunks
def chunk_docs(document, chunk_size=500, chunk_overlap=50, separators="\n"):
    """
    Split a document into smaller chunks of text.

    Args:
        document (str): The document to be chunked.
        chunk_size (int, optional): The size of each chunk in characters. Defaults to 500.
        chunk_overlap (int, optional): The overlap between adjacent chunks in characters. Defaults to 50.
        separators (str | list[str], optional): A single separator, or a list of
            separators, handed to the splitter. Defaults to "\\n".

    Returns:
        list[str]: The chunks returned by ``RecursiveCharacterTextSplitter.split_text``.
    """
    # The splitter expects a list of separators. A bare multi-character string
    # would be walked one character at a time, so wrap it into a one-element
    # list. An empty string is passed through unchanged so the splitter's own
    # handling of a falsy value still applies.
    if isinstance(separators, str) and separators:
        separators = [separators]

    text_splitter = RecursiveCharacterTextSplitter(
        separators=separators,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(document)
## Embed the chunks into a vector store
def embedding_chunks(chunk, model_name="sentence-transformers/all-MiniLM-L12-v2"):
    """
    Build a FAISS vector store from text chunks.

    Args:
        chunk: The texts to index; passed as-is to ``FAISS.from_texts``.
        model_name (str, optional): Model name given to ``HuggingFaceEmbeddings``.
            Defaults to "sentence-transformers/all-MiniLM-L12-v2".

    Returns:
        The vector store object returned by ``FAISS.from_texts``.
    """
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(chunk, embedder)
## Set up the conversational chain
def chain_conversation(
    vector_stores,
    config=None,
    model_repo="mistralai/Mixtral-8x7B-Instruct-v0.1",
):
    """
    Build a conversational retrieval chain on top of a vector store.

    Args:
        vector_stores: Object exposing ``as_retriever``; used as the chain's retriever.
        config (dict, optional): Passed to ``HuggingFaceHub`` as ``model_kwargs``.
            When omitted (None), ``{'max_new_tokens': 256, 'temperature': 0.1}`` is used.
        model_repo (str, optional): Repository id given to ``HuggingFaceHub``.
            Defaults to "mistralai/Mixtral-8x7B-Instruct-v0.1".

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    # Build the kwargs per call: a dict literal in the signature would be a
    # single object shared by every invocation. The caller's dict is copied
    # so it is never aliased by the LLM wrapper.
    if config is None:
        model_kwargs = {'max_new_tokens': 256, 'temperature': 0.1}
    else:
        model_kwargs = dict(config)

    llm = HuggingFaceHub(repo_id=model_repo, model_kwargs=model_kwargs)
    # The memory stores the running dialogue under the "chat_history" key.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    # search_kwargs={"k": 10} is forwarded to the retriever for each query.
    retriever = vector_stores.as_retriever(search_kwargs={"k": 10})

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
    )