Spaces:

LuckRafly
/

ChatBot-PDF

Sleeping

File size: 2,407 Bytes

c74cdc6

from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Load variables from a local .env file into the process environment at import time.
# NOTE(review): presumably this supplies the Hugging Face Hub API token that
# HuggingFaceHub needs below — confirm against the deployment configuration.
load_dotenv()

## Read Multiple PDF files
def read_multiple_pdf(files):
    """
    Extract and concatenate the text of one or more PDF files.

    Args:
        files: Either a single PDF path (``str`` or ``os.PathLike``) or an
            iterable of items that ``PdfReader`` can open.

    Returns:
        str: The extracted text of every page of every file, concatenated
            in input order. An empty iterable yields an empty string.
    """
    from os import PathLike

    # A lone path is wrapped in a list so the loop below handles both call
    # styles; the caller's path is honoured rather than a hard-coded sample.
    if isinstance(files, (str, PathLike)):
        files = [files]

    page_texts = []
    for file in files:
        reader = PdfReader(file)
        for page in reader.pages:
            # Defensive: treat a page with no extractable text as empty.
            page_texts.append(page.extract_text() or "")

    # Return only after *all* files were processed, not after the first one.
    return "".join(page_texts)
    

## Split PDF into chunks
def chunk_docs(document, chunk_size = 500, chunk_overlap = 50, separators="\n"):
    """
    Split a document into smaller chunks of text.

    Args:
        document (str): The document to be chunked.
        chunk_size (int, optional): The size of each chunk in characters. Defaults to 500.
        chunk_overlap (int, optional): The overlap between adjacent chunks in characters. Defaults to 50.
        separators (str | list[str], optional): Separator or list of separators
            tried when splitting the document. A single string is treated as
            one separator. Defaults to "\\n".

    Returns:
        list[str]: The chunks in document order. Empty when ``document`` is
            empty or None.
    """
    # Nothing to split; also avoids handing None to the splitter.
    if not document:
        return []

    # The splitter expects a list of separators. Passing a bare string would
    # make it iterate character by character, which breaks multi-character
    # separators such as "\n\n".
    if isinstance(separators, str):
        separators = [separators]

    text_splitter = RecursiveCharacterTextSplitter(
        separators=separators,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(document)

## Embeds the Data
def embedding_chunks(chunk, model_name = "sentence-transformers/all-MiniLM-L12-v2"):
    """
    Embed text chunks and index them in a FAISS vector store.

    Args:
        chunk: The texts to embed, passed straight to ``FAISS.from_texts``.
        model_name (str, optional): Name of the Hugging Face embedding model
            handed to ``HuggingFaceEmbeddings``.

    Returns:
        The vector store produced by ``FAISS.from_texts``.
    """
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(chunk, embedder)


## setup conversational chain
def chain_conversation(vector_stores, config=None, model_repo="mistralai/Mixtral-8x7B-Instruct-v0.1"):
    """
    Build a conversational retrieval chain over a vector store.

    Args:
        vector_stores: Vector store exposing ``as_retriever``; used to fetch
            context for each question.
        config (dict, optional): Keyword arguments forwarded to the model as
            ``model_kwargs``. Defaults to
            ``{'max_new_tokens': 256, 'temperature': 0.1}`` when None.
        model_repo (str, optional): Hugging Face Hub repository id of the LLM.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``, wired
        to a fresh ``ConversationBufferMemory``.
    """
    # Build a fresh dict per call: a dict literal as a default argument would
    # be shared between calls, and copying keeps the caller's dict untouched.
    if config is None:
        model_kwargs = {'max_new_tokens': 256, 'temperature': 0.1}
    else:
        model_kwargs = dict(config)

    llm = HuggingFaceHub(repo_id=model_repo, model_kwargs=model_kwargs)

    # History is kept under "chat_history" and returned as message objects.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # Retrieve the 10 nearest chunks for every query.
    retriever = vector_stores.as_retriever(search_kwargs={"k": 10})

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
    )