Spaces:

Samarth991
/

RAG-PDF_With_LLAMA-3B

Sleeping

File size: 2,025 Bytes

cb0d8e2
5ab154b
4294bc8
5ab154b
 
1d73ddf
4294bc8
d45c8e8
5ab154b
 
4b41cfa
5ab154b
 
 
4b41cfa
5ab154b
 
4b41cfa
5ab154b
 
 
4b41cfa
5ab154b
 
 
4b41cfa
4294bc8
 
 
 
 
 
 
 
 
 
d45c8e8
5ab154b
 
 
 
 
 
 
 
 
 
 
4294bc8
 
 
 
5ab154b
 
dc76509
 
5ab154b
48962a0

import os 
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from PyPDF2 import PdfReader

# Hugging Face model identifier of the sentence-embedding model.
embedding_modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Module-wide embedding client shared by Chunks() and PDF_4_QA().
# It is configured for the CPU device and leaves embeddings un-normalised.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)

def replace_t_with_space(list_of_documents):
    """
    Swap every tab character for a single space in each document's text.

    The documents are modified in place; the same list object is handed back
    so the call can be chained.

    Args:
        list_of_documents: Iterable of objects exposing a string
            'page_content' attribute.

    Returns:
        The input list, whose documents no longer contain tab characters.
    """
    tab, space = '\t', ' '
    for document in list_of_documents:
        original_text = document.page_content
        document.page_content = original_text.replace(tab, space)
    return list_of_documents

def read_pdf_text(pdf_path, chunk_size=10000, chunk_overlap=1000):
    """
    Extract the text of a PDF and split it into overlapping chunks.

    Args:
        pdf_path: PDF source passed straight to PyPDF2's PdfReader.
        chunk_size: Chunk size forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between consecutive chunks, forwarded to
            RecursiveCharacterTextSplitter.

    Returns:
        A list of text chunks (strings) produced by the splitter's
        split_text(); the concatenated text of all pages is split as one
        string, so chunks may span page boundaries.
    """
    pdf_reader = PdfReader(pdf_path)
    # Guard against a page yielding no text (e.g. None/empty for image-only
    # pages) so that concatenation cannot fail with a TypeError.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

def read_pdf(pdf_path):
    """
    Load a PDF through PyPDFLoader and report how many documents it produced.

    Args:
        pdf_path: Location of the PDF, handed to PyPDFLoader.

    Returns:
        The list of documents returned by the loader's load() call.
    """
    loaded_pages = PyPDFLoader(pdf_path).load()
    page_count = len(loaded_pages)
    print("Total Documents :", page_count)
    return loaded_pages

def Chunks(docs):
    """
    Split documents into semantic chunks and replace tabs with spaces.

    Args:
        docs: Documents to split; passed to SemanticChunker.split_documents.

    Returns:
        The chunked documents after replace_t_with_space has been applied.
    """
    semantic_splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type='interquartile',
    )
    chunked = semantic_splitter.split_documents(docs)
    return replace_t_with_space(chunked)
    
def PDF_4_QA(file_path):
    """
    Build a persisted Chroma vector store from the text of a PDF.

    The PDF is read and chunked with read_pdf_text(), and the resulting text
    chunks are embedded with the module-level `embeddings` client and stored
    under the "Chroma/docs" directory. The alternative loader/semantic
    chunking route (read_pdf() + Chunks()) remains available in this module
    but is not used here.

    Args:
        file_path: PDF source, forwarded to read_pdf_text().

    Returns:
        A (vectordb, docs) tuple: the Chroma store and the list of text
        chunks (strings) that were indexed.

    Raises:
        ValueError: If no text could be extracted from the PDF, since there
            would be nothing to index.
    """
    docs = read_pdf_text(file_path)
    if not docs:
        raise ValueError(f"No extractable text found in {file_path!r}")

    # read_pdf_text() yields plain strings rather than Document objects, so
    # the store must be built with from_texts instead of from_documents.
    vectordb = Chroma.from_texts(
        texts=docs,
        embedding=embeddings,
        persist_directory="Chroma/docs",
    )
    return vectordb, docs