Spaces:
Sleeping
Sleeping
File size: 2,108 Bytes
cb0d8e2 5ab154b 4294bc8 5ab154b 1d73ddf 4294bc8 a0b6dfc d45c8e8 5ab154b 4b41cfa 5ab154b 4b41cfa 5ab154b 4b41cfa 5ab154b 4b41cfa 5ab154b 4b41cfa 4294bc8 73bf17a 4294bc8 f18883e 4294bc8 d45c8e8 5ab154b 4294bc8 6a1b5d7 f18883e 6a1b5d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import os
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
# Hugging Face identifier of the sentence-embedding model used by this module.
embedding_modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Shared embedding object, built once at import time and reused by Chunks()
# and PDF_4_QA(). It is configured to run on the CPU and to leave the
# produced vectors un-normalized.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)
def replace_t_with_space(list_of_documents):
    """
    Swap every tab character for a single space in each document's text.

    The documents are modified in place; the same list object that was
    passed in is handed back so the call can be chained.

    Args:
        list_of_documents: A list of document objects, each exposing a
            string 'page_content' attribute.

    Returns:
        The same list, whose documents now contain spaces instead of tabs.
    """
    # One-character translation table: '\t' -> ' '.
    tab_to_space = str.maketrans('\t', ' ')
    for document in list_of_documents:
        document.page_content = document.page_content.translate(tab_to_space)
    return list_of_documents
def read_pdf_text(pdf_path, chunk_size=10000, chunk_overlap=1000):
    """Extract the text of a PDF and split it into overlapping chunks.

    The text of every page is concatenated (in page order, with no
    separator) and then split with ``RecursiveCharacterTextSplitter``.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 10000, the previously hard-coded value.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 1000, the previously hard-coded
            value.

    Returns:
        A list of ``Document`` objects, one per text chunk, each carrying the
        chunk in ``page_content``.
    """
    pdf_reader = PdfReader(pdf_path)
    # Join once instead of repeated ``+=``. The ``or ""`` is defensive: a page
    # for which extraction yields a falsy result (e.g. None) contributes
    # nothing instead of raising TypeError during concatenation.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return [
        Document(page_content=chunk)
        for chunk in text_splitter.split_text(text)
    ]
def read_pdf(pdf_path):
    """Load a PDF through ``PyPDFLoader`` and return the loaded documents.

    As a side effect, the number of loaded documents is printed to stdout.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` produces.
    """
    loaded_documents = PyPDFLoader(pdf_path).load()
    print("Total Documents :", len(loaded_documents))
    return loaded_documents
def Chunks(docs):
    """Split documents with ``SemanticChunker`` and replace tabs with spaces.

    The splitter is built from the module-level ``embeddings`` object and
    uses the 'interquartile' breakpoint threshold type. The resulting chunks
    are then passed through ``replace_t_with_space``.

    Args:
        docs: Documents to split, handed to ``split_documents``.

    Returns:
        The list of chunked documents after tab replacement.
    """
    semantic_splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type='interquartile',
    )
    return replace_t_with_space(semantic_splitter.split_documents(docs))
def PDF_4_QA(file_path, persist_directory="Chroma/docs"):
    """Build a Chroma vector store from the text chunks of a PDF.

    The PDF is read and chunked with ``read_pdf_text``; the chunks are then
    embedded with the module-level ``embeddings`` object and stored via
    ``Chroma.from_documents``.

    Args:
        file_path: Location of the PDF, passed to ``read_pdf_text``.
        persist_directory: Forwarded to ``Chroma.from_documents`` as
            ``persist_directory``. Defaults to "Chroma/docs", the previously
            hard-coded value, so existing callers are unaffected.

    Returns:
        A ``(vectordb, cleaned_docs)`` tuple: the Chroma store and the list
        of chunk documents that were indexed.
    """
    # NOTE: an alternative pipeline exists in this module --
    # Chunks(read_pdf(file_path)) -- but the character-based splitter below
    # is the one in use.
    cleaned_docs = read_pdf_text(file_path)
    vectordb = Chroma.from_documents(
        cleaned_docs,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    return vectordb, cleaned_docs