# Spaces: Runtime error  (status banner captured with the page, not part of the program)
import glob
from typing import Iterable, List, Optional, Union

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_core.documents.base import Document
class DocReader:
    """Load a folder of PDFs, chunk their text, and index the chunks in Qdrant.

    Intended call order: ``load_pdfs`` -> ``split_text`` ->
    ``generate_embeddings`` -> ``search_similar``.
    """

    def __init__(
        self,
        pdf_path: str,
        model_path: str = "sentence-transformers/all-mpnet-base-v2",
        persist_directory: str = "db",
        device: Optional[str] = None,
    ) -> None:
        """Collect the PDF files found directly inside *pdf_path*.

        Args:
            pdf_path: Folder that is globbed (non-recursively) for ``*.pdf``.
            model_path: Embedding model name handed to ``HuggingFaceEmbeddings``.
            persist_directory: Stored on the instance only; nothing in this
                class reads it, because the index is built in memory.
            device: Device string for the embedding model. ``None`` means
                "use ``cuda:0`` when CUDA is usable, otherwise ``cpu``".
        """
        # glob returns files in arbitrary order; sort for reproducible output.
        self.pdfs = sorted(glob.glob(f"{pdf_path}/*.pdf"))
        self.model_path = model_path
        self.persist_directory = persist_directory
        self.device = device
        # Populated by generate_embeddings(); read by search_similar().
        self.db = None

    def _resolve_device(self) -> str:
        """Return the configured device, or pick CUDA if usable, else CPU."""
        configured = getattr(self, "device", None)
        if configured is not None:
            return configured
        try:
            import torch  # local import: only needed when auto-detecting
        except ImportError:
            return "cpu"
        return "cuda:0" if torch.cuda.is_available() else "cpu"

    def load_pdfs(self) -> "List[Document]":
        """Load every collected PDF and return all pages as one flat list."""
        all_pages = []
        for pdf_file in self.pdfs:
            all_pages.extend(PyPDFLoader(pdf_file).load())
        return all_pages

    def convert_to_markdown(self, documents: "Iterable[Document]") -> str:
        """Join page texts into one Markdown string.

        Every newline is doubled (Markdown paragraph break) and each page is
        followed by a ``---`` horizontal rule. Returns ``""`` for no pages.
        """
        return "".join(
            doc.page_content.replace("\n", "\n\n") + "\n\n---\n\n"
            for doc in documents
        )

    def split_text(
        self,
        pages: "Union[str, Iterable[Union[str, Document]]]",
        chunk_size: int = 128,
        chunk_overlap: int = 24,
    ) -> List[str]:
        """Split *pages* into overlapping text chunks.

        Args:
            pages: A single string (e.g. the output of
                ``convert_to_markdown``), or an iterable of strings and/or
                ``Document`` objects (e.g. the output of ``load_pdfs``).
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters shared between neighbouring chunks.

        Returns:
            The chunk texts, in order.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(pages, str):
            pages = [pages]
        # Documents pass through untouched; re-wrapping one in Document()
        # fails because page_content must be a string.
        documents = [
            page if isinstance(page, Document) else Document(page_content=page)
            for page in pages
        ]
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return [
            chunk.page_content
            for chunk in text_splitter.split_documents(documents)
        ]

    def generate_embeddings(self, texts: Iterable[str]) -> "Qdrant":
        """Embed *texts* and build an in-memory Qdrant collection.

        The store is kept on ``self.db`` so ``search_similar`` can use it,
        and is also returned for callers that manage it themselves.
        """
        embeddings = HuggingFaceEmbeddings(
            model_name=self.model_path,
            model_kwargs={"device": self._resolve_device()},
            encode_kwargs={"normalize_embeddings": True},
        )
        documents = [Document(page_content=text) for text in texts]
        self.db = Qdrant.from_documents(
            documents,
            embeddings,
            location=":memory:",
            collection_name="pdf_collection",
        )
        return self.db

    def search_similar(self, input_text: str, k: int = 3) -> "List[Document]":
        """Return the *k* indexed chunks most similar to *input_text*.

        Raises:
            RuntimeError: If no index has been built yet.
        """
        index = getattr(self, "db", None)
        if index is None:
            raise RuntimeError(
                "No vector index available; call generate_embeddings() first."
            )
        return index.similarity_search(input_text, k=k)