|
import pickle
from pathlib import Path

import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Build a FAISS similarity-search index over a PDF's text chunks and persist
# both the index and the chunk texts/metadata to disk.
# ---------------------------------------------------------------------------

# Use Path with forward-slash joining instead of a backslash-containing string
# literal: "data\Mental ..." only works by accident ("\M" is not an escape
# sequence), would silently corrupt on "\n"/"\t" etc., and is not portable.
pdf_path = Path("data") / "Mental Health Handbook English.pdf"

loader = PyPDFLoader(file_path=str(pdf_path))
documents = loader.load()

# Split into overlapping chunks so context spanning a chunk boundary is not
# lost (200-char overlap on 2000-char chunks).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
sections = text_splitter.split_documents(documents)

# Embed every chunk with a compact sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
section_texts = [section.page_content for section in sections]
embeddings = model.encode(section_texts)

print(embeddings.shape)

# FAISS requires float32 input.
embeddings_np = np.array(embeddings).astype('float32')

# Exact (brute-force) L2-distance index over the embedding dimension.
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Create the output directory first: faiss.write_index and open() both raise
# if "database/" does not exist (original script crashed on a fresh checkout).
output_dir = Path("database")
output_dir.mkdir(parents=True, exist_ok=True)

faiss.write_index(index, str(output_dir / "pdf_sections_index.faiss"))

# Persist chunk text + metadata alongside the index so search hits can be
# mapped back to their source content.
sections_data = [
    {
        'content': section.page_content,
        'metadata': section.metadata,
    }
    for section in sections
]

with open(output_dir / 'pdf_sections_data.pkl', 'wb') as f:
    pickle.dump(sections_data, f)

print("Embeddings stored in FAISS index and saved to file.")
|
|
|