|
import os
import pickle

import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
|
|
|
|
|
# Path to the source PDF. Forward slashes are used deliberately: the original
# literal contained a bare backslash ("data\M..."), which is an invalid escape
# sequence (SyntaxWarning on modern CPython) and only resolves on Windows.
# Forward slashes work on every platform, including Windows.
pdf_path = "data/Mental Health Handbook English.pdf"

loader = PyPDFLoader(file_path=pdf_path)

# Load the PDF; PyPDFLoader yields one Document per page.
documents = loader.load()

# Split the pages into overlapping ~2000-character chunks. The 200-character
# overlap keeps context that straddles a chunk boundary retrievable.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
sections = text_splitter.split_documents(documents)
|
|
|
|
|
# Sentence-embedding model used to vectorize each chunk.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pull the raw text out of every chunk and embed the whole batch at once.
section_texts = [chunk.page_content for chunk in sections]
embeddings = model.encode(section_texts)

# Sanity check: (num_chunks, embedding_dimension).
print(embeddings.shape)
|
|
|
# FAISS requires float32 input; np.asarray with an explicit dtype converts
# (copying only when needed) instead of the original unconditional
# np.array(...).astype(...) double copy.
embeddings_np = np.asarray(embeddings, dtype='float32')

# Exact (brute-force) L2-distance index sized to the embedding dimension.
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add all chunk vectors; their insertion order is the id FAISS returns at
# query time, matching the order of the pickled sections data.
index.add(embeddings_np)

# Make sure the output directory exists before writing — faiss.write_index
# does not create it and would fail on a fresh checkout.
os.makedirs("database", exist_ok=True)
faiss.write_index(index, "database/pdf_sections_index.faiss")
|
|
|
|
|
# Persist each chunk's text and metadata alongside the FAISS index, so a
# vector id returned by a search can be mapped back to readable content.
sections_data = []
for section in sections:
    sections_data.append({
        'content': section.page_content,
        'metadata': section.metadata,
    })

with open('database/pdf_sections_data.pkl', 'wb') as f:
    pickle.dump(sections_data, f)

print("Embeddings stored in FAISS index and saved to file.")
|
|