Spaces:
Configuration error
Configuration error
from langchain.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
import numpy as np | |
import pickle | |
# Load the PDF | |
pdf_path = "data\Mental Health Handbook English.pdf" | |
loader = PyPDFLoader(file_path=pdf_path) | |
# Load the content | |
documents = loader.load() | |
# Split the document into sections | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200) | |
sections = text_splitter.split_documents(documents) | |
# Load the embedding model | |
model = SentenceTransformer('all-MiniLM-L6-v2') | |
# Generate embeddings for each section | |
section_texts = [section.page_content for section in sections] | |
embeddings = model.encode(section_texts) | |
print(embeddings.shape) | |
embeddings_np = np.array(embeddings).astype('float32') | |
# Create a FAISS index | |
dimension = embeddings_np.shape[1] | |
index = faiss.IndexFlatL2(dimension) | |
# Add vectors to the index | |
index.add(embeddings_np) | |
# Save the index to a file | |
faiss.write_index(index, "database/pdf_sections_index.faiss") | |
# When creating the index: | |
sections_data = [ | |
{ | |
'content': section.page_content, | |
'metadata': section.metadata | |
} | |
for section in sections | |
] | |
# Save sections data | |
with open('database/pdf_sections_data.pkl', 'wb') as f: | |
pickle.dump(sections_data, f) | |
print("Embeddings stored in FAISS index and saved to file.") | |