Spaces:
Running
Running
"""Index documents.""" | |
from pathlib import Path | |
import numpy as np | |
from sqlalchemy.engine import make_url | |
from sqlmodel import Session, select | |
from tqdm.auto import tqdm | |
from raglite._config import RAGLiteConfig | |
from raglite._database import Chunk, ChunkEmbedding, Document, IndexMetadata, create_database_engine | |
from raglite._embed import embed_sentences, sentence_embedding_type | |
from raglite._markdown import document_to_markdown | |
from raglite._split_chunks import split_chunks | |
from raglite._split_sentences import split_sentences | |
from raglite._typing import FloatMatrix | |
def _create_chunk_records(
    document_id: str,
    chunks: list[str],
    chunk_embeddings: list[FloatMatrix],
    config: RAGLiteConfig,
) -> tuple[list[Chunk], list[list[ChunkEmbedding]]]:
    """Process chunks into chunk and chunk embedding records.

    Parameters
    ----------
    document_id
        Identifier of the document that the chunks belong to.
    chunks
        The chunk bodies, in document order.
    chunk_embeddings
        One entry per chunk; iterating an entry yields one embedding per sentence of that chunk.
    config
        Configuration that determines the sentence embedding type and is passed to the embedder.

    Returns
    -------
    tuple[list[Chunk], list[list[ChunkEmbedding]]]
        The chunk records and, aligned with them, one list of chunk embedding records per chunk.

    Raises
    ------
    ValueError
        If the chunks and their embeddings do not have the same length (strict ``zip``).
    """
    # Create the chunk records. Every chunk is created with the Markdown headings extracted from
    # the preceding chunk record; the first chunk starts without headings.
    chunk_records: list[Chunk] = []
    headings = ""
    for i, chunk in enumerate(chunks):
        record = Chunk.from_body(document_id=document_id, index=i, body=chunk, headings=headings)
        chunk_records.append(record)
        # Update the Markdown headings with those of this chunk.
        headings = record.extract_headings()
    # Create the chunk embedding records.
    if sentence_embedding_type(config=config) == "late_chunking":
        # Every chunk record is associated with a list of chunk embedding records, one for each of
        # the sentences in the chunk.
        return chunk_records, [
            [
                ChunkEmbedding(chunk_id=record.id, embedding=sentence_embedding)
                for sentence_embedding in chunk_embedding
            ]
            for record, chunk_embedding in zip(chunk_records, chunk_embeddings, strict=True)
        ]
    # Embed the full chunks, including the current Markdown headings. The headings only live on the
    # chunk records, so the records are rendered here rather than the raw chunk bodies (calling
    # `str()` on the raw `str` bodies would silently leave the headings out).
    # NOTE(review): relies on `Chunk.__str__` rendering the headings together with the body —
    # confirm against the `Chunk` model.
    full_chunk_embeddings = embed_sentences(
        [str(record) for record in chunk_records], config=config
    )
    # Every chunk record is associated with a list of chunk embedding records. The chunk embedding
    # records each correspond to a linear combination of a sentence embedding and an embedding of
    # the full chunk with Markdown headings.
    alpha = 0.382  # Weight of the sentence embedding: 1 - 1/φ, with φ the golden ratio.
    return chunk_records, [
        [
            ChunkEmbedding(
                chunk_id=record.id,
                embedding=alpha * sentence_embedding + (1 - alpha) * full_chunk_embedding,
            )
            for sentence_embedding in chunk_embedding
        ]
        for record, chunk_embedding, full_chunk_embedding in zip(
            chunk_records, chunk_embeddings, full_chunk_embeddings, strict=True
        )
    ]
def _store_chunk_records(
    session: Session,
    doc_path: Path,
    chunks: list[str],
    chunk_embeddings: list[FloatMatrix],
    config: RAGLiteConfig,
) -> None:
    """Store the document record and its chunk and chunk embedding records, skipping existing ones."""
    # Add the document to the document table, unless a record with the same id already exists.
    document_record = Document.from_path(doc_path)
    if session.get(Document, document_record.id) is None:
        session.add(document_record)
        session.commit()
    # Create the chunk records to insert into the chunk table.
    chunk_records, chunk_embedding_records = _create_chunk_records(
        document_record.id, chunks, chunk_embeddings, config
    )
    # Store the chunk and chunk embedding records.
    for chunk_record, chunk_embedding_record_list in tqdm(
        zip(chunk_records, chunk_embedding_records, strict=True),
        desc="Inserting chunks",
        total=len(chunk_records),
        unit="chunk",
        dynamic_ncols=True,
    ):
        # Skip chunks whose id is already present in the chunk table.
        if session.get(Chunk, chunk_record.id) is not None:
            continue
        session.add(chunk_record)
        session.add_all(chunk_embedding_record_list)
        # Commit after every chunk.
        session.commit()


def _update_sqlite_vector_index(session: Session, config: RAGLiteConfig) -> None:
    """Add the not yet indexed chunks to the vector search chunk index stored in the database."""
    # Imported lazily so that the dependency is only loaded when the index is actually updated.
    from pynndescent import NNDescent

    # Get the vector search chunk index from the database, or create a new one.
    index_metadata = session.get(IndexMetadata, "default") or IndexMetadata(id="default")
    chunk_ids = index_metadata.metadata_.get("chunk_ids", [])
    chunk_sizes = index_metadata.metadata_.get("chunk_sizes", [])
    # Get the unindexed chunks by skipping as many chunks as have been indexed already.
    # NOTE(review): this query has no ORDER BY, so it presumably relies on the database returning
    # chunks in insertion order — confirm.
    unindexed_chunks = list(session.exec(select(Chunk).offset(len(chunk_ids))).all())
    if not unindexed_chunks:
        return
    # Assemble the unindexed chunk embeddings into a NumPy array.
    unindexed_chunk_embeddings = [chunk.embedding_matrix for chunk in unindexed_chunks]
    X = np.vstack(unindexed_chunk_embeddings)  # noqa: N806
    # Index the unindexed chunks.
    with tqdm(
        total=len(unindexed_chunks),
        desc="Indexing chunks",
        unit="chunk",
        dynamic_ncols=True,
    ) as pbar:
        # Fit a new ANN index if nothing has been indexed yet, else update the stored one.
        if not chunk_ids:
            nndescent = NNDescent(X, metric=config.vector_search_index_metric)
        else:
            nndescent = index_metadata.metadata_["index"]
            nndescent.update(X)
        # Prepare the ANN index so it can handle query vectors not in the training set.
        nndescent.prepare()
        # Update the index metadata and mark it as dirty by recreating the dictionary.
        index_metadata.metadata_ = {
            **index_metadata.metadata_,
            "index": nndescent,
            "chunk_ids": chunk_ids + [c.id for c in unindexed_chunks],
            "chunk_sizes": chunk_sizes + [len(em) for em in unindexed_chunk_embeddings],
        }
        # Store the updated vector search chunk index.
        session.add(index_metadata)
        session.commit()
        pbar.update(len(unindexed_chunks))


def insert_document(doc_path: Path, *, config: RAGLiteConfig | None = None) -> None:
    """Insert a document into the database and update the index.

    The document is converted to Markdown, split into sentences, embedded, and grouped into
    chunks. The document, chunk, and chunk embedding records are then stored, skipping records
    whose id is already in the database. For a SQLite database, the vector search chunk index is
    updated with the chunks that it does not cover yet.

    Parameters
    ----------
    doc_path
        Path of the document to insert.
    config
        Configuration to use; a default ``RAGLiteConfig`` is created when omitted.
    """
    # Use the default config if not provided.
    config = config or RAGLiteConfig()
    db_backend = make_url(config.db_url).get_backend_name()
    # Preprocess the document into chunks and chunk embeddings.
    with tqdm(total=5, unit="step", dynamic_ncols=True) as pbar:
        pbar.set_description("Initializing database")
        engine = create_database_engine(config)
        pbar.update(1)
        pbar.set_description("Converting to Markdown")
        doc = document_to_markdown(doc_path)
        pbar.update(1)
        pbar.set_description("Splitting sentences")
        sentences = split_sentences(doc, max_len=config.chunk_max_size)
        pbar.update(1)
        pbar.set_description("Embedding sentences")
        sentence_embeddings = embed_sentences(sentences, config=config)
        pbar.update(1)
        pbar.set_description("Splitting chunks")
        chunks, chunk_embeddings = split_chunks(
            sentences=sentences,
            sentence_embeddings=sentence_embeddings,
            sentence_window_size=config.embedder_sentence_window_size,
            max_size=config.chunk_max_size,
        )
        pbar.update(1)
    # Create and store the chunk records.
    with Session(engine) as session:
        _store_chunk_records(session, doc_path, chunks, chunk_embeddings, config)
    # Manually update the vector search chunk index for SQLite.
    if db_backend == "sqlite":
        with Session(engine) as session:
            _update_sqlite_vector_index(session, config)