Spaces:
Running
Running
File size: 7,538 Bytes
54f5afe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
"""Index documents."""
from pathlib import Path
import numpy as np
from sqlalchemy.engine import make_url
from sqlmodel import Session, select
from tqdm.auto import tqdm
from raglite._config import RAGLiteConfig
from raglite._database import Chunk, ChunkEmbedding, Document, IndexMetadata, create_database_engine
from raglite._embed import embed_sentences, sentence_embedding_type
from raglite._markdown import document_to_markdown
from raglite._split_chunks import split_chunks
from raglite._split_sentences import split_sentences
from raglite._typing import FloatMatrix
def _create_chunk_records(
document_id: str,
chunks: list[str],
chunk_embeddings: list[FloatMatrix],
config: RAGLiteConfig,
) -> tuple[list[Chunk], list[list[ChunkEmbedding]]]:
"""Process chunks into chunk and chunk embedding records."""
# Create the chunk records.
chunk_records, headings = [], ""
for i, chunk in enumerate(chunks):
# Create and append the chunk record.
record = Chunk.from_body(document_id=document_id, index=i, body=chunk, headings=headings)
chunk_records.append(record)
# Update the Markdown headings with those of this chunk.
headings = record.extract_headings()
# Create the chunk embedding records.
chunk_embedding_records = []
if sentence_embedding_type(config=config) == "late_chunking":
# Every chunk record is associated with a list of chunk embedding records, one for each of
# the sentences in the chunk.
for chunk_record, chunk_embedding in zip(chunk_records, chunk_embeddings, strict=True):
chunk_embedding_records.append(
[
ChunkEmbedding(chunk_id=chunk_record.id, embedding=sentence_embedding)
for sentence_embedding in chunk_embedding
]
)
else:
# Embed the full chunks, including the current Markdown headings.
full_chunk_embeddings = embed_sentences([str(chunk) for chunk in chunks], config=config)
# Every chunk record is associated with a list of chunk embedding records. The chunk
# embedding records each correspond to a linear combination of a sentence embedding and an
# embedding of the full chunk with Markdown headings.
α = 0.382 # Golden ratio. # noqa: PLC2401
for chunk_record, chunk_embedding, full_chunk_embedding in zip(
chunk_records, chunk_embeddings, full_chunk_embeddings, strict=True
):
chunk_embedding_records.append(
[
ChunkEmbedding(
chunk_id=chunk_record.id,
embedding=α * sentence_embedding + (1 - α) * full_chunk_embedding,
)
for sentence_embedding in chunk_embedding
]
)
return chunk_records, chunk_embedding_records
def insert_document(doc_path: Path, *, config: RAGLiteConfig | None = None) -> None: # noqa: PLR0915
"""Insert a document into the database and update the index."""
# Use the default config if not provided.
config = config or RAGLiteConfig()
db_backend = make_url(config.db_url).get_backend_name()
# Preprocess the document into chunks and chunk embeddings.
with tqdm(total=5, unit="step", dynamic_ncols=True) as pbar:
pbar.set_description("Initializing database")
engine = create_database_engine(config)
pbar.update(1)
pbar.set_description("Converting to Markdown")
doc = document_to_markdown(doc_path)
pbar.update(1)
pbar.set_description("Splitting sentences")
sentences = split_sentences(doc, max_len=config.chunk_max_size)
pbar.update(1)
pbar.set_description("Embedding sentences")
sentence_embeddings = embed_sentences(sentences, config=config)
pbar.update(1)
pbar.set_description("Splitting chunks")
chunks, chunk_embeddings = split_chunks(
sentences=sentences,
sentence_embeddings=sentence_embeddings,
sentence_window_size=config.embedder_sentence_window_size,
max_size=config.chunk_max_size,
)
pbar.update(1)
# Create and store the chunk records.
with Session(engine) as session:
# Add the document to the document table.
document_record = Document.from_path(doc_path)
if session.get(Document, document_record.id) is None:
session.add(document_record)
session.commit()
# Create the chunk records to insert into the chunk table.
chunk_records, chunk_embedding_records = _create_chunk_records(
document_record.id, chunks, chunk_embeddings, config
)
# Store the chunk and chunk embedding records.
for chunk_record, chunk_embedding_record_list in tqdm(
zip(chunk_records, chunk_embedding_records, strict=True),
desc="Inserting chunks",
total=len(chunk_records),
unit="chunk",
dynamic_ncols=True,
):
if session.get(Chunk, chunk_record.id) is not None:
continue
session.add(chunk_record)
session.add_all(chunk_embedding_record_list)
session.commit()
# Manually update the vector search chunk index for SQLite.
if db_backend == "sqlite":
from pynndescent import NNDescent
with Session(engine) as session:
# Get the vector search chunk index from the database, or create a new one.
index_metadata = session.get(IndexMetadata, "default") or IndexMetadata(id="default")
chunk_ids = index_metadata.metadata_.get("chunk_ids", [])
chunk_sizes = index_metadata.metadata_.get("chunk_sizes", [])
# Get the unindexed chunks.
unindexed_chunks = list(session.exec(select(Chunk).offset(len(chunk_ids))).all())
if not unindexed_chunks:
return
# Assemble the unindexed chunk embeddings into a NumPy array.
unindexed_chunk_embeddings = [chunk.embedding_matrix for chunk in unindexed_chunks]
X = np.vstack(unindexed_chunk_embeddings) # noqa: N806
# Index the unindexed chunks.
with tqdm(
total=len(unindexed_chunks),
desc="Indexing chunks",
unit="chunk",
dynamic_ncols=True,
) as pbar:
# Fit or update the ANN index.
if len(chunk_ids) == 0:
nndescent = NNDescent(X, metric=config.vector_search_index_metric)
else:
nndescent = index_metadata.metadata_["index"]
nndescent.update(X)
# Prepare the ANN index so it can to handle query vectors not in the training set.
nndescent.prepare()
# Update the index metadata and mark it as dirty by recreating the dictionary.
index_metadata.metadata_ = {
**index_metadata.metadata_,
"index": nndescent,
"chunk_ids": chunk_ids + [c.id for c in unindexed_chunks],
"chunk_sizes": chunk_sizes + [len(em) for em in unindexed_chunk_embeddings],
}
# Store the updated vector search chunk index.
session.add(index_metadata)
session.commit()
pbar.update(len(unindexed_chunks))
|