Ashmi Banerjee
refactored the vectordb
89cd5d5
raw
history blame
2 kB
from typing import Optional, Callable
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(encoding='utf-8', level=logging.DEBUG)
from src.vectordb.helpers import read_docs, read_listings, preprocess_df
from src.vectordb.schema import WikivoyageDocuments, WikivoyageListings
from src.vectordb.helpers import set_uri
import lancedb
def _create_table_and_ingest_data(table_name: str, schema: object, data_fetcher: Callable,
preprocessor: Optional[Callable] = None):
"""
Generalized function to create a table and ingest data into the database.
Args:
- table_name: str, name of the table to create.
- schema: object, schema of the table.
- data_fetcher: Callable, function to fetch the data.
- preprocessor: Optional[Callable], function to preprocess the data (default is None).
"""
uri = set_uri()
db = lancedb.connect(uri)
logger.info(f"Connected to DB. Reading data for table {table_name} now...")
df = data_fetcher()
if preprocessor:
df = preprocessor(df)
logger.info(f"Finished reading data for {table_name}, attempting to create table and ingest the data...")
db.drop_table(table_name, ignore_missing=True)
table = db.create_table(table_name, schema=schema)
table.add(df.to_dict('records'))
logger.info(f"Completed ingestion for {table_name}.")
def create_wikivoyage_docs_db_and_add_data():
"""
Creates the Wikivoyage documents table and ingests data.
"""
_create_table_and_ingest_data(
table_name="wikivoyage_documents",
schema=WikivoyageDocuments,
data_fetcher=read_docs,
preprocessor=preprocess_df
)
def create_wikivoyage_listings_db_and_add_data():
"""
Creates the Wikivoyage listings table and ingests data.
"""
_create_table_and_ingest_data(
table_name="wikivoyage_listings",
schema=WikivoyageListings,
data_fetcher=read_listings
)