Spaces:
Running
Running
# Reindex script
# File: scripts/reindex.py
import yaml | |
from orchestrator.client import MCPClient | |
from orchestrator.provenance import init_db, Paper | |
def main():
    """Reindex all existing papers from the provenance DB into Chroma via MCP.

    Reads ``config.yaml`` from the current working directory, builds an
    ``MCPClient`` from ``cfg['mcp_servers']['chroma']``, opens a DB session
    (``db_url`` falls back to ``sqlite:///embeddings.db``) and issues one
    ``chroma.insert`` call per paper. The indexed text is the paper's title
    and abstract joined by a single space; a missing title or abstract is
    treated as an empty string.

    Raises:
        FileNotFoundError: if ``config.yaml`` does not exist.
        KeyError: if the config has no ``mcp_servers`` / ``chroma`` entry.
    """
    # Context manager so the config file handle is closed deterministically.
    with open("config.yaml", encoding="utf-8") as cfg_file:
        cfg = yaml.safe_load(cfg_file)

    chroma = MCPClient(cfg['mcp_servers']['chroma'])
    Session = init_db(cfg.get('db_url', 'sqlite:///embeddings.db'))
    session = Session()
    try:
        print("Fetching all paper IDs from DB...")
        # Load the rows once and reuse them instead of re-querying each
        # paper by id (avoids N+1 queries and a possible ``None`` result).
        papers = session.query(Paper).all()
        print(f"Found {len(papers)} papers. Reindexing...")
        for paper in papers:
            text = (paper.title or '') + ' ' + (paper.abstract or '')
            chroma.call(
                "chroma.insert",
                {"id": paper.id, "text": text, "metadata": {}},
            )
    finally:
        # NOTE(review): assumes init_db returns a SQLAlchemy-style session
        # factory whose sessions expose close() — confirm against
        # orchestrator.provenance.
        session.close()
    print("Reindexing finished.")
# Run the reindex only when executed as a script, not when imported.
if __name__ == "__main__":
    main()