Spaces:

anthony-chen
/

Chem-210-Autograder

Sleeping

File size: 1,778 Bytes

a1551fc

from pymilvus import Collection, DataType, FieldSchema, CollectionSchema, connections
from sentence_transformers import SentenceTransformer
import configparser


def retrieve_molecule_index(molecule):
    model = SentenceTransformer(model_name_or_path="bert-base-uncased")
    search_vector = model.encode(molecule).reshape(1,-1)
    cfp = configparser.RawConfigParser()
    cfp.read('config.ini')
    milvus_uri = cfp.get('example', 'uri')
    token = cfp.get('example', 'token')
    connections.connect("default",
                        uri=milvus_uri,
                        token=token)
    print(f"Connecting to DB: {milvus_uri}")
    collection_name = "molecule_embeddings"
    dim = 768  # Adjust based on the dimensionality of your embeddings
    
    # Define collection schema
    molecule_cid = FieldSchema(name="molecule_cid", dtype=DataType.INT64, description="cid", is_primary = True)
    molecule_name = FieldSchema(name="molecule_name", dtype=DataType.VARCHAR, max_length=256, description="name")
    molecule_embeddings = FieldSchema(name="molecule_embedding", dtype=DataType.FLOAT_VECTOR, dim=dim)
    schema = CollectionSchema(fields=[molecule_cid, molecule_name, molecule_embeddings],
                          auto_id=False,
                          description="my first collection!")
    
    print(f"Creating example collection: {collection_name}")
    collection = Collection(name=collection_name, schema=schema)
    
    search_params = {"metric_type": "IP"}
    topk = 1
    results = collection.search(search_vector, anns_field='molecule_embedding', param=search_params, limit=topk)
    print(results)
    # Disconnect from Milvus server
    connections.disconnect("default")
    print("Disconnected from Milvus server.")
    return results[0].ids