Spaces:
Sleeping
Sleeping
File size: 4,567 Bytes
306849a 409f88f 306849a 409f88f 306849a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
# milvus.py
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import pandas as pd
import os
import sys
from sentence_transformers import SentenceTransformer
import time
# Default Milvus connection details; the __main__ block falls back to these
# when the matching environment variable is not set.
DEFAULT_MILVUS_HOST = 'localhost'
DEFAULT_MILVUS_PORT = '19530'
DEFAULT_COLLECTION_NAME = 'document_collection'
# Size of the vector field; expected to equal the output size of the embedding
# model loaded below (NOTE(review): confirm whenever the model is changed).
DEFAULT_DIMENSION = 384 # Adjust based on your embedding model
DEFAULT_MAX_RETRIES = 3 # Maximum number of connection attempts
DEFAULT_RETRY_DELAY = 5 # seconds to wait between connection attempts
# Embedding model, instantiated once at import time and used by
# load_data_to_milvus to encode document content.
model = SentenceTransformer('all-MiniLM-L6-v2')
def create_milvus_collection(host, port, collection_name, dimension):
    """
    Make sure the named Milvus collection exists, creating it when absent.

    A newly created collection has an auto-generated INT64 primary key ``id``,
    a VARCHAR ``path`` (max length 500) and a FLOAT_VECTOR ``content_vector``
    of size ``dimension``; an IVF_FLAT / L2 index is built on the vector field.
    When the collection is already present nothing is modified.

    Args:
        host: Unused by this function.
        port: Unused by this function.
        collection_name: Name of the collection to look up or create.
        dimension: Dimensionality of the ``content_vector`` field.
    """
    # Guard clause: nothing to build when the collection is already there.
    if utility.has_collection(collection_name):
        print(f"Collection {collection_name} already exists.")
        return

    id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
    path_field = FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=500)
    vector_field = FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=dimension)

    document_schema = CollectionSchema(
        [id_field, path_field, vector_field],
        "Document Vector Store",
    )
    new_collection = Collection(collection_name, document_schema, consistency_level="Strong")

    # Index the embedding column so it can be searched by L2 distance.
    new_collection.create_index(
        field_name="content_vector",
        index_params={
            "metric_type": "L2",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 1024},
        },
    )
    print(f"Collection {collection_name} created and index built.")
def load_data_to_milvus(host, port, collection_name):
    """
    Loads data from a pickled DataFrame into Milvus, using sentence embeddings.

    The first ``.pkl`` file (by sorted file name) in the ``extraction``
    directory is read with pandas. Its ``content`` column is encoded with the
    module-level ``model`` and inserted, together with the ``path`` column,
    into the named collection, which is then flushed.

    Args:
        host: Unused by this function.
        port: Unused by this function.
        collection_name: Name of the existing collection to insert into.

    Returns:
        None. When the ``extraction`` directory is missing or holds no
        ``.pkl`` file, a message is printed and nothing is inserted.
    """
    extraction_dir = "extraction"
    try:
        # Sorted so the chosen file does not depend on the arbitrary order
        # in which os.listdir returns directory entries.
        pkl_files = sorted(f for f in os.listdir(extraction_dir) if f.endswith('.pkl'))
    except FileNotFoundError:
        # A missing directory is treated the same as an empty one.
        pkl_files = []
    if not pkl_files:
        print("No .pkl files found in the 'extraction' directory.")
        return
    df_path = os.path.join(extraction_dir, pkl_files[0])
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by a trusted extraction step.
    df = pd.read_pickle(df_path)
    # Generate sentence embeddings in one batched call instead of one model
    # invocation per row.
    content_vectors = model.encode(df['content'].tolist()).tolist()
    # Column order: path first, then content_vector (the id is auto-generated).
    data_to_insert = [
        df['path'].tolist(),
        content_vectors,
    ]
    collection = Collection(collection_name)
    collection.insert(data_to_insert)
    collection.flush()
    print(f"Data from {df_path} loaded into Milvus collection {collection_name}.")
def connect_to_milvus(host, port, max_retries, retry_delay):
    """Open a Milvus connection, making at most ``max_retries`` attempts.

    Args:
        host: Milvus server host.
        port: Milvus server port.
        max_retries: Maximum number of connection attempts.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        True as soon as an attempt succeeds; False when every attempt fails
        or when ``max_retries`` allows no attempt at all.
    """
    failed_attempts = 0
    while failed_attempts < max_retries:
        try:
            connections.connect(host=host, port=port)
            print(f"Successfully connected to Milvus at {host}:{port}")
            return True
        except Exception as err:
            print(f"Error connecting to Milvus: {err}")
            failed_attempts += 1
            if failed_attempts >= max_retries:
                # Attempts exhausted: report and fall through to the failure return.
                print("Max retries reached. Could not connect to Milvus.")
                break
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
    return False
def initialize_milvus(host, port, collection_name, dimension, max_retries, retry_delay):
    """Initializes Milvus with parameters.

    Connects (with retries), makes sure the collection exists, loads the
    extracted data into it, and closes the ``default`` connection again.

    Args:
        host: Milvus server host.
        port: Milvus server port.
        collection_name: Collection to create (if needed) and load into.
        dimension: Size of the embedding vectors stored in the collection.
        max_retries: Maximum number of connection attempts.
        retry_delay: Seconds to wait between connection attempts.

    Returns:
        True if successfully connected and initialized, False otherwise.
    """
    if not connect_to_milvus(host, port, max_retries, retry_delay):
        return False  # Connection failed

    try:
        try:
            create_milvus_collection(host, port, collection_name, dimension)
            load_data_to_milvus(host, port, collection_name)
        finally:
            # Always release the connection, including when creation or
            # loading raised; previously it stayed open on failure.
            connections.disconnect(alias='default')
    except Exception as e:
        # Covers errors from collection creation, data loading and disconnect.
        print(f"Error during initialization: {e}")
        return False
    return True
if __name__ == "__main__":
    # Each setting is read from its environment variable when set, otherwise
    # the module-level default is used.
    milvus_host = os.environ.get('MILVUS_HOST', DEFAULT_MILVUS_HOST)
    milvus_port = os.environ.get('MILVUS_PORT', DEFAULT_MILVUS_PORT)
    collection_name = os.environ.get('COLLECTION_NAME', DEFAULT_COLLECTION_NAME)
    # Environment values arrive as strings, so numeric settings are cast to int.
    dimension = int(os.environ.get('DIMENSION', DEFAULT_DIMENSION))
    max_retries = int(os.environ.get('MAX_RETRIES', DEFAULT_MAX_RETRIES))
    retry_delay = int(os.environ.get('RETRY_DELAY', DEFAULT_RETRY_DELAY))
    initialize_milvus(milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay)