File size: 4,567 Bytes
306849a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409f88f
306849a
409f88f
 
 
 
 
 
 
 
 
 
 
 
 
 
306849a
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# milvus.py
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import pandas as pd
import os
import sys
from sentence_transformers import SentenceTransformer
import time

# Default Milvus connection details.
# The __main__ entry point of this file lets the MILVUS_HOST, MILVUS_PORT,
# COLLECTION_NAME, DIMENSION, MAX_RETRIES and RETRY_DELAY environment
# variables override these values.
DEFAULT_MILVUS_HOST = 'localhost'
DEFAULT_MILVUS_PORT = '19530'  # kept as a string, like the MILVUS_PORT env value
DEFAULT_COLLECTION_NAME = 'document_collection'
# Dimension of the "content_vector" field created for the collection.
# It has to equal the length of the vectors produced by `model` below
# (NOTE(review): all-MiniLM-L6-v2 is documented as 384-dim; re-check if the
# model name ever changes).
DEFAULT_DIMENSION = 384  # Adjust based on your embedding model
DEFAULT_MAX_RETRIES = 3  # total number of connection attempts
DEFAULT_RETRY_DELAY = 5  # seconds

# Embedding model, instantiated once when this module is imported and shared
# by load_data_to_milvus().
model = SentenceTransformer('all-MiniLM-L6-v2')

def create_milvus_collection(host, port, collection_name, dimension):
    """
    Ensure the Milvus collection `collection_name` exists.

    When the collection is missing it is created with three fields
    (auto-generated INT64 primary key "id", VARCHAR "path" limited to 500
    characters, FLOAT_VECTOR "content_vector" of size `dimension`) and an
    IVF_FLAT / L2 index is built on "content_vector". When it already
    exists nothing is changed.

    `host` and `port` are accepted for signature symmetry with the other
    helpers but are not used here.
    """
    # Guard clause: nothing to create when the collection is already there.
    if utility.has_collection(collection_name):
        print(f"Collection {collection_name} already exists.")
        return

    id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
    path_field = FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=500)
    vector_field = FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=dimension)

    schema = CollectionSchema([id_field, path_field, vector_field], "Document Vector Store")
    new_collection = Collection(collection_name, schema, consistency_level="Strong")

    # Index the embedding field so it can be searched by L2 distance.
    new_collection.create_index(
        field_name="content_vector",
        index_params={
            "metric_type": "L2",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 1024},
        },
    )
    print(f"Collection {collection_name} created and index built.")

def load_data_to_milvus(host, port, collection_name):
    """
    Embed the documents of one pickled DataFrame and insert them into Milvus.

    The first `.pkl` file (in sorted name order) found in the local
    "extraction" directory is read with pandas. Its "content" column is
    encoded with the module-level sentence-embedding `model`, and the
    "path" values are inserted together with the resulting vectors into
    the collection `collection_name`, followed by a flush.

    If the directory holds no `.pkl` file, or the DataFrame has no rows,
    a message is printed and nothing is inserted.

    Args:
        host: Unused here; kept for signature compatibility.
        port: Unused here; kept for signature compatibility.
        collection_name: Name of an existing Milvus collection.

    Raises:
        FileNotFoundError: If the "extraction" directory does not exist.
        KeyError: If the DataFrame lacks a "path" or "content" column.
    """
    extraction_dir = "extraction"
    # os.listdir() returns entries in arbitrary order; sort so that the same
    # file is selected on every run and on every platform.
    pkl_files = sorted(f for f in os.listdir(extraction_dir) if f.endswith('.pkl'))
    if not pkl_files:
        print("No .pkl files found in the 'extraction' directory.")
        return

    df_path = os.path.join(extraction_dir, pkl_files[0])
    # NOTE(security): unpickling can execute arbitrary code. Only point this
    # at files produced by a trusted extraction step.
    df = pd.read_pickle(df_path)

    # An empty frame would only produce an empty (and rejected) insert.
    if df.empty:
        print(f"No rows found in {df_path}; nothing to load.")
        return

    paths = df['path'].tolist()
    # Encode all documents in one batched call instead of one call per row.
    vectors = model.encode(df['content'].tolist()).tolist()

    # Column-ordered payload: the auto-generated "id" field is omitted.
    collection = Collection(collection_name)
    collection.insert([paths, vectors])
    collection.flush()

    print(f"Data from {df_path} loaded into Milvus collection {collection_name}.")

def connect_to_milvus(host, port, max_retries, retry_delay):
    """Connects to Milvus with retries.

    Args:
        host: Milvus server host name.
        port: Milvus server port.
        max_retries: Total number of connection attempts to make.
        retry_delay: Seconds to sleep between two failed attempts.

    Returns:
        True as soon as an attempt succeeds, False when every attempt
        failed (including the case where `max_retries` is zero or negative
        and no attempt is made at all).
    """
    for attempt in range(1, max_retries + 1):
        try:
            connections.connect(host=host, port=port)
        except Exception as e:  # boundary: any client error counts as a failed attempt
            print(f"Error connecting to Milvus: {e}")
            # Only wait when another attempt is going to follow.
            if attempt < max_retries:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
        else:
            print(f"Successfully connected to Milvus at {host}:{port}")
            return True

    # Reached when all attempts failed or none was allowed; always a real
    # boolean so callers never receive None.
    print("Max retries reached. Could not connect to Milvus.")
    return False

def initialize_milvus(host, port, collection_name, dimension, max_retries, retry_delay):
    """Initializes Milvus with parameters.

    Connects to the server (with retries), makes sure the collection
    exists, loads the extracted documents into it and finally closes the
    'default' connection again.

    Args:
        host: Milvus server host name.
        port: Milvus server port.
        collection_name: Collection to create (if missing) and fill.
        dimension: Size of the embedding vectors stored in the collection.
        max_retries: Number of connection attempts.
        retry_delay: Seconds to wait between connection attempts.

    Returns:
        True if successfully connected and initialized, False otherwise.
    """
    if not connect_to_milvus(host, port, max_retries, retry_delay):
        return False  # Connection failed; nothing to clean up

    try:
        try:
            create_milvus_collection(host, port, collection_name, dimension)
            load_data_to_milvus(host, port, collection_name)
        finally:
            # Release the connection on failure as well as on success so a
            # failed initialization does not leave it open.
            connections.disconnect(alias='default')
    except Exception as e:
        print(f"Error during initialization: {e}")
        return False  # Collection creation, data loading or disconnect failed

    return True

if __name__ == "__main__":
    # Environment variables take precedence over the module-level defaults.
    milvus_host = os.environ.get('MILVUS_HOST', DEFAULT_MILVUS_HOST)
    milvus_port = os.environ.get('MILVUS_PORT', DEFAULT_MILVUS_PORT)
    collection_name = os.environ.get('COLLECTION_NAME', DEFAULT_COLLECTION_NAME)
    dimension = int(os.environ.get('DIMENSION', DEFAULT_DIMENSION))
    max_retries = int(os.environ.get('MAX_RETRIES', DEFAULT_MAX_RETRIES))
    retry_delay = int(os.environ.get('RETRY_DELAY', DEFAULT_RETRY_DELAY))

    succeeded = initialize_milvus(
        milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay
    )
    # Report failure through the exit status so shells, containers and
    # schedulers can tell that initialization did not complete.
    sys.exit(0 if succeeded else 1)