File size: 5,226 Bytes
8ff45d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import glob
import logging
from pathlib import Path
from typing import List, Optional
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import CSVLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Configure logging
# Root-logger setup for the whole script: INFO level, timestamped lines,
# mirrored to both a persistent file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('vectorize.log'),  # persistent run log on disk
        logging.StreamHandler()  # echo to stderr for interactive runs
    ]
)

class VectorizationError(Exception):
    """Raised when loading documents or building a vector store fails."""

def load_csv_documents(csv_file_path: str) -> List[Document]:
    """Load every CSV file matching a glob pattern into LangChain documents.

    Args:
        csv_file_path (str): Glob pattern used to locate CSV files.

    Returns:
        List[Document]: Documents loaded from all matching CSV files.

    Raises:
        VectorizationError: If no CSV files match the pattern, or if any
            matching file fails to load.
    """
    csv_files = glob.glob(csv_file_path)

    # Checked outside the try block below: previously this raise was caught
    # by the broad except and re-wrapped with a misleading "Error loading
    # CSV documents:" prefix.
    if not csv_files:
        raise VectorizationError(f"No CSV files found at path: {csv_file_path}")

    documents: List[Document] = []
    try:
        for csv_file in csv_files:
            logging.info(f"Loading CSV file: {csv_file}")
            loader = CSVLoader(csv_file, encoding="utf-8")
            documents.extend(loader.load())
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise VectorizationError(f"Error loading CSV documents: {str(e)}") from e

    logging.info(f"Successfully loaded {len(documents)} documents from {len(csv_files)} CSV files")
    return documents

def create_vector_store(
    documents: List[Document],
    embeddings_model: HuggingFaceEmbeddings,
    output_path: str,
    chunk_size: int = 500,
    chunk_overlap: int = 50
) -> Optional[FAISS]:
    """Create and save a FAISS vector store from documents.

    Args:
        documents (List[Document]): List of documents to vectorize.
        embeddings_model (HuggingFaceEmbeddings): The embeddings model to use.
        output_path (str): Path to save the FAISS index.
        chunk_size (int, optional): Size of text chunks. Defaults to 500.
        chunk_overlap (int, optional): Overlap between chunks. Defaults to 50.

    Returns:
        Optional[FAISS]: The created FAISS index if successful, None otherwise.
    """
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        chunked_documents = text_splitter.split_documents(documents)
        logging.info(f"Created {len(chunked_documents)} chunks from {len(documents)} documents")

        faiss_index = FAISS.from_documents(chunked_documents, embeddings_model)

        # Ensure the target directory exists before saving the index.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        faiss_index.save_local(output_path)
        logging.info(f"Successfully saved FAISS index to {output_path}")

        return faiss_index

    except Exception as e:
        # logging.exception records the full traceback (logging.error did not),
        # while preserving the caller-facing contract of returning None.
        logging.exception(f"Error creating vector store: {str(e)}")
        return None

def main():
    """Entry point: load the source CSVs and build a FAISS index per corpus."""
    try:
        # All paths are relative to the working directory.
        msd_data_path = "./processed_data/msd/msd_processed.csv"
        medical_csv_path = "./processed_data/cbip/*.csv"
        msd_vector_path = "./vectors_data/msd_data_vec"
        medical_vector_path = "./vectors_data/med_data_vec"
        model_name = "sentence-transformers/all-MiniLM-L12-v2"

        # Output directory must exist before any index is saved.
        Path("./vectors_data").mkdir(exist_ok=True)

        logging.info("Starting vectorization process")

        # Load both document corpora up front.
        msd_data_documents = load_csv_documents(msd_data_path)
        medical_documents = load_csv_documents(medical_csv_path)

        # One embeddings model is shared by both vector stores.
        logging.info(f"Initializing embeddings model: {model_name}")
        embeddings_model = HuggingFaceEmbeddings(model_name=model_name)

        # Build and persist each index; a failure yields None, not an exception.
        msd_index = create_vector_store(msd_data_documents, embeddings_model, msd_vector_path)
        medical_index = create_vector_store(medical_documents, embeddings_model, medical_vector_path)

        if msd_index and medical_index:
            logging.info("Vectorization process completed successfully")
        else:
            logging.error("Vectorization process completed with errors")

    except VectorizationError as ve:
        logging.error(f"Vectorization error: {str(ve)}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        raise

if __name__ == "__main__":
    # Run the full vectorization pipeline when executed as a script.
    main()