File size: 5,226 Bytes
8ff45d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import glob
import logging
from pathlib import Path
from typing import List, Optional
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import CSVLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Configure logging
# Root-logger setup for the whole script: INFO level, timestamped lines,
# mirrored to both a persistent file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('vectorize.log'),  # persistent run log on disk
        logging.StreamHandler()  # echo to stderr for interactive runs
    ]
)

class VectorizationError(Exception):
    """Raised when loading documents or building a vector store fails."""

def load_csv_documents(csv_file_path: str) -> List[Document]:
    """Load every CSV file matching a glob pattern into LangChain documents.

    Args:
        csv_file_path (str): Glob pattern used to locate CSV files.

    Returns:
        List[Document]: Documents loaded from all matching CSV files.

    Raises:
        VectorizationError: If no CSV files match the pattern, or if any
            matching file fails to load.
    """
    csv_files = glob.glob(csv_file_path)

    # Checked outside the try block below: previously this raise was caught
    # by the broad except and re-wrapped with a misleading "Error loading
    # CSV documents:" prefix.
    if not csv_files:
        raise VectorizationError(f"No CSV files found at path: {csv_file_path}")

    documents: List[Document] = []
    try:
        for csv_file in csv_files:
            logging.info(f"Loading CSV file: {csv_file}")
            loader = CSVLoader(csv_file, encoding="utf-8")
            documents.extend(loader.load())
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise VectorizationError(f"Error loading CSV documents: {str(e)}") from e

    logging.info(f"Successfully loaded {len(documents)} documents from {len(csv_files)} CSV files")
    return documents

def create_vector_store(
    documents: List[Document],
    embeddings_model: HuggingFaceEmbeddings,
    output_path: str,
    chunk_size: int = 500,
    chunk_overlap: int = 50
) -> Optional[FAISS]:
    """Create and save a FAISS vector store from documents.

    Args:
        documents (List[Document]): List of documents to vectorize.
        embeddings_model (HuggingFaceEmbeddings): The embeddings model to use.
        output_path (str): Path to save the FAISS index.
        chunk_size (int, optional): Size of text chunks. Defaults to 500.
        chunk_overlap (int, optional): Overlap between chunks. Defaults to 50.

    Returns:
        Optional[FAISS]: The created FAISS index if successful, None otherwise.
    """
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        chunked_documents = text_splitter.split_documents(documents)
        logging.info(f"Created {len(chunked_documents)} chunks from {len(documents)} documents")

        faiss_index = FAISS.from_documents(chunked_documents, embeddings_model)

        # Ensure the target directory exists before saving the index.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        faiss_index.save_local(output_path)
        logging.info(f"Successfully saved FAISS index to {output_path}")

        return faiss_index

    except Exception as e:
        # logging.exception records the full traceback (logging.error did not),
        # while preserving the caller-facing contract of returning None.
        logging.exception(f"Error creating vector store: {str(e)}")
        return None

def main():
    """Entry point: load the source CSVs and build a FAISS index per corpus."""
    try:
        # All paths are relative to the working directory.
        msd_data_path = "./processed_data/msd/msd_processed.csv"
        medical_csv_path = "./processed_data/cbip/*.csv"
        msd_vector_path = "./vectors_data/msd_data_vec"
        medical_vector_path = "./vectors_data/med_data_vec"
        model_name = "sentence-transformers/all-MiniLM-L12-v2"

        # Output directory must exist before any index is saved.
        Path("./vectors_data").mkdir(exist_ok=True)

        logging.info("Starting vectorization process")

        # Load both document corpora up front.
        msd_data_documents = load_csv_documents(msd_data_path)
        medical_documents = load_csv_documents(medical_csv_path)

        # One embeddings model is shared by both vector stores.
        logging.info(f"Initializing embeddings model: {model_name}")
        embeddings_model = HuggingFaceEmbeddings(model_name=model_name)

        # Build and persist each index; a failure yields None, not an exception.
        msd_index = create_vector_store(msd_data_documents, embeddings_model, msd_vector_path)
        medical_index = create_vector_store(medical_documents, embeddings_model, medical_vector_path)

        if msd_index and medical_index:
            logging.info("Vectorization process completed successfully")
        else:
            logging.error("Vectorization process completed with errors")

    except VectorizationError as ve:
        logging.error(f"Vectorization error: {str(ve)}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        raise

if __name__ == "__main__":
    # Run the full vectorization pipeline when executed as a script.
    main()