Spaces:

tall-tree
/

README

No application file

App Files Files Community

talltree commited on Feb 20, 2024

Commit

70789a8

verified ·

1 Parent(s): b79c8d7

Upload 3 files

Browse files

Files changed (3) hide show

utils/__init__.py +0 -0
utils/data_processing.py +66 -0
utils/update_vector_database.py +223 -0

utils/__init__.py ADDED Viewed

File without changes

utils/data_processing.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import pandas as pd
+def format_docs(docs):
+    """Print the contents of a list of Langchain Documents.
+    Args:
+        docs (str):
+    """
+    print(
+        f"\n{'-' * 100}\n".join(
+            [f"Document {i+1}:\n\n" +
+                d.page_content for i, d in enumerate(docs)]
+        )
+    )
+def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
+    """Load an Excel file, clean its contents, and generate a pd.Dataframe.
+    Args:
+        data_directory (str): File path to the directory where the Excel file is located.
+    Raises:
+        FileNotFoundError: If no Excel files are found in the specified directory.
+    Returns:
+        pd.Dataframe:
+    """
+    # Get the xls file name (one excel worksheet)
+    excel_files = [file for file in data_directory.iterdir()
+                   if file.suffix == '.xlsx']
+    if not excel_files:
+        raise FileNotFoundError(
+            "No Excel files found in the specified directory.")
+    if len(excel_files) > 1:
+        raise ValueError(
+            "More than one Excel file found in the specified directory.")
+    path = excel_files[0]
+    # Load Excel file
+    df = pd.read_excel(path, engine='openpyxl')
+    # Change column names to title case
+    df.columns = df.columns.str.title()
+    # Function to replace curly apostrophes with straight ones
+    def replace_apostrophes(text):
+        if isinstance(text, str):
+            return text.replace("\u2019", "'")
+        return text
+    # Clean data
+    # Trim strings, standardize text (convert to title case), and replace apostrophes
+    for col in df.columns:
+        # If the column is text-based
+        if col.lower() != 'booking link' and df[col].dtype == 'object':
+            # Trim, standardize case, and replace apostrophes
+            df[col] = df[col].str.strip().str.title().apply(replace_apostrophes)
+    # Handle missing values
+    df.fillna('Information Not Available', inplace=True)
+    return df

utils/update_vector_database.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import json
+import os
+import sys
+from functools import cache
+from pathlib import Path
+import torch
+from langchain_community.retrievers import QdrantSparseVectorRetriever
+from langchain_community.vectorstores import Qdrant
+from langchain_core.documents import Document
+from langchain_openai.embeddings import OpenAIEmbeddings
+from qdrant_client import QdrantClient, models
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+from data_processing import excel_to_dataframe
+class DataProcessor:
+    def __init__(self, data_dir: Path):
+        self.data_dir = data_dir
+    @staticmethod
+    def categorize_location(location):
+        if any(place in location.lower() for place in ['cordova bay', 'james bay']):
+            return 'Victoria'
+        return location
+    def load_practitioners_data(self):
+        try:
+            df = excel_to_dataframe(self.data_dir)
+            df['City'] = df['Location'].apply(self.categorize_location)
+            practitioners_data = []
+            for idx, row in df.iterrows():
+                # I am using dot as a separator for text embeddings
+                content = '. '.join(
+                    f"{key}: {value}" for key, value in row.items())
+                doc = Document(page_content=content, metadata={'row': idx})
+                practitioners_data.append(doc)
+            return practitioners_data
+        except FileNotFoundError:
+            sys.exit(
+                "Directory or Excel file not found. Please check the path and try again.")
+    def load_tall_tree_data(self):
+        # Check if the file has a .json extension
+        json_files = [file for file in self.data_dir.iterdir()
+                      if file.suffix == '.json']
+        if not json_files:
+            raise FileNotFoundError(
+                "No JSON files found in the specified directory.")
+        if len(json_files) > 1:
+            raise ValueError(
+                "More than one JSON file found in the specified directory.")
+        path = json_files[0]
+        data = self.load_json_file(path)
+        tall_tree_data = self.process_json_data(data)
+        return tall_tree_data
+    def load_json_file(self, path):
+        try:
+            with open(path, 'r') as f:
+                data = json.load(f)
+            return data
+        except json.JSONDecodeError:
+            raise ValueError(f"The file {path} is not a valid JSON file.")
+    def process_json_data(self, data):
+        tall_tree_data = []
+        for idx, (key, value) in enumerate(data.items()):
+            content = f"{key}: {value}"
+            doc = Document(page_content=content, metadata={'row': idx})
+            tall_tree_data.append(doc)
+        return tall_tree_data
+class DenseVectorStore:
+    """Store dense data in Qdrant vector database."""
+    def __init__(self, documents: list[Document], embeddings: OpenAIEmbeddings, collection_name: str = 'practitioners_db'):
+        self.validate_environment_variables()
+        self.qdrant_db = Qdrant.from_documents(
+            documents,
+            embeddings,
+            url=os.getenv("QDRANT_URL"),
+            prefer_grpc=True,
+            api_key=os.getenv(
+                "QDRANT_API_KEY"),
+            collection_name=collection_name,
+            force_recreate=True)
+    def validate_environment_variables(self):
+        required_vars = ["QDRANT_API_KEY", "QDRANT_URL"]
+        for var in required_vars:
+            if not os.getenv(var):
+                raise EnvironmentError(f"Missing environment variable: {var}")
+    def get_db(self):
+        return self.qdrant_db
+class SparseVectorStore:
+    """Store sparse vectors in Qdrant vector database using SPLADE neural retrieval model."""
+    def __init__(self, documents: list[Document], collection_name: str, vector_name: str, k: int = 4, splade_model_id: str = "naver/splade-cocondenser-ensembledistil"):
+        self.validate_environment_variables()
+        self.client = QdrantClient(url=os.getenv(
+            "QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
+        self.model_id = splade_model_id
+        self.tokenizer, self.model = self.set_tokenizer_config()
+        self.collection_name = collection_name
+        self.vector_name = vector_name
+        self.k = k
+        self.sparse_retriever = self.create_sparse_retriever()
+        self.add_documents(documents)
+    def validate_environment_variables(self):
+        required_vars = ["QDRANT_API_KEY", "QDRANT_URL"]
+        for var in required_vars:
+            if not os.getenv(var):
+                raise EnvironmentError(f"Missing environment variable: {var}")
+    @cache
+    def set_tokenizer_config(self):
+        """Initialize the tokenizer and the SPLADE neural retrieval model.
+        See to https://huggingface.co/naver/splade-cocondenser-ensembledistil for more details.
+        """
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        model = AutoModelForMaskedLM.from_pretrained(self.model_id)
+        return tokenizer, model
+    @cache
+    def sparse_encoder(self, text: str) -> tuple[list[int], list[float]]:
+        """This function encodes the input text into a sparse vector. The sparse_encoder is required for the QdrantSparseVectorRetriever.
+        Adapted from the Qdrant documentation: Computing the Sparse Vector code.
+        Args:
+            text (str): Text to encode
+        Returns:
+            tuple[list[int], list[float]]: Indices and values of the sparse vector
+        """
+        tokens = self.tokenizer(
+            text, return_tensors="pt", max_length=512, padding="max_length", truncation=True)
+        output = self.model(**tokens)
+        logits, attention_mask = output.logits, tokens.attention_mask
+        relu_log = torch.log(1 + torch.relu(logits))
+        weighted_log = relu_log * attention_mask.unsqueeze(-1)
+        max_val, _ = torch.max(weighted_log, dim=1)
+        vec = max_val.squeeze()
+        indices = vec.nonzero().numpy().flatten()
+        values = vec.detach().numpy()[indices]
+        return indices.tolist(), values.tolist()
+    def create_sparse_retriever(self):
+        self.client.recreate_collection(
+            self.collection_name,
+            vectors_config={},
+            sparse_vectors_config={
+                self.vector_name: models.SparseVectorParams(
+                    index=models.SparseIndexParams(
+                        on_disk=False,
+                    )
+                )
+            },
+        )
+        return QdrantSparseVectorRetriever(
+            client=self.client,
+            collection_name=self.collection_name,
+            sparse_vector_name=self.vector_name,
+            sparse_encoder=self.sparse_encoder,
+            k=self.k,
+        )
+    def add_documents(self, documents):
+        self.sparse_retriever.add_documents(documents)
+def main():
+    data_dir = Path().resolve().parent / "data"
+    if not data_dir.exists():
+        sys.exit(f"The directory {data_dir} does not exist.")
+    processor = DataProcessor(data_dir)
+    print("Loading and cleaning Practitioners data...")
+    practitioners_dataset = processor.load_practitioners_data()
+    print("Loading Tall Tree data from json file...")
+    tall_tree_dataset = processor.load_tall_tree_data()
+    # Set OpenAI embeddings model
+    # TODO: Test new embeddings model text-embedding-3-small
+    embeddings_model = "text-embedding-ada-002"
+    openai_embeddings = OpenAIEmbeddings(model=embeddings_model)
+    # Store both datasets in Qdrant
+    print(f"Storing dense vectors in Qdrant using {embeddings_model}...")
+    practitioners_db = DenseVectorStore(practitioners_dataset,
+                                        openai_embeddings,
+                                        collection_name="practitioners_db").get_db()
+    tall_tree_db = DenseVectorStore(tall_tree_dataset,
+                                    openai_embeddings,
+                                    collection_name="tall_tree_db").get_db()
+    print(f"Storing sparse vectors in Qdrant using SPLADE neural retrieval model...")
+    practitioners_sparse_vector_db = SparseVectorStore(
+        documents=practitioners_dataset,
+        collection_name="practitioners_db_sparse_collection",
+        vector_name="sparse_vector",
+        k=15,
+        splade_model_id="naver/splade-cocondenser-ensembledistil",
+    )
+if __name__ == "__main__":
+    main()