Spaces:

airabbitX
/

mongo-vector-search-util

Running

App Files Files Community

airabbitX committed on Jan 28

Commit

50e3a95

verified ·

1 Parent(s): 124432f

Upload 3 files

Browse files

Files changed (3) hide show

ui/__init__.py +8 -0
ui/embeddings_tab.py +192 -0
ui/search_tab.py +142 -0

ui/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# UI package for MongoDB Vector Search Tool
+from ui.embeddings_tab import create_embeddings_tab
+from ui.search_tab import create_search_tab
+__all__ = [
+    'create_embeddings_tab',
+    'create_search_tab'
+]

ui/embeddings_tab.py ADDED Viewed

	@@ -0,0 +1,192 @@

+import gradio as gr
+from typing import Tuple, Optional, List
+from openai import OpenAI
+from utils.db_utils import DatabaseUtils
+from utils.embedding_utils import parallel_generate_embeddings
+def create_embeddings_tab(openai_client: OpenAI, db_utils: DatabaseUtils, databases: List[str]) -> Tuple[gr.Tab, dict]:
+    """Create the embeddings generation tab UI
+    Args:
+        openai_client: OpenAI client instance
+        db_utils: DatabaseUtils instance
+        databases: List of available databases
+    Returns:
+        Tuple[gr.Tab, dict]: The tab component and its interface elements
+    """
+    def update_collections(db_name: str) -> gr.Dropdown:
+        """Update collections dropdown when database changes"""
+        collections = db_utils.get_collections(db_name)
+        # If there's only one collection, select it by default
+        value = collections[0] if len(collections) == 1 else None
+        return gr.Dropdown(choices=collections, value=value)
+    def update_fields(db_name: str, collection_name: str) -> gr.Dropdown:
+        """Update fields dropdown when collection changes"""
+        if db_name and collection_name:
+            fields = db_utils.get_field_names(db_name, collection_name)
+            return gr.Dropdown(choices=fields)
+        return gr.Dropdown(choices=[])
+    def generate_embeddings(
+        db_name: str,
+        collection_name: str,
+        field_name: str,
+        embedding_field: str,
+        limit: int = 10,
+        progress=gr.Progress()
+    ) -> Tuple[str, str]:
+        """Generate embeddings for documents with progress tracking"""
+        try:
+            db = db_utils.client[db_name]
+            collection = db[collection_name]
+            # Count documents that need embeddings
+            total_docs = collection.count_documents({field_name: {"$exists": True}})
+            if total_docs == 0:
+                return f"No documents found with field '{field_name}'", ""
+            # Get total count of documents that need processing
+            query = {
+                field_name: {"$exists": True},
+                embedding_field: {"$exists": False}  # Only get docs without embeddings
+            }
+            total_to_process = collection.count_documents(query)
+            if total_to_process == 0:
+                return "No documents found that need embeddings", ""
+            # Apply limit if specified
+            if limit > 0:
+                total_to_process = min(total_to_process, limit)
+            print(f"\nFound {total_to_process} documents that need embeddings...")
+            # Progress tracking
+            progress_text = ""
+            def update_progress(prog: float, processed: int, total: int):
+                nonlocal progress_text
+                progress_text = f"Progress: {prog:.1f}% ({processed}/{total} documents)\n"
+                print(progress_text)  # Terminal logging
+                progress(prog/100, f"Processed {processed}/{total} documents")
+            # Show initial progress
+            update_progress(0, 0, total_to_process)
+            # Create cursor for batch processing
+            cursor = collection.find(query)
+            if limit > 0:
+                cursor = cursor.limit(limit)
+            # Generate embeddings in parallel with cursor-based batching
+            processed = parallel_generate_embeddings(
+                collection=collection,
+                cursor=cursor,
+                field_name=field_name,
+                embedding_field=embedding_field,
+                openai_client=openai_client,
+                total_docs=total_to_process,
+                callback=update_progress
+            )
+            # Return completion message and final progress
+            instructions = f"""
+Successfully generated embeddings for {processed} documents using parallel processing!
+To create the vector search index in MongoDB Atlas:
+1. Go to your Atlas cluster
+2. Click on 'Search' tab
+3. Create an index named 'vector_index' with this configuration:
+{{
+  "fields": [
+    {{
+      "type": "vector",
+      "path": "{embedding_field}",
+      "numDimensions": 1536,
+      "similarity": "dotProduct"
+    }}
+  ]
+}}
+You can now use the search tab with:
+- Field to search: {field_name}
+- Embedding field: {embedding_field}
+"""
+            return instructions, progress_text
+        except Exception as e:
+            return f"Error: {str(e)}", ""
+    # Create the tab UI
+    with gr.Tab("Generate Embeddings") as tab:
+        with gr.Row():
+            db_input = gr.Dropdown(
+                choices=databases,
+                label="Select Database",
+                info="Available databases in Atlas cluster"
+            )
+            collection_input = gr.Dropdown(
+                choices=[],
+                label="Select Collection",
+                info="Collections in selected database"
+            )
+        with gr.Row():
+            field_input = gr.Dropdown(
+                choices=[],
+                label="Select Field for Embeddings",
+                info="Fields available in collection"
+            )
+            embedding_field_input = gr.Textbox(
+                label="Embedding Field Name",
+                value="embedding",
+                info="Field name where embeddings will be stored"
+            )
+            limit_input = gr.Number(
+                label="Document Limit",
+                value=10,
+                minimum=0,
+                info="Number of documents to process (0 for all documents)"
+            )
+        generate_btn = gr.Button("Generate Embeddings")
+        generate_output = gr.Textbox(label="Results", lines=10)
+        progress_output = gr.Textbox(label="Progress", lines=3)
+        # Set up event handlers
+        db_input.change(
+            fn=update_collections,
+            inputs=[db_input],
+            outputs=[collection_input]
+        )
+        collection_input.change(
+            fn=update_fields,
+            inputs=[db_input, collection_input],
+            outputs=[field_input]
+        )
+        generate_btn.click(
+            fn=generate_embeddings,
+            inputs=[
+                db_input,
+                collection_input,
+                field_input,
+                embedding_field_input,
+                limit_input
+            ],
+            outputs=[generate_output, progress_output]
+        )
+    # Return the tab and its interface elements
+    interface = {
+        'db_input': db_input,
+        'collection_input': collection_input,
+        'field_input': field_input,
+        'embedding_field_input': embedding_field_input,
+        'limit_input': limit_input,
+        'generate_btn': generate_btn,
+        'generate_output': generate_output,
+        'progress_output': progress_output
+    }
+    return tab, interface

ui/search_tab.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import gradio as gr
+from typing import Tuple, List
+from openai import OpenAI
+from utils.db_utils import DatabaseUtils
+from utils.embedding_utils import get_embedding
+def create_search_tab(openai_client: OpenAI, db_utils: DatabaseUtils, databases: List[str]) -> Tuple[gr.Tab, dict]:
+    """Create the vector search tab UI
+    Args:
+        openai_client: OpenAI client instance
+        db_utils: DatabaseUtils instance
+        databases: List of available databases
+    Returns:
+        Tuple[gr.Tab, dict]: The tab component and its interface elements
+    """
+    def update_collections(db_name: str) -> gr.Dropdown:
+        """Update collections dropdown when database changes"""
+        collections = db_utils.get_collections(db_name)
+        # If there's only one collection, select it by default
+        value = collections[0] if len(collections) == 1 else None
+        return gr.Dropdown(choices=collections, value=value)
+    def vector_search(
+        query_text: str,
+        db_name: str,
+        collection_name: str,
+        embedding_field: str,
+        index_name: str
+    ) -> str:
+        """Perform vector search using embeddings"""
+        try:
+            print(f"\nProcessing query: {query_text}")
+            db = db_utils.client[db_name]
+            collection = db[collection_name]
+            # Get embeddings for query
+            embedding = get_embedding(query_text, openai_client)
+            print("Generated embeddings successfully")
+            results = collection.aggregate([
+                {
+                    '$vectorSearch': {
+                        "index": index_name,
+                        "path": embedding_field,
+                        "queryVector": embedding,
+                        "numCandidates": 50,
+                        "limit": 5
+                    }
+                },
+                {
+                    "$project": {
+                        "search_score": { "$meta": "vectorSearchScore" },
+                        "document": "$$ROOT"
+                    }
+                }
+            ])
+            # Format results
+            results_list = list(results)
+            formatted_results = []
+            for idx, result in enumerate(results_list, 1):
+                doc = result['document']
+                formatted_result = f"{idx}. Score: {result['search_score']:.4f}\n"
+                # Add all fields except _id and embeddings
+                for key, value in doc.items():
+                    if key not in ['_id', embedding_field]:
+                        formatted_result += f"{key}: {value}\n"
+                formatted_results.append(formatted_result)
+            return "\n".join(formatted_results) if formatted_results else "No results found"
+        except Exception as e:
+            return f"Error: {str(e)}"
+    # Create the tab UI
+    with gr.Tab("Search") as tab:
+        with gr.Row():
+            db_input = gr.Dropdown(
+                choices=databases,
+                label="Select Database",
+                info="Database containing the vectors"
+            )
+            collection_input = gr.Dropdown(
+                choices=[],
+                label="Select Collection",
+                info="Collection containing the vectors"
+            )
+        with gr.Row():
+            embedding_field_input = gr.Textbox(
+                label="Embedding Field Name",
+                value="embedding",
+                info="Field containing the vectors"
+            )
+            index_input = gr.Textbox(
+                label="Vector Search Index Name",
+                value="vector_index",
+                info="Index created in Atlas UI"
+            )
+        query_input = gr.Textbox(
+            label="Search Query",
+            lines=2,
+            placeholder="What would you like to search for?"
+        )
+        search_btn = gr.Button("Search")
+        search_output = gr.Textbox(label="Results", lines=10)
+        # Set up event handlers
+        db_input.change(
+            fn=update_collections,
+            inputs=[db_input],
+            outputs=[collection_input]
+        )
+        search_btn.click(
+            fn=vector_search,
+            inputs=[
+                query_input,
+                db_input,
+                collection_input,
+                embedding_field_input,
+                index_input
+            ],
+            outputs=search_output
+        )
+    # Return the tab and its interface elements
+    interface = {
+        'db_input': db_input,
+        'collection_input': collection_input,
+        'embedding_field_input': embedding_field_input,
+        'index_input': index_input,
+        'query_input': query_input,
+        'search_btn': search_btn,
+        'search_output': search_output
+    }
+    return tab, interface