File size: 3,265 Bytes
56f7920
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from typing import List, Optional

from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import (FilterOperator, MetadataFilter,
                                            MetadataFilters)
from pydantic import BaseModel, Field

from .config import get_vector_store


class Node(BaseModel):
    """A single retrieved source chunk returned alongside a query response."""

    # BUG FIX: the strings were previously passed positionally, which pydantic
    # treats as the field *default value*, not its description — that made every
    # field optional with a nonsense default. `description=` is the intent.
    file_name: str = Field(..., description="Name of the file")
    url: str = Field(..., description="GitHub repo url of the file")
    score: float = Field(..., description="Relevance score of the node")
    content: str = Field(..., description="Content of the node")


class ContextResponseModel(BaseModel):
    """Response envelope pairing a generated answer with its source nodes."""

    # BUG FIX: the strings were previously passed positionally, which pydantic
    # treats as the field *default value*, not its description. `source_nodes`
    # even received a plain string as the default for a list-typed field.
    response: str = Field(..., description="Response for user's query")
    # None when no sources were retrieved for the answer.
    source_nodes: Optional[List[Node]] = Field(
        default=None, description="List of sources used to generate response"
    )


class QueryRetriever:
    """Retrieves documentation context for a single GitHub repository.

    Wraps a ``VectorStoreIndex`` with a metadata filter so that every query
    is scoped to the repository given at construction time.
    """

    def __init__(self, repo):
        """Build the index wrapper and a repo-scoped metadata filter.

        Args:
            repo (str): Repository identifier matched exactly against the
                ``metadata.repo`` key of each indexed node.
        """
        self.vector_store_index = VectorStoreIndex.from_vector_store(get_vector_store())
        # Restrict all retrievals to nodes belonging to this repo.
        self.filters = MetadataFilters(
            filters=[
                MetadataFilter(
                    key="metadata.repo",
                    value=repo,
                    operator=FilterOperator.EQ,
                )
            ]
        )

    def make_query(self, query: str, mode: str = "default") -> dict:
        """
        Retrieve relevant documentation context for a given query using specified retrieval mode.

        This function is designed to support Retrieval-Augmented Generation (RAG) by extracting
        the most relevant context chunks from indexed documentation sources.

        Args:
            query (str): The user's input query related to the documentation.
            mode (str, optional): Retrieval strategy to use. One of:
                - "default": Standard semantic similarity search.
                - "text_search": Keyword-based search.
                - "hybrid": Combines semantic and keyword-based methods.
                Defaults to "default".

        Returns:
            dict: Dictionary with 'response' and 'source_nodes' keys.
        """
        query_engine = self.vector_store_index.as_query_engine(
            similarity_top_k=5,
            vector_store_query_mode=mode,
            filters=self.filters,
            response_mode="refine",
        )

        response = query_engine.query(query)
        nodes = [
            {
                "file_name": node.metadata.get("file_name", "Unknown"),
                "url": node.metadata.get("url", "#"),
                # `is not None` instead of truthiness: a legitimate score of
                # 0.0 should be kept as-is, not conflated with "missing".
                "score": float(node.score) if node.score is not None else 0.0,
                "content": node.get_content(),
            }
            for node in response.source_nodes
        ]

        return {"response": str(response.response), "source_nodes": nodes}

    @staticmethod
    def get_available_repos() -> List[str]:
        """Get list of available repositories in the vector store."""
        try:
            # Local import keeps module load light and avoids a hard failure
            # at import time if the DB config is unavailable.
            from .config import get_available_repos as get_repos_from_db

            print("fetching repos")
            # Renamed from `re`, which shadowed the stdlib regex module name.
            repos = get_repos_from_db()
            print(repos)
            return repos
        except Exception as e:
            # Deliberate best-effort: on any DB error, fall back to a
            # hardcoded list rather than surface the failure to callers.
            print(f"Error getting repos from database: {e}")
            # Fallback to hardcoded list
            return ["mindsdb/mindsdb", "run-llama/llama_index"]