Antoine Chaffin committed
Commit f25eebc · 1 Parent(s): b75393b

Initial commit

Files changed (3)
  1. app.py +43 -0
  2. core.py +115 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,43 @@
+"""
+MCPyLate Server
+
+A Model Context Protocol server that provides search functionality using PyLate.
+"""
+
+from typing import Any, Dict, List, Optional
+
+import gradio as gr
+from huggingface_hub import snapshot_download
+
+from core import MCPyLate
+
+# Download the prebuilt index from the Hugging Face Hub before starting the server.
+snapshot_download(
+    repo_id="lightonai/leetcode_reasonmoderncolbert",
+    local_dir="indexes/",
+    repo_type="dataset",
+)
+mcpylate = MCPyLate()
+
+
+def pylate_search_leetcode(
+    query: str, k: int = 10, index_name: Optional[str] = None
+) -> List[Dict[str, Any]]:
+    """
+    Search the LeetCode collection of coding-problem solutions with a PyLate
+    multi-vector model and return the top-k hits.
+
+    Args:
+        query: Search query string
+        k: Number of results to return (default: 10)
+        index_name: Name of the index to search (currently unused; the default
+            index is always searched)
+
+    Returns:
+        List of search results, each with docid, score, and document text
+    """
+    return mcpylate.search(query, k)
+
+
+demo = gr.Interface(
+    fn=pylate_search_leetcode,
+    inputs=["text"],
+    outputs="text",
+    title="LeetCode Search",
+    description="Search the LeetCode index using PyLate",
+)
+
+# Expose the interface both as a web demo and as an MCP server.
+demo.launch(mcp_server=True, share=True)
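
With demo.launch(mcp_server=True, share=True) running, the search function is also reachable as a regular Gradio API endpoint. Below is a minimal client sketch, not part of the commit: it assumes the server is up at the default local address (or the share URL printed at launch) and that gr.Interface exposes its single function under the default api_name "/predict".

from gradio_client import Client

# Hypothetical deployment URL; replace with the share URL printed at launch.
client = Client("http://127.0.0.1:7860")

# One positional argument, matching inputs=["text"]; k and index_name keep their defaults.
results = client.predict(
    "two sum using a hash map",
    api_name="/predict",  # assumed default endpoint name for a gr.Interface function
)
print(results)  # stringified list of {"docid", "score", "text"} hits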
core.py ADDED
@@ -0,0 +1,115 @@
+import logging
+import os
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+import mteb
+from sqlitedict import SqliteDict
+
+from pylate import indexes, models, retrieve
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+
+
+class IndexType(Enum):
+    """Supported index types."""
+
+    PREBUILT = "prebuilt"
+    LOCAL = "local"
+
+
+@dataclass
+class IndexConfig:
+    """Configuration for a search index."""
+
+    name: str
+    type: IndexType
+    path: str
+    description: Optional[str] = None
+
+
+class MCPyLate:
+    """Main server class that manages PyLate indexes and search operations."""
+
+    def __init__(self, override: bool = False):
+        self.logger = logging.getLogger(__name__)
+        dataset_name = "leetcode"
+        model_name = "lightonai/Reason-ModernColBERT"
+
+        # Rebuild the index when explicitly requested or when it does not exist yet.
+        override = override or not os.path.exists(
+            f"indexes/{dataset_name}_{model_name.split('/')[-1]}"
+        )
+
+        self.model = models.ColBERT(
+            model_name_or_path=model_name,
+        )
+        self.index = indexes.PLAID(
+            override=override,
+            index_name=f"{dataset_name}_{model_name.split('/')[-1]}",
+        )
+        # Maps document IDs to their full text, stored alongside the index.
+        self.id_to_doc = SqliteDict(
+            f"./indexes/{dataset_name}_{model_name.split('/')[-1]}/id_to_doc.sqlite",
+            outer_stack=False,
+        )
+
+        self.retriever = retrieve.ColBERT(index=self.index)
+        if override:
+            # Build the index from the BRIGHT LeetCode corpus.
+            tasks = mteb.get_tasks(tasks=["BrightRetrieval"])
+            tasks[0].load_data()
+            corpus = tasks[0].corpus[dataset_name]["standard"]
+            for doc_id, doc in corpus.items():
+                self.id_to_doc[doc_id] = doc
+            self.id_to_doc.commit()  # Don't forget to commit to save changes!
+            documents_embeddings = self.model.encode(
+                sentences=list(corpus.values()),
+                batch_size=100,
+                is_query=False,
+                show_progress_bar=True,
+            )
+            self.index.add_documents(
+                documents_ids=list(corpus.keys()),
+                documents_embeddings=documents_embeddings,
+            )
+        self.logger.info("Created PyLate MCP Server")
+
+    def get_document(self, docid: str) -> Optional[Dict[str, Any]]:
+        """Retrieve the full document for a given document ID."""
+        return {"docid": docid, "text": self.id_to_doc[docid]}
+
+    def search(self, query: str, k: int = 10) -> List[Dict[str, Any]]:
+        """Perform a multi-vector search on the default index and return the top-k hits."""
+        try:
+            query_embeddings = self.model.encode(
+                sentences=[query],
+                is_query=True,
+                show_progress_bar=True,
+                batch_size=32,
+            )
+            scores = self.retriever.retrieve(queries_embeddings=query_embeddings, k=k)
+            results = []
+            for score in scores[0]:
+                results.append(
+                    {
+                        "docid": score["id"],
+                        "score": round(score["score"], 5),
+                        "text": self.id_to_doc[score["id"]],
+                        # "text": self.id_to_doc[score["id"]][:200] + "…"
+                        # if len(self.id_to_doc[score["id"]]) > 200
+                        # else self.id_to_doc[score["id"]],
+                    }
+                )
+            return results
+        except Exception as e:
+            self.logger.error(f"Search failed: {e}")
+            raise RuntimeError(f"Search operation failed: {e}") from e
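
MCPyLate can also be used directly from Python, without the Gradio/MCP layer. A small sketch under the assumption that the prebuilt index already sits under indexes/ (for example after the snapshot_download call in app.py), so the constructor does not rebuild it from the BRIGHT corpus; the query string is illustrative only.

from core import MCPyLate

server = MCPyLate()  # loads the existing PLAID index and the id_to_doc store

# Multi-vector search over the LeetCode index; each hit has docid, score, and text.
hits = server.search("merge two sorted linked lists", k=3)
for hit in hits:
    print(hit["docid"], hit["score"])

# Fetch the full document text for the top hit by its document ID.
top = server.get_document(hits[0]["docid"])
print(top["text"][:300])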
requirements.txt ADDED
@@ -0,0 +1,3 @@
+gradio
+git+https://github.com/lightonai/pylate.git@MCPyLate
+mteb