Commit 56f7920 · created doc-mcp
Parent(s): 3028c31

Files changed:
- .env.example +3 -0
- .gitignore +3 -0
- .python-version +1 -0
- README.md +95 -1
- main.py +1152 -0
- rag/__init__.py +0 -0
- rag/config.py +211 -0
- rag/github_file_loader.py +521 -0
- rag/ingest.py +66 -0
- rag/query.py +94 -0
- requirements.txt +0 -0
.env.example
ADDED
@@ -0,0 +1,3 @@
+GITHUB_API_KEY=
+NEBIUS_API_KEY=
+MONGODB_URI=
.gitignore
ADDED
@@ -0,0 +1,3 @@
+.env
+.venv
+__pycache__
.python-version
ADDED
@@ -0,0 +1 @@
+3.13
README.md
CHANGED
@@ -5,10 +5,104 @@ colorFrom: yellow
 colorTo: pink
 sdk: gradio
 sdk_version: 5.33.0
+python_version: 3.13
 app_file: app.py
 pinned: false
 license: mit
 short_description: 'RAG on documentations for your agent '
 ---
 
-
+# Doc-MCP 📚
+
+> Transform GitHub documentation repositories into accessible MCP (Model Context Protocol) servers for AI agents
+
+**Hackathon Track**: `mcp-server-track`
+
+## 🎯 What is Doc-MCP?
+
+Doc-MCP ingests markdown documentation from GitHub repositories and creates MCP servers that provide easy access to documentation context for AI agents. Just point it at any GitHub repo with markdown docs, and get an intelligent Q&A interface powered by vector search.
+
+## ✨ Key Features
+
+- **GitHub Integration**: Fetch markdown files directly from any GitHub repository
+- **Vector Search**: Uses MongoDB Atlas with Nebius AI embeddings for semantic search
+- **MCP Server**: Exposes documentation as MCP endpoints for AI agents
+- **Smart Q&A**: Ask questions about documentation with source citations
+- **Repository Management**: Track multiple repositories and their statistics
+
+## 🚀 Quick Start
+
+1. **Setup Environment**:
+```bash
+# Clone and install
+git clone https://github.com/yourusername/doc-mcp.git
+cd doc-mcp
+uv sync
+
+# Configure environment
+cp .env.example .env
+# Add your NEBIUS_API_KEY and MONGODB_URI
+```
+
+2. **Run the App**:
+```bash
+python main.py
+# Open http://localhost:7860
+```
+
+3. **Ingest Documentation**:
+   - Enter a GitHub repo URL (e.g., `gradio-app/gradio`)
+   - Select markdown files to process
+   - Load files and generate vector embeddings
+
+4. **Query Documentation**:
+   - Select your repository
+   - Ask questions about the documentation
+   - Get answers with source citations
+
+
+## Workflow
+```mermaid
+flowchart TD
+    subgraph Ingestion["Ingestion"]
+        B["Discover Markdown Files"]
+        A["GitHub Repo URL"]
+        C["User File Selection"]
+        D["Chunk & Embed Documents"]
+        E["Store in MongoDB"]
+    end
+    subgraph Query["Query"]
+        G["Select Repository"]
+        F["User Question"]
+        H["Vector Search"]
+        I["Retrieve Context"]
+        J["Generate Response"]
+        K["Display with Sources"]
+    end
+    A --> B
+    B --> C
+    C --> D
+    D --> E
+    F --> G
+    G --> H
+    H --> I
+    I --> J
+    J --> K
+    E --> H
+```
+
+## 🛠️ Technology Stack
+
+- **Frontend**: Gradio
+- **Vector Store**: MongoDB Atlas with vector search
+- **Embeddings**: Nebius AI (BAAI/bge-en-icl)
+- **LLM**: Nebius LLM (Llama-3.3-70B-Instruct)
+- **Document Processing**: LlamaIndex
+
+## 📹 Demo Video
+
+[Link to demo video showing MCP server in action with Claude Desktop/Cursor]
+
+---
+
+**Transform your documentation into intelligent, accessible knowledge for AI agents!** 🚀
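Since the query endpoint is exposed through the Gradio API (see `api_name="Query docs"` in main.py below), an agent or script can call it remotely. The following is a minimal sketch using `gradio_client`; the endpoint slug, argument order, and repository name are assumptions and should be checked against the running app's API page.

```python
# Hypothetical sketch: calling the Doc-MCP query endpoint via gradio_client.
# The api_name slug is an assumption derived from api_name="Query docs" in main.py.
from gradio_client import Client

client = Client("http://localhost:7860")  # local Doc-MCP instance

# Arguments mirror make_query(repo, mode, query) in main.py.
response_text, source_nodes = client.predict(
    "gradio-app/gradio",                       # a previously ingested repository
    "default",                                 # query mode: default | text_search | hybrid
    "How do I create a custom Gradio component?",
    api_name="/Query_docs",                    # assumed slug; verify on the app's API docs page
)
print(response_text)
```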
main.py
ADDED
@@ -0,0 +1,1152 @@
import asyncio
import os
import time
from typing import Dict, List

import gradio as gr
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.core.text_splitter import SentenceSplitter

from rag.config import embed_model, get_available_repos, llm
from rag.github_file_loader import \
    fetch_markdown_files as fetch_files_with_loader, load_github_files
from rag.ingest import ingest_documents_async

load_dotenv()

Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = SentenceSplitter(chunk_size=3072)


def get_available_repositories():
    return get_available_repos()


def start_file_loading(
    repo_url: str, selected_files: List[str], current_progress: Dict
):
    """Step 1: Load files from GitHub"""
    print("\n🔄 STARTING FILE LOADING STEP")
    print(f"📍 Repository: {repo_url}")
    print(f"📋 Selected files: {selected_files}")

    if not selected_files:
        return {
            "status": "error",
            "message": "❌ No files selected for loading",
            "progress": 0,
            "details": "",
            "step": "file_loading",
        }

    total_files = len(selected_files)
    start_time = time.time()

    # Parse repo name from URL
    if "github.com" in repo_url:
        repo_name = (
            repo_url.replace("https://github.com/", "")
            .replace("http://github.com/", "")
            .strip("/")
        )
        if "/" not in repo_name:
            return {
                "status": "error",
                "message": "❌ Invalid repository URL format",
                "progress": 0,
                "details": "",
                "step": "file_loading",
            }
    else:
        repo_name = repo_url.strip()

    try:
        batch_size = 25
        all_documents = []
        all_failed = []

        current_progress.update(
            {
                "status": "loading",
                "message": f"🚀 Loading files from {repo_name}",
                "progress": 0,
                "total_files": total_files,
                "processed_files": 0,
                "phase": "File Loading",
                "details": f"Processing {total_files} files in batches...",
                "step": "file_loading",
            }
        )

        for i in range(0, len(selected_files), batch_size):
            batch = selected_files[i : i + batch_size]

            print(f"\n📦 PROCESSING BATCH {i // batch_size + 1}")
            print(f"   Files: {batch}")

            # Update progress for current batch
            progress_percentage = (i / total_files) * 100
            current_progress.update(
                {
                    "progress": progress_percentage,
                    "processed_files": i,
                    "current_batch": i // batch_size + 1,
                    "details": f"Loading batch {i // batch_size + 1}: {', '.join([f.split('/')[-1] for f in batch])}",
                }
            )

            try:
                documents, failed = load_github_files(
                    repo_name=repo_name,
                    file_paths=batch,
                    branch="main",
                    concurrent_requests=10,
                    github_token=os.getenv("GITHUB_API_KEY"),
                )

                print("✅ Load results:")
                print(f"   - Documents: {len(documents)}")
                print(f"   - Failed: {len(failed)}")

                if documents:
                    for j, doc in enumerate(documents):
                        print(f"   📄 Doc {j + 1}: {doc.doc_id}")
                        print(f"      Size: {len(doc.text)} chars")

                        # Ensure repo metadata is set
                        if "repo" not in doc.metadata:
                            doc.metadata["repo"] = repo_name
                            print(f"      ✅ Added repo metadata: {repo_name}")

                all_documents.extend(documents)
                all_failed.extend(failed)

            except Exception as batch_error:
                print(f"❌ Batch processing error: {batch_error}")
                all_failed.extend(batch)

        loading_time = time.time() - start_time

        # Store loaded documents in progress state for next step
        current_progress.update(
            {
                "status": "loaded",
                "message": f"✅ File Loading Complete! Loaded {len(all_documents)} documents",
                "progress": 100,
                "phase": "Files Loaded",
                "details": f"Successfully loaded {len(all_documents)} documents in {loading_time:.1f}s",
                "step": "file_loading_complete",
                "loaded_documents": all_documents,  # Store documents for next step
                "failed_files": all_failed,
                "loading_time": loading_time,
                "repo_name": repo_name,
            }
        )

        return current_progress

    except Exception as e:
        total_time = time.time() - start_time
        error_msg = f"❌ File loading error after {total_time:.1f}s: {str(e)}"
        print(error_msg)

        current_progress.update(
            {
                "status": "error",
                "message": error_msg,
                "progress": 0,
                "phase": "Failed",
                "details": str(e),
                "error": str(e),
                "step": "file_loading",
            }
        )

        return current_progress


def start_vector_ingestion(current_progress: Dict):
    """Step 2: Ingest loaded documents into vector store"""
    print("\n🔄 STARTING VECTOR INGESTION STEP")

    # Check if we have loaded documents from previous step
    if current_progress.get("step") != "file_loading_complete":
        return {
            "status": "error",
            "message": "❌ No loaded documents found. Please load files first.",
            "progress": 0,
            "details": "",
            "step": "vector_ingestion",
        }

    all_documents = current_progress.get("loaded_documents", [])
    repo_name = current_progress.get("repo_name", "")

    if not all_documents:
        return {
            "status": "error",
            "message": "❌ No documents available for vector ingestion",
            "progress": 0,
            "details": "",
            "step": "vector_ingestion",
        }

    vector_start_time = time.time()

    # Update state for vector store phase
    current_progress.update(
        {
            "status": "vectorizing",
            "message": "🔄 Generating embeddings and storing in vector database",
            "progress": 0,
            "phase": "Vector Store Ingestion",
            "details": f"Processing {len(all_documents)} documents for embedding...",
            "step": "vector_ingestion",
        }
    )

    try:
        print("🔄 STARTING VECTOR STORE INGESTION")
        print(f"   Repository: {repo_name}")
        print(f"   Documents to process: {len(all_documents)}")

        # Call the async ingestion function with repo name
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(ingest_documents_async(all_documents, repo_name))
        finally:
            loop.close()

        vector_time = time.time() - vector_start_time
        loading_time = current_progress.get("loading_time", 0)
        total_time = loading_time + vector_time

        print(f"✅ Vector ingestion completed in {vector_time:.2f} seconds")

        failed_files_data = current_progress.get("failed_files", [])
        if isinstance(failed_files_data, list):
            failed_files_count = len(failed_files_data)
        else:
            failed_files_count = failed_files_data if isinstance(failed_files_data, int) else 0

        # Update final success state with repository update flag
        current_progress.update(
            {
                "status": "complete",
                "message": "✅ Complete Ingestion Pipeline Finished!",
                "progress": 100,
                "phase": "Complete",
                "details": f"Successfully processed {len(all_documents)} documents for {repo_name}",
                "step": "complete",
                "total_time": total_time,
                "documents_processed": len(all_documents),
                "failed_files_count": failed_files_count,  # Use count instead of trying len()
                "failed_files": failed_files_data,  # Keep original data
                "vector_time": vector_time,
                "loading_time": loading_time,
                "repo_name": repo_name,
                "repository_updated": True,  # Flag to trigger repo list refresh
            }
        )

        return current_progress

    except Exception as ingest_error:
        vector_time = time.time() - vector_start_time
        print(f"❌ Vector ingestion failed after {vector_time:.2f} seconds")
        print(f"❌ Error: {ingest_error}")

        # Get failed files data safely
        failed_files_data = current_progress.get("failed_files", [])
        if isinstance(failed_files_data, list):
            failed_files_count = len(failed_files_data)
        else:
            failed_files_count = failed_files_data if isinstance(failed_files_data, int) else 0

        current_progress.update(
            {
                "status": "error",
                "message": "❌ Vector Store Ingestion Failed",
                "progress": 0,
                "phase": "Failed",
                "details": f"Error: {str(ingest_error)}",
                "error": str(ingest_error),
                "step": "vector_ingestion",
                "failed_files_count": failed_files_count,
                "failed_files": failed_files_data,
            }
        )

        return current_progress


def start_file_loading_generator(
    repo_url: str, selected_files: List[str], current_progress: Dict
):
    """Step 1: Load files from GitHub with yield-based real-time updates"""

    print("\n🔄 STARTING FILE LOADING STEP")
    print(f"📍 Repository: {repo_url}")
    print(f"📋 Selected files: {len(selected_files)} files")

    if not selected_files:
        error_progress = {
            "status": "error",
            "message": "❌ No files selected for loading",
            "progress": 0,
            "details": "Please select at least one file to proceed.",
            "step": "file_loading",
        }
        yield error_progress
        return error_progress

    total_files = len(selected_files)
    start_time = time.time()

    # Parse repo name from URL
    if "github.com" in repo_url:
        repo_name = (
            repo_url.replace("https://github.com/", "")
            .replace("http://github.com/", "")
            .strip("/")
        )
        if "/" not in repo_name:
            error_progress = {
                "status": "error",
                "message": "❌ Invalid repository URL format",
                "progress": 0,
                "details": "Expected format: owner/repo or https://github.com/owner/repo",
                "step": "file_loading",
            }
            yield error_progress
            return error_progress
    else:
        repo_name = repo_url.strip()

    try:
        batch_size = 10
        all_documents = []
        all_failed = []

        # Initial progress update
        initial_progress = {
            "status": "loading",
            "message": f"🚀 Starting file loading from {repo_name}",
            "progress": 0,
            "total_files": total_files,
            "processed_files": 0,
            "successful_files": 0,
            "failed_files": 0,
            "phase": "File Loading",
            "details": f"Preparing to load {total_files} files in batches of {batch_size}...",
            "step": "file_loading",
            "current_batch": 0,
            "total_batches": (len(selected_files) + batch_size - 1) // batch_size,
            "repo_name": repo_name,
        }
        yield initial_progress

        time.sleep(0.5)

        for i in range(0, len(selected_files), batch_size):
            batch = selected_files[i : i + batch_size]
            current_batch_num = i // batch_size + 1
            total_batches = (len(selected_files) + batch_size - 1) // batch_size

            # Update progress at batch start
            batch_start_progress = {
                "status": "loading",
                "message": f"🔄 Loading batch {current_batch_num}/{total_batches}",
                "progress": (i / total_files) * 90,
                "processed_files": i,
                "successful_files": len(all_documents),
                "failed_files": len(all_failed),
                "current_batch": current_batch_num,
                "total_batches": total_batches,
                "phase": "File Loading",
                "details": f"Processing batch {current_batch_num}: {', '.join([f.split('/')[-1] for f in batch[:3]])}{'...' if len(batch) > 3 else ''}",
                "step": "file_loading",
                "repo_name": repo_name,
            }
            yield batch_start_progress

            try:
                print(f"\n📦 PROCESSING BATCH {current_batch_num}/{total_batches}")
                print(f"   Files: {[f.split('/')[-1] for f in batch]}")

                documents, failed = load_github_files(
                    repo_name=repo_name,
                    file_paths=batch,
                    branch="main",
                    concurrent_requests=10,
                    github_token=os.getenv("GITHUB_API_KEY"),
                )

                print("✅ Load results:")
                print(f"   - Documents: {len(documents)}")
                print(f"   - Failed: {len(failed)}")

                # Process documents
                for j, doc in enumerate(documents):
                    print(f"   📄 Doc {j + 1}: {doc.doc_id}")
                    print(f"      Size: {len(doc.text)} chars")

                    if "repo" not in doc.metadata:
                        doc.metadata["repo"] = repo_name
                        print(f"      ✅ Added repo metadata: {repo_name}")

                all_documents.extend(documents)
                all_failed.extend(failed)

                # Update progress after batch completion
                batch_complete_progress = {
                    "status": "loading",
                    "message": f"✅ Completed batch {current_batch_num}/{total_batches}",
                    "progress": ((i + len(batch)) / total_files) * 90,
                    "processed_files": i + len(batch),
                    "successful_files": len(all_documents),
                    "failed_files": len(all_failed),
                    "current_batch": current_batch_num,
                    "total_batches": total_batches,
                    "phase": "File Loading",
                    "details": f"✅ Batch {current_batch_num} complete: {len(documents)} loaded, {len(failed)} failed. Total progress: {len(all_documents)} documents loaded.",
                    "step": "file_loading",
                    "repo_name": repo_name,
                }
                yield batch_complete_progress

                time.sleep(0.3)

            except Exception as batch_error:
                print(f"❌ Batch processing error: {batch_error}")
                all_failed.extend(batch)

                error_progress = {
                    "status": "loading",
                    "message": f"⚠️ Error in batch {current_batch_num}",
                    "progress": ((i + len(batch)) / total_files) * 90,
                    "processed_files": i + len(batch),
                    "successful_files": len(all_documents),
                    "failed_files": len(all_failed),
                    "current_batch": current_batch_num,
                    "phase": "File Loading",
                    "details": f"❌ Batch {current_batch_num} error: {str(batch_error)[:100]}... Continuing with next batch.",
                    "step": "file_loading",
                    "repo_name": repo_name,
                }
                yield error_progress

        loading_time = time.time() - start_time

        # Final completion update
        completion_progress = {
            "status": "loaded",
            "message": f"✅ File Loading Complete! Loaded {len(all_documents)} documents",
            "progress": 100,
            "phase": "Files Loaded Successfully",
            "details": f"🎯 Final Results:\n✅ Successfully loaded: {len(all_documents)} documents\n❌ Failed files: {len(all_failed)}\n⏱️ Total time: {loading_time:.1f}s\n📊 Success rate: {(len(all_documents)/(len(all_documents)+len(all_failed))*100):.1f}%",
            "step": "file_loading_complete",
            "loaded_documents": all_documents,
            "failed_files": all_failed,
            "loading_time": loading_time,
            "repo_name": repo_name,
            "total_files": total_files,
            "processed_files": total_files,
            "successful_files": len(all_documents),
        }
        yield completion_progress
        return completion_progress

    except Exception as e:
        total_time = time.time() - start_time
        error_msg = f"❌ File loading error after {total_time:.1f}s: {str(e)}"
        print(error_msg)

        error_progress = {
            "status": "error",
            "message": error_msg,
            "progress": 0,
            "phase": "Loading Failed",
            "details": f"Critical error during file loading:\n{str(e)}",
            "error": str(e),
            "step": "file_loading",
        }
        yield error_progress
        return error_progress


# Progress display component
def format_progress_display(progress_state: Dict) -> str:
    """Format progress state into readable display with enhanced details"""
    if not progress_state:
        return "🚀 Ready to start ingestion...\n\n📋 **Two-Step Process:**\n1️⃣ Load files from GitHub repository\n2️⃣ Generate embeddings and store in vector database"

    status = progress_state.get("status", "unknown")
    message = progress_state.get("message", "")
    progress = progress_state.get("progress", 0)
    phase = progress_state.get("phase", "")
    details = progress_state.get("details", "")

    # Enhanced progress bar
    filled = int(progress / 2.5)  # 40 chars total
    progress_bar = "█" * filled + "░" * (40 - filled)

    # Status emoji mapping
    status_emoji = {
        "loading": "⏳",
        "loaded": "✅",
        "vectorizing": "🧠",
        "complete": "🎉",
        "error": "❌",
    }

    emoji = status_emoji.get(status, "🔄")

    output = f"{emoji} **{message}**\n\n"

    # Phase and progress section
    output += f"📊 **Current Phase:** {phase}\n"
    output += f"📈 **Progress:** {progress:.1f}%\n"
    output += f"[{progress_bar}] {progress:.1f}%\n\n"

    # Step-specific details for file loading
    if progress_state.get("step") == "file_loading":
        processed = progress_state.get("processed_files", 0)
        total = progress_state.get("total_files", 0)
        successful = progress_state.get("successful_files", 0)
        failed = progress_state.get("failed_files", 0)

        if total > 0:
            output += "📁 **File Processing Status:**\n"
            output += f"   • Total files: {total}\n"
            output += f"   • Processed: {processed}/{total}\n"
            output += f"   • ✅ Successful: {successful}\n"
            output += f"   • ❌ Failed: {failed}\n"

            if "current_batch" in progress_state and "total_batches" in progress_state:
                output += f"   • 📦 Current batch: {progress_state['current_batch']}/{progress_state['total_batches']}\n"
            output += "\n"

    # Step-specific details for vector ingestion
    elif progress_state.get("step") == "vector_ingestion":
        docs_count = progress_state.get("documents_count", 0)
        repo_name = progress_state.get("repo_name", "Unknown")

        if docs_count > 0:
            output += "🧠 **Vector Processing Status:**\n"
            output += f"   • Repository: {repo_name}\n"
            output += f"   • Documents: {docs_count:,}\n"
            output += f"   • Stage: {phase}\n\n"

    # Detailed information
    output += f"📝 **Details:**\n{details}\n"

    # Final summary for completion
    if status == "complete":
        total_time = progress_state.get("total_time", 0)
        docs_processed = progress_state.get("documents_processed", 0)
        failed_files = progress_state.get("failed_files", 0)
        vector_time = progress_state.get("vector_time", 0)
        loading_time = progress_state.get("loading_time", 0)
        repo_name = progress_state.get("repo_name", "Unknown")

        output += "\n🎊 **INGESTION COMPLETED SUCCESSFULLY!**\n"
        output += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        output += f"🎯 **Repository:** {repo_name}\n"
        output += f"📄 **Documents processed:** {docs_processed:,}\n"
        output += f"❌ **Failed files:** {len(failed_files) if isinstance(failed_files, list) else failed_files}\n"
        output += f"⏱️ **Total time:** {total_time:.1f} seconds\n"
        output += f"   ├─ File loading: {loading_time:.1f}s\n"
        output += f"   └─ Vector processing: {vector_time:.1f}s\n"
        output += f"📊 **Processing rate:** {docs_processed/total_time:.1f} docs/second\n\n"
        output += "🚀 **Next Step:** Go to the 'Query Interface' tab to start asking questions!"

    elif status == "error":
        error = progress_state.get("error", "Unknown error")
        output += "\n💥 **ERROR OCCURRED**\n"
        output += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        output += f"❌ **Error Details:** {error[:300]}{'...' if len(error) > 300 else ''}\n"
        output += "\n🔧 **Troubleshooting Tips:**\n"
        output += "   • Check your GitHub token permissions\n"
        output += "   • Verify repository URL format\n"
        output += "   • Ensure selected files exist\n"
        output += "   • Check network connectivity\n"

    return output


# Create the main Gradio interface
with gr.Blocks(title="Doc-MCP") as demo:
    gr.Markdown("# 📚 Doc-MCP: Documentation RAG System")
    gr.Markdown(
        "Transform GitHub documentation repositories into accessible MCP (Model Context Protocol) servers for AI agents. Upload documentation, generate vector embeddings, and query with intelligent context retrieval."
    )

    # State variables
    files_state = gr.State([])
    progress_state = gr.State({})

    with gr.Tabs():
        with gr.TabItem("📥 Documentation Ingestion"):
            gr.Markdown("### 🚀 Two-Step Documentation Processing Pipeline")
            gr.Markdown(
                "**Step 1:** Fetch markdown files from GitHub repository → **Step 2:** Generate vector embeddings and store in MongoDB Atlas"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    repo_input = gr.Textbox(
                        label="📂 GitHub Repository URL",
                        placeholder="Enter: owner/repo or https://github.com/owner/repo (e.g., gradio-app/gradio)",
                        value="",
                        info="Enter any GitHub repository containing markdown documentation",
                    )
                    load_btn = gr.Button("🔍 Discover Documentation Files", variant="secondary")

                with gr.Column(scale=1):
                    status_output = gr.Textbox(
                        label="Repository Discovery Status", interactive=False, lines=4,
                        placeholder="Repository scanning results will appear here...",
                    )
            with gr.Row():
                select_all_btn = gr.Button("📋 Select All Documents", variant="secondary")
                clear_all_btn = gr.Button("🗑️ Clear Selection", variant="secondary")

            # File selection
            with gr.Accordion(label="Available Documentation Files"):
                file_selector = gr.CheckboxGroup(
                    choices=[], label="Select Markdown Files for RAG Processing", visible=False
                )

            # Two-step ingestion controls
            gr.Markdown("### 🔄 RAG Pipeline Execution")
            gr.Markdown("Process your documentation through our advanced RAG pipeline using Nebius AI embeddings and MongoDB Atlas vector storage.")

            with gr.Row():
                with gr.Column():
                    step1_btn = gr.Button(
                        "📥 Step 1: Load Files from GitHub",
                        variant="primary",
                        size="lg",
                        interactive=False,
                    )

                with gr.Column():
                    step2_btn = gr.Button(
                        "🔄 Step 2: Start Ingestion",
                        variant="primary",
                        size="lg",
                        interactive=False,
                    )

            with gr.Row():
                refresh_btn = gr.Button("🔄 Refresh Progress", variant="secondary")
                reset_btn = gr.Button("🗑️ Reset Progress", variant="secondary")

            # Progress display
            progress_display = gr.Textbox(
                label="📊 Real-time Ingestion Progress",
                interactive=False,
                lines=25,
                value="🚀 Ready to start two-step ingestion process...\n\n📋 Steps:\n1️⃣ Load files from GitHub repository\n2️⃣ Generate embeddings and store in vector database",
                max_lines=30,
                show_copy_button=True,
            )

            # Event handlers
            def load_files_handler(repo_url: str):
                if not repo_url.strip():
                    return (
                        gr.CheckboxGroup(choices=[], visible=False),
                        "Please enter a repository URL",
                        [],
                        gr.Button(interactive=False),
                        gr.Button(interactive=False),
                    )

                files, message = fetch_files_with_loader(repo_url)

                if files:
                    return (
                        gr.CheckboxGroup(
                            choices=files,
                            value=[],
                            label=f"Select Files from {repo_url} ({len(files)} files)",
                            visible=True,
                        ),
                        message,
                        files,
                        gr.Button(interactive=True),  # Enable step 1 button
                        gr.Button(interactive=False),  # Keep step 2 disabled
                    )
                else:
                    return (
                        gr.CheckboxGroup(choices=[], visible=False),
                        message,
                        [],
                        gr.Button(interactive=False),
                        gr.Button(interactive=False),
                    )

            def start_step1_generator(repo_url: str, selected_files: List[str], current_progress: Dict):
                """Start Step 1 with generator-based real-time progress updates"""
                for progress_update in start_file_loading_generator(repo_url, selected_files, current_progress.copy()):
                    progress_text = format_progress_display(progress_update)
                    step2_enabled = progress_update.get("step") == "file_loading_complete"

                    yield (
                        progress_update,
                        progress_text,
                        gr.Button(interactive=step2_enabled),
                    )

            def start_step2(current_progress: Dict):
                """Start Step 2: Vector Ingestion"""
                new_progress = start_vector_ingestion(current_progress.copy())
                progress_text = format_progress_display(new_progress)
                return new_progress, progress_text

            def refresh_progress(current_progress: Dict):
                """Refresh the progress display"""
                progress_text = format_progress_display(current_progress)
                return progress_text

            def reset_progress():
                """Reset all progress"""
                return (
                    {},
                    "Ready to start two-step ingestion process...",
                    gr.Button(interactive=False),
                )

            def select_all_handler(available_files):
                if available_files:
                    return gr.CheckboxGroup(value=available_files)
                return gr.CheckboxGroup(value=[])

            def clear_all_handler():
                return gr.CheckboxGroup(value=[])

            # Wire up events
            load_btn.click(
                fn=load_files_handler,
                inputs=[repo_input],
                outputs=[
                    file_selector,
                    status_output,
                    files_state,
                    step1_btn,
                    step2_btn,
                ],
                show_api=False,
            )

            select_all_btn.click(
                fn=select_all_handler,
                inputs=[files_state],
                outputs=[file_selector],
                show_api=False,
            )

            clear_all_btn.click(
                fn=clear_all_handler, outputs=[file_selector], show_api=False
            )

            step1_btn.click(
                fn=start_step1_generator,
                inputs=[repo_input, file_selector, progress_state],
                outputs=[progress_state, progress_display, step2_btn],
                show_api=False,
            )

            step2_btn.click(
                fn=start_step2,
                inputs=[progress_state],
                outputs=[progress_state, progress_display],
                show_api=False,
            )

            refresh_btn.click(
                fn=refresh_progress,
                inputs=[progress_state],
                outputs=[progress_display],
                show_api=False,
            )

            reset_btn.click(
                fn=reset_progress,
                outputs=[progress_state, progress_display, step2_btn],
                show_api=False,
            )

        # ================================
        # Tab 2: Query Interface
        # ================================
        with gr.TabItem("🤖 AI Documentation Assistant"):
            gr.Markdown("### 💬 Intelligent Documentation Q&A")
            gr.Markdown(
                "Query your processed documentation using advanced semantic search. Get contextual answers with source citations powered by Nebius LLM and vector similarity search."
            )

            with gr.Row():
                with gr.Column(scale=2):
                    # Repository selection
                    repo_dropdown = gr.Dropdown(
                        choices=get_available_repositories(),
                        label="Select Documentation Repository",
                        value=None,
                        interactive=True,
                        allow_custom_value=False,
                    )
                    refresh_repos_btn = gr.Button(
                        "🔄 Refresh Repositories", variant="secondary", size="sm"
                    )

                    # Query mode selection
                    query_mode = gr.Radio(
                        choices=["default", "text_search", "hybrid"],
                        label="Query Mode",
                        value="default",
                        info="default: semantic similarity, text_search: keyword-based, hybrid: combines both",
                    )

                    # Query input
                    query_input = gr.Textbox(
                        label="Your Query",
                        placeholder="Ask about the documentation...",
                        lines=3,
                    )

                    query_btn = gr.Button("🔍 Search", variant="primary", size="lg")

                    # Response display as text area
                    response_output = gr.Textbox(
                        label="Response",
                        value="Your query response will appear here...",
                        lines=10,
                        interactive=False,
                    )

                with gr.Column(scale=2):
                    gr.Markdown("### Source Nodes (JSON)")

                    # Source nodes display as JSON
                    sources_output = gr.JSON(
                        label="Source Nodes",
                        value={
                            "message": "Source nodes will appear here after querying..."
                        },
                    )

            def get_available_docs_repo():
                """
                List the available documentation repositories.

                Returns:
                    List of repo names
                """
                try:
                    repos = get_available_repositories()
                    return gr.Dropdown(choices=repos, value=repos[0] if repos else None)
                except Exception as e:
                    print(f"Error refreshing repository list: {e}")
                    return gr.Dropdown(choices=[], value=None)

            # Simple query handler
            def handle_query(repo: str, mode: str, query: str):
                """
                Handle a query request and return raw data from the retriever.

                Args:
                    repo: Selected repository
                    mode: Query mode (default, text_search, hybrid)
                    query: User's query
                Returns:
                    Raw result dict from QueryRetriever.make_query()
                """
                if not query.strip():
                    return {"error": "Please enter a query."}

                if not repo:
                    return {"error": "Please select a repository."}

                try:
                    # Import QueryRetriever here to avoid circular imports
                    from rag.query import QueryRetriever

                    # Create query retriever for the selected repo
                    retriever = QueryRetriever(repo)

                    # Make the query and return raw result
                    result = retriever.make_query(query, mode)
                    return result

                except Exception as e:
                    print(f"Query error: {e}")
                    import traceback

                    traceback.print_exc()
                    return {"error": f"Query failed: {str(e)}"}

            def make_query(repo: str, mode: str, query: str):
                """
                Retrieve relevant documentation context for a given query using the specified retrieval mode.

                This function is designed to support Retrieval-Augmented Generation (RAG) by extracting
                the most relevant context chunks from indexed documentation sources.

                Args:
                    repo: Selected repository
                    mode: Query mode
                    query: User's query
                Returns:
                    Tuple of (response_text, source_nodes_json)
                """
                # Get raw result
                result = handle_query(repo, mode, query)

                # Extract response text
                if "error" in result:
                    response_text = f"Error: {result['error']}"
                    source_nodes = {"error": result["error"]}
                else:
                    response_text = result.get("response", "No response available")
                    source_nodes = result.get("source_nodes", [])

                return response_text, source_nodes

            refresh_repos_btn.click(
                fn=get_available_docs_repo,
                outputs=[repo_dropdown],
                api_name="List available docs",
            )

            # Simple event wiring - single button click
            query_btn.click(
                fn=make_query,
                inputs=[repo_dropdown, query_mode, query_input],
                outputs=[response_output, sources_output],
                api_name="Query docs",
            )

            # Also allow Enter key to trigger query
            query_input.submit(
                fn=make_query,
                inputs=[repo_dropdown, query_mode, query_input],
                outputs=[response_output, sources_output],
                show_api=False,
            )

        # ================================
        # Tab 3: Repository Management
        # ================================
        with gr.TabItem("🗂️ Repository Management"):
            gr.Markdown("Manage your ingested repositories - view details and delete repositories when needed.")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📊 Repository Statistics")
                    stats_display = gr.JSON(
                        label="Database Statistics",
                        value={"message": "Click refresh to load statistics..."},
                    )
                    refresh_stats_btn = gr.Button("🔄 Refresh Statistics", variant="secondary")

                with gr.Column(scale=2):
                    gr.Markdown("### 📋 Repository Details")
                    repos_table = gr.Dataframe(
                        headers=["Repository", "Files", "Last Updated"],
                        datatype=["str", "number", "str"],
                        label="Ingested Repositories",
                        interactive=False,
                        wrap=True,
                    )
                    refresh_repos_btn = gr.Button("🔄 Refresh Repository List", variant="secondary")

            gr.Markdown("### 🗑️ Delete Repository")
            gr.Markdown("**⚠️ Warning:** This will permanently delete all documents and metadata for the selected repository.")

            with gr.Row():
                with gr.Column(scale=2):
                    delete_repo_dropdown = gr.Dropdown(
                        choices=[],
                        label="Select Repository to Delete",
                        value=None,
                        interactive=True,
                        allow_custom_value=False,
                    )

                    # Confirmation checkbox
                    confirm_delete = gr.Checkbox(
                        label="I understand this action cannot be undone",
                        value=False,
                    )

                    delete_btn = gr.Button(
                        "🗑️ Delete Repository",
                        variant="stop",
                        size="lg",
                        interactive=False,
                    )

                with gr.Column(scale=1):
                    deletion_status = gr.Textbox(
                        label="Deletion Status",
                        value="Select a repository and confirm to enable deletion.",
                        interactive=False,
                        lines=6,
                    )

            # Management functions
            def load_repository_stats():
                """Load overall repository statistics"""
                try:
                    from rag.config import get_repository_stats
                    stats = get_repository_stats()
                    return stats
                except Exception as e:
                    return {"error": f"Failed to load statistics: {str(e)}"}

            def load_repository_details():
                """Load detailed repository information as a table"""
                try:
                    from rag.config import get_repo_details
                    details = get_repo_details()

                    if not details:
                        return [["No repositories found", 0, "N/A"]]

                    # Format for dataframe
                    table_data = []
                    for repo in details:
                        last_updated = repo.get("last_updated", "Unknown")
                        if hasattr(last_updated, 'strftime'):
                            last_updated = last_updated.strftime("%Y-%m-%d %H:%M")
                        elif last_updated != "Unknown":
                            last_updated = str(last_updated)

                        table_data.append([
                            repo.get("repo_name", "Unknown"),
                            repo.get("file_count", 0),
                            last_updated,
                        ])

                    return table_data

                except Exception as e:
                    return [["Error loading repositories", 0, str(e)]]

            def update_delete_dropdown():
                """Update the dropdown with available repositories"""
                try:
                    repos = get_available_repositories()
                    return gr.Dropdown(choices=repos, value=None)
                except Exception as e:
                    print(f"Error updating delete dropdown: {e}")
                    return gr.Dropdown(choices=[], value=None)

            def check_delete_button_state(repo_selected, confirmation_checked):
                """Enable/disable delete button based on selection and confirmation"""
                if repo_selected and confirmation_checked:
                    return gr.Button(interactive=True)
                else:
                    return gr.Button(interactive=False)

            def delete_repository(repo_name: str, confirmed: bool):
                """Delete the selected repository"""
                if not repo_name:
                    return "❌ No repository selected.", gr.Dropdown(choices=[]), gr.Checkbox(value=False)

                if not confirmed:
                    return "❌ Please confirm deletion by checking the checkbox.", gr.Dropdown(choices=[]), gr.Checkbox(value=False)

                try:
                    from rag.config import delete_repository_data

                    # Perform deletion
                    result = delete_repository_data(repo_name)

                    # Prepare status message
                    status_msg = result["message"]
                    if result["success"]:
                        status_msg += "\n\n📊 Deletion Summary:"
                        status_msg += f"\n- Vector documents removed: {result['vector_docs_deleted']}"
                        status_msg += f"\n- Repository record deleted: {'Yes' if result['repo_record_deleted'] else 'No'}"
                        status_msg += f"\n\n✅ Repository '{repo_name}' has been completely removed."

                    # Update dropdown (remove deleted repo)
                    updated_dropdown = update_delete_dropdown()

                    # Reset confirmation checkbox
                    reset_checkbox = gr.Checkbox(value=False)

                    return status_msg, updated_dropdown, reset_checkbox

                except Exception as e:
                    error_msg = f"❌ Error deleting repository: {str(e)}"
                    return error_msg, gr.Dropdown(choices=[]), gr.Checkbox(value=False)

            # Wire up management events
            refresh_stats_btn.click(
                fn=load_repository_stats,
                outputs=[stats_display],
                show_api=False,
            )

            refresh_repos_btn.click(
                fn=load_repository_details,
                outputs=[repos_table],
                show_api=False,
            )

            # Update delete dropdown when refreshing repos
            refresh_repos_btn.click(
                fn=update_delete_dropdown,
                outputs=[delete_repo_dropdown],
                show_api=False,
            )

            # Enable/disable delete button based on selection and confirmation
            delete_repo_dropdown.change(
                fn=check_delete_button_state,
                inputs=[delete_repo_dropdown, confirm_delete],
                outputs=[delete_btn],
                show_api=False,
            )

            confirm_delete.change(
                fn=check_delete_button_state,
                inputs=[delete_repo_dropdown, confirm_delete],
                outputs=[delete_btn],
                show_api=False,
            )

            # Delete repository
            delete_btn.click(
                fn=delete_repository,
                inputs=[delete_repo_dropdown, confirm_delete],
                outputs=[deletion_status, delete_repo_dropdown, confirm_delete],
                show_api=False,
            )

    # Load data on tab load
    demo.load(
        fn=load_repository_stats,
        outputs=[stats_display],
        show_api=False,
    )

    demo.load(
        fn=load_repository_details,
        outputs=[repos_table],
        show_api=False,
    )

    demo.load(
        fn=update_delete_dropdown,
        outputs=[delete_repo_dropdown],
        show_api=False,
    )


if __name__ == "__main__":
    demo.launch(mcp_server=True)
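The two-step pipeline above can also be exercised without the Gradio UI by calling the step functions directly. A minimal sketch, assuming the environment variables from .env.example are set and that the listed file paths are placeholders for markdown files that actually exist in the target repository:

```python
# Minimal sketch of driving the two-step pipeline in main.py directly.
# Assumes .env provides GITHUB_API_KEY, NEBIUS_API_KEY, and the MongoDB URI.
from main import start_file_loading, start_vector_ingestion

progress = {}

# Step 1: fetch the selected markdown files from GitHub (paths are placeholders).
progress = start_file_loading(
    "gradio-app/gradio",
    ["README.md", "guides/quickstart.md"],
    progress,
)

# Step 2: embed and store the loaded documents once loading has completed.
if progress.get("step") == "file_loading_complete":
    progress = start_vector_ingestion(progress)
    print(progress["message"])
```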
rag/__init__.py
ADDED
File without changes
rag/config.py
ADDED
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
from llama_index.embeddings.nebius import NebiusEmbedding
|
5 |
+
from llama_index.llms.nebius import NebiusLLM
|
6 |
+
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
|
7 |
+
from pymongo import MongoClient
|
8 |
+
from pymongo.operations import SearchIndexModel
|
9 |
+
|
10 |
+
llm = NebiusLLM(
|
11 |
+
model="meta-llama/Llama-3.3-70B-Instruct-fast", api_key=os.getenv("NEBIUS_API_KEY")
|
12 |
+
)
|
13 |
+
embed_model = NebiusEmbedding(
|
14 |
+
model_name="BAAI/bge-en-icl",
|
15 |
+
api_key=os.getenv("NEBIUS_API_KEY"),
|
16 |
+
embed_batch_size=10,
|
17 |
+
)
|
18 |
+
|
19 |
+
MONGO_DB_URI = os.getenv("MONGO_DB_URI")
|
20 |
+
mongo_client = MongoClient(MONGO_DB_URI)
|
21 |
+
|
22 |
+
# Database and collection names
|
23 |
+
DB_NAME = "docmcp"
|
24 |
+
COLLECTION_NAME = "doc_rag"
|
25 |
+
REPOS_COLLECTION_NAME = "ingested_repos"
|
26 |
+
|
27 |
+
VS_INDEX_NAME = "vector_index"
|
28 |
+
FTS_INDEX_NAME = "fts_index"
|
29 |
+
vs_model = SearchIndexModel(
|
30 |
+
definition={
|
31 |
+
"fields": [
|
32 |
+
{
|
33 |
+
"type": "vector",
|
34 |
+
"path": "embedding",
|
35 |
+
"numDimensions": 4096,
|
36 |
+
"similarity": "cosine",
|
37 |
+
},
|
38 |
+
{"type": "filter", "path": "metadata.repo"},
|
39 |
+
]
|
40 |
+
},
|
41 |
+
name=VS_INDEX_NAME,
|
42 |
+
type="vectorSearch",
|
43 |
+
)
|
44 |
+
|
45 |
+
fts_model = SearchIndexModel(
|
46 |
+
definition={"mappings": {"dynamic": False, "fields": {"text": {"type": "string"}}}},
|
47 |
+
name=FTS_INDEX_NAME,
|
48 |
+
type="search",
|
49 |
+
)
|
50 |
+
|
51 |
+
|
52 |
+
def get_vector_store():
|
53 |
+
collection = mongo_client[DB_NAME][COLLECTION_NAME]
|
54 |
+
|
55 |
+
vector_store = MongoDBAtlasVectorSearch(
|
56 |
+
mongodb_client=mongo_client,
|
57 |
+
db_name=DB_NAME,
|
58 |
+
collection_name=COLLECTION_NAME,
|
59 |
+
vector_index_name=VS_INDEX_NAME,
|
60 |
+
fulltext_index_name=FTS_INDEX_NAME,
|
61 |
+
embedding_key="embedding",
|
62 |
+
text_key="text",
|
63 |
+
)
|
64 |
+
|
65 |
+
collection.create_search_indexes(models=[vs_model, fts_model])
|
66 |
+
|
67 |
+
return vector_store
|
68 |
+
|
69 |
+
|
70 |
+
def get_repos_collection():
|
71 |
+
return mongo_client[DB_NAME][REPOS_COLLECTION_NAME]
|
72 |
+
|
73 |
+
|
74 |
+
def store_ingested_repo(repo_name: str, ingested_files: List[str]) -> bool:
|
75 |
+
try:
|
76 |
+
repos_collection = get_repos_collection()
|
77 |
+
|
78 |
+
# Simple document format
|
79 |
+
repo_doc = {
|
80 |
+
"_id": repo_name, # Use repo name as unique ID
|
81 |
+
"repo_name": repo_name,
|
82 |
+
"ingested_files": ingested_files,
|
83 |
+
"file_count": len(ingested_files),
|
84 |
+
}
|
85 |
+
|
86 |
+
# Upsert the document (update if exists, insert if not)
|
87 |
+
repos_collection.replace_one({"_id": repo_name}, repo_doc, upsert=True)
|
88 |
+
|
89 |
+
print(f"✅ Stored repository: {repo_name} with {len(ingested_files)} files")
|
90 |
+
return True
|
91 |
+
|
92 |
+
except Exception as e:
|
93 |
+
print(f"❌ Error storing repository data: {e}")
|
94 |
+
return False
|
95 |
+
|
96 |
+
|
97 |
+
def get_available_repos():
|
98 |
+
try:
|
99 |
+
repos_collection = get_repos_collection()
|
100 |
+
|
101 |
+
# Get all repository names
|
102 |
+
repos = repos_collection.find({}, {"repo_name": 1})
|
103 |
+
repo_list = [repo["repo_name"] for repo in repos]
|
104 |
+
|
105 |
+
if repo_list:
|
106 |
+
return sorted(repo_list)
|
107 |
+
else:
|
108 |
+
# Fallback to hardcoded list if no repos in database
|
109 |
+
return []
|
110 |
+
|
111 |
+
except Exception as e:
|
112 |
+
print(f"Error getting repos from database: {e}")
|
113 |
+
# Fallback to hardcoded list
|
114 |
+
return []
|
115 |
+
|
116 |
+
|

def get_repo_details():
    """Get detailed information about all repositories"""
    try:
        repos_collection = get_repos_collection()

        # Get all repository details
        repos = repos_collection.find({})
        repo_details = []

        for repo in repos:
            repo_info = {
                "repo_name": repo.get("repo_name", "Unknown"),
                "file_count": repo.get("file_count", 0),
                "last_updated": repo.get("last_updated", "Unknown"),
                "ingested_files": repo.get("ingested_files", []),
            }
            repo_details.append(repo_info)

        return repo_details

    except Exception as e:
        print(f"Error getting repo details: {e}")
        return []


def delete_repository_data(repo_name):
    try:
        result = {
            "success": False,
            "message": "",
            "vector_docs_deleted": 0,
            "repo_record_deleted": False,
        }

        # Delete from vector store (documents with this repo metadata)
        collection = mongo_client[DB_NAME][COLLECTION_NAME]
        vector_delete_result = collection.delete_many({"metadata.repo": repo_name})
        result["vector_docs_deleted"] = vector_delete_result.deleted_count

        # Delete from repos tracking collection
        repos_collection = get_repos_collection()
        repo_delete_result = repos_collection.delete_one({"_id": repo_name})
        result["repo_record_deleted"] = repo_delete_result.deleted_count > 0

        if result["vector_docs_deleted"] > 0 or result["repo_record_deleted"]:
            result["success"] = True
            result["message"] = f"✅ Successfully deleted repository '{repo_name}'"
            if result["vector_docs_deleted"] > 0:
                result["message"] += (
                    f" ({result['vector_docs_deleted']} documents removed)"
                )
        else:
            result["message"] = (
                f"⚠️ Repository '{repo_name}' not found or already deleted"
            )

        print(result["message"])
        return result

    except Exception as e:
        error_msg = f"❌ Error deleting repository '{repo_name}': {str(e)}"
        print(error_msg)
        return {
            "success": False,
            "message": error_msg,
            "vector_docs_deleted": 0,
            "repo_record_deleted": False,
        }


def get_repository_stats():
    try:
        repos_collection = get_repos_collection()
        collection = mongo_client[DB_NAME][COLLECTION_NAME]

        # Count total repositories
        total_repos = repos_collection.count_documents({})

        # Count total documents in vector store
        total_docs = collection.count_documents({})

        # Get total files across all repos
        total_files = 0
        repos = repos_collection.find({}, {"file_count": 1})
        for repo in repos:
            total_files += repo.get("file_count", 0)

        return {
            "total_repositories": total_repos,
            "total_documents": total_docs,
            "total_files": total_files,
        }

    except Exception as e:
        print(f"Error getting repository stats: {e}")
        return {"total_repositories": 0, "total_documents": 0, "total_files": 0}
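A minimal sketch of how these repository-tracking helpers fit together (the repo name is only an illustration):

```python
# Record an ingested repo, inspect what is stored, then clean it up.
store_ingested_repo("gradio-app/gradio", ["README.md", "guides/quickstart.md"])

print(get_available_repos())     # e.g. ["gradio-app/gradio"]
print(get_repository_stats())    # {"total_repositories": ..., "total_documents": ..., "total_files": ...}

result = delete_repository_data("gradio-app/gradio")
print(result["message"])         # success message or "not found" warning
```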
rag/github_file_loader.py
ADDED
@@ -0,0 +1,521 @@
import asyncio
import base64
import logging
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import aiohttp
import requests
from llama_index.core.schema import Document

logger = logging.getLogger(__name__)


class GithubFileLoader:
    """
    GitHub file loader that fetches specific files asynchronously.

    Returns LlamaIndex Document objects for each successfully loaded file.
    """

    def __init__(
        self,
        github_token: Optional[str] = None,
        concurrent_requests: int = 10,
        timeout: int = 30,
        retries: int = 3,
    ):
        """
        Initialize GitHub file loader.

        Args:
            github_token: GitHub API token for higher rate limits
            concurrent_requests: Number of concurrent requests
            timeout: Request timeout in seconds
            retries: Number of retry attempts for failed requests
        """
        self.github_token = github_token
        self.concurrent_requests = concurrent_requests
        self.timeout = timeout
        self.retries = retries

        # Setup headers
        self.headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "LlamaIndex-GitHub-Loader/1.0",
        }

        if self.github_token:
            self.headers["Authorization"] = f"token {self.github_token}"

    def fetch_repository_files(
        self,
        repo_url: str,
        file_extensions: List[str] = [".md", ".mdx"],
        branch: str = "main",
    ) -> Tuple[List[str], str]:
        """
        Fetch files from GitHub repository using GitHub API

        Args:
            repo_url: GitHub repository URL or owner/repo format
            file_extensions: List of file extensions to filter (e.g., [".md", ".mdx", ".txt"])
            branch: Branch name to fetch from

        Returns:
            Tuple of (list_of_file_paths, status_message)
        """
        try:
            # Parse GitHub URL to extract owner and repo
            repo_name = self._parse_repo_name(repo_url)
            if not repo_name:
                return (
                    [],
                    "Invalid GitHub URL format. Use: https://github.com/owner/repo or owner/repo",
                )

            # GitHub API endpoint for repository tree
            api_url = f"https://api.github.com/repos/{repo_name}/git/trees/{branch}?recursive=1"

            # Make request with authentication if token is available
            response = requests.get(api_url, headers=self.headers, timeout=self.timeout)

            if response.status_code == 200:
                data = response.json()
                filtered_files = []

                # Filter for specified file extensions
                for item in data.get("tree", []):
                    if item["type"] == "blob":
                        file_path = item["path"]
                        # Check if file has any of the specified extensions
                        if any(
                            file_path.lower().endswith(ext.lower())
                            for ext in file_extensions
                        ):
                            filtered_files.append(file_path)

                if filtered_files:
                    ext_str = ", ".join(file_extensions)
                    return (
                        filtered_files,
                        f"Found {len(filtered_files)} files with extensions ({ext_str}) in {repo_name}/{branch}",
                    )
                else:
                    ext_str = ", ".join(file_extensions)
                    return (
                        [],
                        f"No files with extensions ({ext_str}) found in repository {repo_name}/{branch}",
                    )

            elif response.status_code == 404:
                return (
                    [],
                    f"Repository '{repo_name}' not found or branch '{branch}' doesn't exist",
                )
            elif response.status_code == 403:
                if "rate limit" in response.text.lower():
                    return (
                        [],
                        "GitHub API rate limit exceeded. Consider using a GitHub token.",
                    )
                else:
                    return (
                        [],
                        "Access denied. Repository may be private or require authentication.",
                    )
            else:
                return (
                    [],
                    f"GitHub API Error: {response.status_code} - {response.text[:200]}",
                )

        except requests.exceptions.Timeout:
            return [], f"Request timeout after {self.timeout} seconds"
        except requests.exceptions.RequestException as e:
            return [], f"Network error: {str(e)}"
        except Exception as e:
            return [], f"Unexpected error: {str(e)}"

    def _parse_repo_name(self, repo_url: str) -> Optional[str]:
        """
        Parse repository URL to extract owner/repo format

        Args:
            repo_url: GitHub repository URL or owner/repo format

        Returns:
            Repository name in "owner/repo" format or None if invalid
        """
        if "github.com" in repo_url:
            # Extract from full URL
            parts = (
                repo_url.replace("https://github.com/", "")
                .replace("http://github.com/", "")
                .strip("/")
                .split("/")
            )
            if len(parts) >= 2:
                return f"{parts[0]}/{parts[1]}"
        else:
            # Assume format is owner/repo
            parts = repo_url.strip().split("/")
            if len(parts) == 2 and all(part.strip() for part in parts):
                return repo_url.strip()

        return None

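A short usage sketch for the discovery step; `_parse_repo_name` accepts either the bare `owner/repo` form or a full `https://github.com/owner/repo` URL (the repo below is only an example):

```python
# List candidate documentation files before any content is downloaded.
loader = GithubFileLoader()
files, status = loader.fetch_repository_files(
    "https://github.com/gradio-app/gradio",
    file_extensions=[".md", ".mdx"],
    branch="main",
)
print(status)     # e.g. "Found N files with extensions (.md, .mdx) in gradio-app/gradio/main"
print(files[:5])
```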
    def fetch_markdown_files(
        self, repo_url: str, branch: str = "main"
    ) -> Tuple[List[str], str]:
        """
        Fetch markdown files from GitHub repository (backward compatibility method)

        Args:
            repo_url: GitHub repository URL or owner/repo format
            branch: Branch name to fetch from

        Returns:
            Tuple of (list_of_markdown_files, status_message)
        """
        return self.fetch_repository_files(
            repo_url=repo_url, file_extensions=[".md", ".mdx"], branch=branch
        )

    async def load_files(
        self, repo_name: str, file_paths: List[str], branch: str = "main"
    ) -> Tuple[List[Document], List[str]]:
        """
        Load files from GitHub repository asynchronously.

        Args:
            repo_name: Repository name in format "owner/repo"
            file_paths: List of file paths to load
            branch: Branch name to load from

        Returns:
            Tuple of (successfully_loaded_documents, failed_file_paths)
        """
        if not file_paths:
            return [], []

        # Validate repo name format
        if not re.match(r"^[^/]+/[^/]+$", repo_name):
            raise ValueError(f"Invalid repo format: {repo_name}. Expected 'owner/repo'")

        # Create semaphore to limit concurrent requests
        semaphore = asyncio.Semaphore(self.concurrent_requests)

        # Create session
        connector = aiohttp.TCPConnector(limit=self.concurrent_requests)
        timeout_config = aiohttp.ClientTimeout(total=self.timeout)

        async with aiohttp.ClientSession(
            headers=self.headers, connector=connector, timeout=timeout_config
        ) as session:
            # Create tasks for all files
            tasks = []
            for file_path in file_paths:
                task = asyncio.create_task(
                    self._fetch_file_with_retry(
                        session, semaphore, repo_name, file_path, branch
                    )
                )
                tasks.append(task)

            # Wait for all tasks to complete
            results = await asyncio.gather(*tasks, return_exceptions=True)

        # Process results
        documents = []
        failed_files = []

        for i, result in enumerate(results):
            file_path = file_paths[i]

            if isinstance(result, Exception):
                logger.error(f"Failed to load {file_path}: {result}")
                failed_files.append(file_path)
            elif result is None:
                logger.warning(f"No content returned for {file_path}")
                failed_files.append(file_path)
            else:
                documents.append(result)

        logger.info(
            f"Successfully loaded {len(documents)} files, failed: {len(failed_files)}"
        )
        return documents, failed_files

    async def _fetch_file_with_retry(
        self,
        session: aiohttp.ClientSession,
        semaphore: asyncio.Semaphore,
        repo_name: str,
        file_path: str,
        branch: str,
    ) -> Optional[Document]:
        """Fetch a single file with retry logic."""
        async with semaphore:
            for attempt in range(self.retries + 1):
                try:
                    return await self._fetch_single_file(
                        session, repo_name, file_path, branch
                    )
                except Exception as e:
                    if attempt == self.retries:
                        logger.error(
                            f"Failed to fetch {file_path} after {self.retries + 1} attempts: {e}"
                        )
                        raise
                    else:
                        logger.warning(
                            f"Attempt {attempt + 1} failed for {file_path}: {e}"
                        )
                        await asyncio.sleep(2**attempt)  # Exponential backoff

        return None

    async def _fetch_single_file(
        self,
        session: aiohttp.ClientSession,
        repo_name: str,
        file_path: str,
        branch: str,
    ) -> Document:
        """Fetch a single file from GitHub API."""
        # Clean file path
        clean_path = file_path.strip("/")

        # Build API URL
        api_url = f"https://api.github.com/repos/{repo_name}/contents/{clean_path}"
        params = {"ref": branch}

        logger.debug(f"Fetching: {api_url}")

        async with session.get(api_url, params=params) as response:
            if response.status == 404:
                raise FileNotFoundError(f"File not found: {file_path}")
            elif response.status == 403:
                raise PermissionError("API rate limit exceeded or access denied")
            elif response.status != 200:
                raise Exception(f"HTTP {response.status}: {await response.text()}")

            data = await response.json()

            # Handle directory case
            if isinstance(data, list):
                raise ValueError(f"Path {file_path} is a directory, not a file")

            # Decode file content
            if data.get("encoding") == "base64":
                content_bytes = base64.b64decode(data["content"])
                try:
                    content_text = content_bytes.decode("utf-8")
                except UnicodeDecodeError as e:
                    logger.warning(f"Failed to decode {file_path} as UTF-8: {e}")
                    # Fall back to latin-1 so a stray byte doesn't fail the whole file
                    content_text = content_bytes.decode("latin-1", errors="ignore")
            else:
                raise ValueError(f"Unsupported encoding: {data.get('encoding')}")

            # Create Document
            document = self._create_document(
                content=content_text,
                file_path=clean_path,
                repo_name=repo_name,
                branch=branch,
                file_data=data,
            )

            return document

    def _create_document(
        self, content: str, file_path: str, repo_name: str, branch: str, file_data: Dict
    ) -> Document:
        """Create a LlamaIndex Document from file content and metadata."""

        # Extract file info
        filename = Path(file_path).name
        file_extension = Path(file_path).suffix.lower()
        directory = (
            str(Path(file_path).parent) if Path(file_path).parent != Path(".") else ""
        )

        # Build URLs
        html_url = f"https://github.com/{repo_name}/blob/{branch}/{file_path}"
        raw_url = file_data.get("download_url", "")

        # Create metadata
        metadata = {
            "file_path": file_path,
            "file_name": filename,
            "file_extension": file_extension,
            "directory": directory,
            "repo": repo_name,
            "branch": branch,
            "sha": file_data.get("sha", ""),
            "size": file_data.get("size", 0),
            "url": html_url,
            "raw_url": raw_url,
            "type": file_data.get("type", "file"),
        }

        # Create document with unique ID
        doc_id = f"{repo_name}:{branch}:{file_path}"

        document = Document(
            text=content,
            doc_id=doc_id,
            metadata=metadata,
        )

        return document

    def load_files_sync(
        self, repo_name: str, file_paths: List[str], branch: str = "main"
    ) -> Tuple[List[Document], List[str]]:
        """
        Synchronous wrapper for load_files.

        Args:
            repo_name: Repository name in format "owner/repo"
            file_paths: List of file paths to load
            branch: Branch name to load from

        Returns:
            Tuple of (successfully_loaded_documents, failed_file_paths)
        """
        return asyncio.run(self.load_files(repo_name, file_paths, branch))

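Note: `load_files_sync` relies on `asyncio.run`, which raises `RuntimeError` when called from code that is already running inside an event loop (an async Gradio handler, for example). In that situation the coroutine should be awaited directly; a minimal sketch:

```python
# Inside an already-async context, await load_files() instead of the sync wrapper.
async def load_docs(loader: GithubFileLoader, repo: str, paths: list[str]):
    documents, failed = await loader.load_files(repo, paths, branch="main")
    return documents, failed
```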
# Convenience functions
async def load_github_files_async(
    repo_name: str,
    file_paths: List[str],
    branch: str = "main",
    github_token: Optional[str] = None,
    concurrent_requests: int = 10,
) -> Tuple[List[Document], List[str]]:
    """
    Convenience function to load GitHub files asynchronously.

    Args:
        repo_name: Repository name in format "owner/repo"
        file_paths: List of file paths to load
        branch: Branch name to load from
        github_token: GitHub API token
        concurrent_requests: Number of concurrent requests

    Returns:
        Tuple of (documents, failed_files)
    """
    loader = GithubFileLoader(
        github_token=github_token, concurrent_requests=concurrent_requests
    )
    return await loader.load_files(repo_name, file_paths, branch)


def load_github_files(
    repo_name: str,
    file_paths: List[str],
    branch: str = "main",
    github_token: Optional[str] = None,
    concurrent_requests: int = 10,
) -> Tuple[List[Document], List[str]]:
    """
    Convenience function to load GitHub files synchronously.

    Args:
        repo_name: Repository name in format "owner/repo"
        file_paths: List of file paths to load
        branch: Branch name to load from
        github_token: GitHub API token
        concurrent_requests: Number of concurrent requests

    Returns:
        Tuple of (documents, failed_files)
    """
    loader = GithubFileLoader(
        github_token=github_token, concurrent_requests=concurrent_requests
    )
    return loader.load_files_sync(repo_name, file_paths, branch)


def fetch_markdown_files(
    repo_url: str, github_token: Optional[str] = None, branch: str = "main"
) -> Tuple[List[str], str]:
    """
    Convenience function to fetch markdown files from GitHub repository

    Args:
        repo_url: GitHub repository URL or owner/repo format
        github_token: GitHub API token for higher rate limits
        branch: Branch name to fetch from

    Returns:
        Tuple of (list_of_files, status_message)
    """
    loader = GithubFileLoader(github_token=github_token)
    return loader.fetch_markdown_files(repo_url, branch)


def fetch_repository_files(
    repo_url: str,
    file_extensions: List[str] = [".md", ".mdx"],
    github_token: Optional[str] = None,
    branch: str = "main",
) -> Tuple[List[str], str]:
    """
    Convenience function to fetch files with specific extensions from GitHub repository

    Args:
        repo_url: GitHub repository URL or owner/repo format
        file_extensions: List of file extensions to filter
        github_token: GitHub API token for higher rate limits
        branch: Branch name to fetch from

    Returns:
        Tuple of (list_of_files, status_message)
    """
    loader = GithubFileLoader(github_token=github_token)
    return loader.fetch_repository_files(repo_url, file_extensions, branch)

# Example usage
if __name__ == "__main__":
    # Example file paths
    file_paths = [
        "docs/contribute/docs.mdx",
        "docs/contribute/ml-handlers.mdx",
        "docs/contribute/community.mdx",
        "docs/contribute/python-coding-standards.mdx",
        "docs/features/data-integrations.mdx",
        "docs/features/ai-integrations.mdx",
        "docs/integrations/ai-engines/langchain_embedding.mdx",
        "docs/integrations/ai-engines/langchain.mdx",
        "docs/integrations/ai-engines/google_gemini.mdx",
        "docs/integrations/ai-engines/anomaly.mdx",
        "docs/integrations/ai-engines/amazon-bedrock.mdx",
    ]

    # Load files synchronously
    documents, failed = load_github_files(
        repo_name="mindsdb/mindsdb",
        file_paths=file_paths,
        branch="main",  # Optional
    )

    print(f"Loaded {len(documents)} documents")
    print(f"Failed to load {len(failed)} files: {failed}")

    # Print first document info
    if documents:
        doc = documents[0]
        print("\nFirst document:")
        print(f"ID: {doc.doc_id}")
        print(f"File: {doc.metadata['file_path']}")
        print(f"Size: {len(doc.text)} characters")
        print(f"Content preview: {doc.text[:200]}...")
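When pointing the loader at larger repositories, the unauthenticated GitHub API limit (60 requests/hour) runs out quickly; passing the `GITHUB_API_KEY` value from `.env.example` raises it to 5,000 requests/hour. A sketch:

```python
# Authenticated loader using the GITHUB_API_KEY variable declared in .env.example.
import os

loader = GithubFileLoader(github_token=os.getenv("GITHUB_API_KEY"))
files, status = loader.fetch_markdown_files("gradio-app/gradio")
print(status)
```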
rag/ingest.py
ADDED
@@ -0,0 +1,66 @@
from typing import List, Optional

from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.schema import Document

from .config import get_vector_store, store_ingested_repo


async def ingest_documents_async(documents: List[Document], repo_name: Optional[str] = None):
    """Async version of document ingestion with detailed logging and repo tracking"""
    print(f"🔄 Starting async ingestion of {len(documents)} documents")

    if repo_name:
        print(f"📍 Repository: {repo_name}")

    try:
        # Get vector store
        vector_store = get_vector_store()
        print(f"✅ Vector store retrieved: {type(vector_store)}")

        # Create storage context
        vector_store_context = StorageContext.from_defaults(vector_store=vector_store)
        print(f"✅ Vector Store context created: {type(vector_store_context)}")

        # Process documents and ensure repo metadata
        print("🔄 Processing documents through pipeline...")
        ingested_files = []

        for i, doc in enumerate(documents):
            print(f"📄 Doc {i + 1}: {doc.doc_id} - {len(doc.text)} chars")
            print(f"   Metadata: {doc.metadata}")

            # Ensure repo metadata is properly set
            if repo_name and "repo" not in doc.metadata:
                doc.metadata["repo"] = repo_name
                print(f"   ✅ Added repo metadata: {repo_name}")

            # Track ingested file paths
            file_path = doc.metadata.get("file_path", doc.doc_id)
            if file_path not in ingested_files:
                ingested_files.append(file_path)

        # Run the ingestion
        print("🚀 Starting vector store ingestion...")
        vc_store_index = VectorStoreIndex.from_documents(
            documents=documents,
            storage_context=vector_store_context,
            show_progress=True,
        )
        print("✅ Document Ingestion completed Successfully")

        # Store repository metadata if repo_name is provided
        if repo_name and ingested_files:
            store_success = store_ingested_repo(repo_name, ingested_files)
            if store_success:
                print(f"✅ Repository metadata stored for {repo_name}")
            else:
                print(f"⚠️ Failed to store repository metadata for {repo_name}")

        return vc_store_index

    except Exception as e:
        print(f"❌ Error in async ingestion: {str(e)}")
        import traceback
        traceback.print_exc()
        raise e
rag/query.py
ADDED
@@ -0,0 +1,94 @@
from typing import List, Optional

from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
)
from pydantic import BaseModel, Field

from .config import get_vector_store


class Node(BaseModel):
    file_name: str = Field(description="Name of the file")
    url: str = Field(description="GitHub repo url of the file")
    score: float = Field(description="Relevance score of the node")
    content: str = Field(description="Content of the node")


class ContextResponseModel(BaseModel):
    response: str = Field(description="Response for user's query")
    source_nodes: Optional[List[Node]] = Field(
        default=None, description="List of sources used to generate response"
    )


class QueryRetriever:
    def __init__(self, repo):
        self.vector_store_index = VectorStoreIndex.from_vector_store(get_vector_store())
        self.filters = MetadataFilters(
            filters=[
                MetadataFilter(
                    key="metadata.repo",
                    value=repo,
                    operator=FilterOperator.EQ,
                )
            ]
        )

    def make_query(self, query: str, mode: str = "default") -> dict:
        """
        Retrieve relevant documentation context for a given query using specified retrieval mode.

        This function is designed to support Retrieval-Augmented Generation (RAG) by extracting
        the most relevant context chunks from indexed documentation sources.

        Args:
            query (str): The user's input query related to the documentation.
            mode (str, optional): Retrieval strategy to use. One of:
                - "default": Standard semantic similarity search.
                - "text_search": Keyword-based search.
                - "hybrid": Combines semantic and keyword-based methods.
                Defaults to "default".

        Returns:
            dict: Dictionary with 'response' and 'source_nodes' keys
        """
        query_engine = self.vector_store_index.as_query_engine(
            similarity_top_k=5,
            vector_store_query_mode=mode,
            filters=self.filters,
            response_mode="refine",
        )

        response = query_engine.query(query)
        nodes = []
        for node in response.source_nodes:
            nodes.append(
                {
                    "file_name": node.metadata.get("file_name", "Unknown"),
                    "url": node.metadata.get("url", "#"),
                    "score": float(node.score) if node.score else 0.0,
                    "content": node.get_content(),
                }
            )

        return {"response": str(response.response), "source_nodes": nodes}

    @staticmethod
    def get_available_repos() -> List[str]:
        """Get list of available repositories in the vector store"""
        try:
            from .config import get_available_repos as get_repos_from_db

            print("fetching repos")
            repos = get_repos_from_db()

            print(repos)
            return repos
        except Exception as e:
            print(f"Error getting repos from database: {e}")
            # Fallback to hardcoded list
            return ["mindsdb/mindsdb", "run-llama/llama_index"]
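A minimal query sketch against an already-ingested repository, exercising the three retrieval modes (the repo and question are illustrative):

```python
# Query an ingested repo and print the answer plus its cited sources.
retriever = QueryRetriever(repo="gradio-app/gradio")

for mode in ("default", "text_search", "hybrid"):
    result = retriever.make_query("How do I create a custom component?", mode=mode)
    print(f"[{mode}] {result['response'][:120]}")
    for node in result["source_nodes"]:
        print("  source:", node["file_name"], "score:", node["score"])
```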
requirements.txt
ADDED
Binary file (5.15 kB)