import asyncio
import os
import time
import traceback
from typing import Dict, List

import gradio as gr
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.core.text_splitter import SentenceSplitter

from rag.config import (
    delete_repository_data,
    embed_model,
    get_available_repos,
    get_repo_details,
    get_repository_stats,
    llm,
)
from rag.github_file_loader import fetch_markdown_files as fetch_files_with_loader
from rag.github_file_loader import fetch_repository_files, load_github_files
from rag.ingest import ingest_documents_async
from rag.query import QueryRetriever

load_dotenv()

Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = SentenceSplitter(chunk_size=3072)

# Environment variable to control repository management visibility
ENABLE_REPO_MANAGEMENT = os.getenv("ENABLE_REPO_MANAGEMENT", "true").lower() == "true"
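
# Illustrative .env sketch for local runs (names match the os.getenv() calls in
# this file; any other keys consumed inside rag.config are not shown here):
#
#   GITHUB_API_KEY=ghp_xxxxxxxxxxxxxxxx   # token used for GitHub API calls
#   ENABLE_REPO_MANAGEMENT=true           # set to "false" to hide the management tab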
def get_available_repositories():
    return get_available_repos()


def start_file_loading(
    repo_url: str, selected_files: List[str], current_progress: Dict
):
    """Step 1: Load files from GitHub"""
    print("\nSTARTING FILE LOADING STEP")
    print(f"Repository: {repo_url}")
    print(f"Selected files: {selected_files}")

    if not selected_files:
        return {
            "status": "error",
            "message": "No files selected for loading",
            "progress": 0,
            "details": "",
            "step": "file_loading",
        }

    total_files = len(selected_files)
    start_time = time.time()

    # Parse repo name from URL
    if "github.com" in repo_url:
        repo_name = (
            repo_url.replace("https://github.com/", "")
            .replace("http://github.com/", "")
            .strip("/")
        )
        if "/" not in repo_name:
            return {
                "status": "error",
                "message": "Invalid repository URL format",
                "progress": 0,
                "details": "",
                "step": "file_loading",
            }
    else:
        repo_name = repo_url.strip()

    try:
        batch_size = 25
        all_documents = []
        all_failed = []

        current_progress.update(
            {
                "status": "loading",
                "message": f"Loading files from {repo_name}",
                "progress": 0,
                "total_files": total_files,
                "processed_files": 0,
                "phase": "File Loading",
                "details": f"Processing {total_files} files in batches...",
                "step": "file_loading",
            }
        )

        for i in range(0, len(selected_files), batch_size):
            batch = selected_files[i : i + batch_size]
            print(f"\nPROCESSING BATCH {i // batch_size + 1}")
            print(f" Files: {batch}")

            # Update progress for current batch
            progress_percentage = (i / total_files) * 100
            current_progress.update(
                {
                    "progress": progress_percentage,
                    "processed_files": i,
                    "current_batch": i // batch_size + 1,
                    "details": f"Loading batch {i // batch_size + 1}: {', '.join([f.split('/')[-1] for f in batch])}",
                }
            )

            try:
                documents, failed = load_github_files(
                    repo_name=repo_name,
                    file_paths=batch,
                    branch="main",
                    concurrent_requests=10,
                    github_token=os.getenv("GITHUB_API_KEY"),
                )

                print("Load results:")
                print(f" - Documents: {len(documents)}")
                print(f" - Failed: {len(failed)}")

                if documents:
                    for j, doc in enumerate(documents):
                        print(f" Doc {j + 1}: {doc.doc_id}")
                        print(f" Size: {len(doc.text)} chars")

                        # Ensure repo metadata is set
                        if "repo" not in doc.metadata:
                            doc.metadata["repo"] = repo_name
                            print(f" Added repo metadata: {repo_name}")

                all_documents.extend(documents)
                all_failed.extend(failed)

            except Exception as batch_error:
                print(f"Batch processing error: {batch_error}")
                all_failed.extend(batch)

        loading_time = time.time() - start_time

        # Store loaded documents in progress state for next step
        current_progress.update(
            {
                "status": "loaded",
                "message": f"File Loading Complete! Loaded {len(all_documents)} documents",
                "progress": 100,
                "phase": "Files Loaded",
                "details": f"Successfully loaded {len(all_documents)} documents in {loading_time:.1f}s",
                "step": "file_loading_complete",
                "loaded_documents": all_documents,  # Store documents for next step
                "failed_files": all_failed,
                "loading_time": loading_time,
                "repo_name": repo_name,
            }
        )

        return current_progress

    except Exception as e:
        total_time = time.time() - start_time
        error_msg = f"File loading error after {total_time:.1f}s: {str(e)}"
        print(error_msg)

        current_progress.update(
            {
                "status": "error",
                "message": error_msg,
                "progress": 0,
                "phase": "Failed",
                "details": str(e),
                "error": str(e),
                "step": "file_loading",
            }
        )
        return current_progress
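

# Illustrative (not executed) sketch of how the two step functions chain
# together; the repository and file paths below are placeholders:
#
#   progress = start_file_loading(
#       "https://github.com/gradio-app/gradio",
#       ["README.md", "guides/quickstart.md"],
#       {},
#   )
#   if progress.get("step") == "file_loading_complete":
#       progress = start_vector_ingestion(progress)
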
def start_vector_ingestion(current_progress: Dict):
    """Step 2: Ingest loaded documents into vector store"""
    print("\nSTARTING VECTOR INGESTION STEP")

    # Check if we have loaded documents from previous step
    if current_progress.get("step") != "file_loading_complete":
        return {
            "status": "error",
            "message": "No loaded documents found. Please load files first.",
            "progress": 0,
            "details": "",
            "step": "vector_ingestion",
        }

    all_documents = current_progress.get("loaded_documents", [])
    repo_name = current_progress.get("repo_name", "")

    if not all_documents:
        return {
            "status": "error",
            "message": "No documents available for vector ingestion",
            "progress": 0,
            "details": "",
            "step": "vector_ingestion",
        }

    vector_start_time = time.time()

    # Update state for vector store phase
    current_progress.update(
        {
            "status": "vectorizing",
            "message": "Generating embeddings and storing in vector database",
            "progress": 0,
            "phase": "Vector Store Ingestion",
            "details": f"Processing {len(all_documents)} documents for embedding...",
            "step": "vector_ingestion",
        }
    )

    try:
        print("STARTING VECTOR STORE INGESTION")
        print(f" Repository: {repo_name}")
        print(f" Documents to process: {len(all_documents)}")

        # Call the async ingestion function with repo name
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(ingest_documents_async(all_documents, repo_name))
        finally:
            loop.close()

        vector_time = time.time() - vector_start_time
        loading_time = current_progress.get("loading_time", 0)
        total_time = loading_time + vector_time

        print(f"Vector ingestion completed in {vector_time:.2f} seconds")

        failed_files_data = current_progress.get("failed_files", [])
        if isinstance(failed_files_data, list):
            failed_files_count = len(failed_files_data)
        else:
            failed_files_count = (
                failed_files_data if isinstance(failed_files_data, int) else 0
            )

        # Update final success state with repository update flag
        current_progress.update(
            {
                "status": "complete",
                "message": "Complete Ingestion Pipeline Finished!",
                "progress": 100,
                "phase": "Complete",
                "details": f"Successfully processed {len(all_documents)} documents for {repo_name}",
                "step": "complete",
                "total_time": total_time,
                "documents_processed": len(all_documents),
                "failed_files_count": failed_files_count,  # Use count instead of trying len()
                "failed_files": failed_files_data,  # Keep original data
                "vector_time": vector_time,
                "loading_time": loading_time,
                "repo_name": repo_name,
                "repository_updated": True,  # Flag to trigger repo list refresh
            }
        )

        return current_progress

    except Exception as ingest_error:
        vector_time = time.time() - vector_start_time
        print(f"Vector ingestion failed after {vector_time:.2f} seconds")
        print(f"Error: {ingest_error}")

        # Get failed files data safely
        failed_files_data = current_progress.get("failed_files", [])
        if isinstance(failed_files_data, list):
            failed_files_count = len(failed_files_data)
        else:
            failed_files_count = (
                failed_files_data if isinstance(failed_files_data, int) else 0
            )

        current_progress.update(
            {
                "status": "error",
                "message": "Vector Store Ingestion Failed",
                "progress": 0,
                "phase": "Failed",
                "details": f"Error: {str(ingest_error)}",
                "error": str(ingest_error),
                "step": "vector_ingestion",
                "failed_files_count": failed_files_count,
                "failed_files": failed_files_data,
            }
        )
        return current_progress
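

# Note on the event loop above: Gradio click handlers run in worker threads that
# have no running asyncio loop, hence the explicit new_event_loop()/close() pair.
# Assuming ingest_documents_async() does not depend on a pre-existing loop, a
# simpler equivalent would be:
#
#   asyncio.run(ingest_documents_async(all_documents, repo_name))
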
def start_file_loading_generator( | |
repo_url: str, selected_files: List[str], current_progress: Dict | |
): | |
"""Step 1: Load files from GitHub with yield-based real-time updates""" | |
print("\n๐ STARTING FILE LOADING STEP") | |
print(f"๐ Repository: {repo_url}") | |
print(f"๐ Selected files: {len(selected_files)} files") | |
if not selected_files: | |
error_progress = { | |
"status": "error", | |
"message": "โ No files selected for loading", | |
"progress": 0, | |
"details": "Please select at least one file to proceed.", | |
"step": "file_loading", | |
} | |
yield error_progress | |
return error_progress | |
total_files = len(selected_files) | |
start_time = time.time() | |
# Parse repo name from URL | |
if "github.com" in repo_url: | |
repo_name = ( | |
repo_url.replace("https://github.com/", "") | |
.replace("http://github.com/", "") | |
.strip("/") | |
) | |
if "/" not in repo_name: | |
error_progress = { | |
"status": "error", | |
"message": "โ Invalid repository URL format", | |
"progress": 0, | |
"details": "Expected format: owner/repo or https://github.com/owner/repo", | |
"step": "file_loading", | |
} | |
yield error_progress | |
return error_progress | |
else: | |
repo_name = repo_url.strip() | |
try: | |
batch_size = 10 | |
all_documents = [] | |
all_failed = [] | |
# Initial progress update | |
initial_progress = { | |
"status": "loading", | |
"message": f"๐ Starting file loading from {repo_name}", | |
"progress": 0, | |
"total_files": total_files, | |
"processed_files": 0, | |
"successful_files": 0, | |
"failed_files": 0, | |
"phase": "File Loading", | |
"details": f"Preparing to load {total_files} files in batches of {batch_size}...", | |
"step": "file_loading", | |
"current_batch": 0, | |
"total_batches": (len(selected_files) + batch_size - 1) // batch_size, | |
"repo_name": repo_name, | |
} | |
yield initial_progress | |
time.sleep(0.5) | |
for i in range(0, len(selected_files), batch_size): | |
batch = selected_files[i : i + batch_size] | |
current_batch_num = i // batch_size + 1 | |
total_batches = (len(selected_files) + batch_size - 1) // batch_size | |
# Update progress at batch start | |
batch_start_progress = { | |
"status": "loading", | |
"message": f"๐ Loading batch {current_batch_num}/{total_batches}", | |
"progress": (i / total_files) * 90, | |
"processed_files": i, | |
"successful_files": len(all_documents), | |
"failed_files": len(all_failed), | |
"current_batch": current_batch_num, | |
"total_batches": total_batches, | |
"phase": "File Loading", | |
"details": f"Processing batch {current_batch_num}: {', '.join([f.split('/')[-1] for f in batch[:3]])}{'...' if len(batch) > 3 else ''}", | |
"step": "file_loading", | |
"repo_name": repo_name, | |
} | |
yield batch_start_progress | |
try: | |
print(f"\n๐ฆ PROCESSING BATCH {current_batch_num}/{total_batches}") | |
print(f" Files: {[f.split('/')[-1] for f in batch]}") | |
documents, failed = load_github_files( | |
repo_name=repo_name, | |
file_paths=batch, | |
branch="main", | |
concurrent_requests=10, | |
github_token=os.getenv("GITHUB_API_KEY"), | |
) | |
print("โ Load results:") | |
print(f" - Documents: {len(documents)}") | |
print(f" - Failed: {len(failed)}") | |
# Process documents | |
for j, doc in enumerate(documents): | |
print(f" ๐ Doc {j + 1}: {doc.doc_id}") | |
print(f" Size: {len(doc.text)} chars") | |
if "repo" not in doc.metadata: | |
doc.metadata["repo"] = repo_name | |
print(f" โ Added repo metadata: {repo_name}") | |
all_documents.extend(documents) | |
all_failed.extend(failed) | |
# Update progress after batch completion | |
batch_complete_progress = { | |
"status": "loading", | |
"message": f"โ Completed batch {current_batch_num}/{total_batches}", | |
"progress": ((i + len(batch)) / total_files) * 90, | |
"processed_files": i + len(batch), | |
"successful_files": len(all_documents), | |
"failed_files": len(all_failed), | |
"current_batch": current_batch_num, | |
"total_batches": total_batches, | |
"phase": "File Loading", | |
"details": f"โ Batch {current_batch_num} complete: {len(documents)} loaded, {len(failed)} failed. Total progress: {len(all_documents)} documents loaded.", | |
"step": "file_loading", | |
"repo_name": repo_name, | |
} | |
yield batch_complete_progress | |
time.sleep(0.3) | |
except Exception as batch_error: | |
print(f"โ Batch processing error: {batch_error}") | |
all_failed.extend(batch) | |
error_progress = { | |
"status": "loading", | |
"message": f"โ ๏ธ Error in batch {current_batch_num}", | |
"progress": ((i + len(batch)) / total_files) * 90, | |
"processed_files": i + len(batch), | |
"successful_files": len(all_documents), | |
"failed_files": len(all_failed), | |
"current_batch": current_batch_num, | |
"phase": "File Loading", | |
"details": f"โ Batch {current_batch_num} error: {str(batch_error)[:100]}... Continuing with next batch.", | |
"step": "file_loading", | |
"repo_name": repo_name, | |
} | |
yield error_progress | |
loading_time = time.time() - start_time | |
# Final completion update | |
completion_progress = { | |
"status": "loaded", | |
"message": f"โ File Loading Complete! Loaded {len(all_documents)} documents", | |
"progress": 100, | |
"phase": "Files Loaded Successfully", | |
"details": f"๐ฏ Final Results:\nโ Successfully loaded: {len(all_documents)} documents\nโ Failed files: {len(all_failed)}\nโฑ๏ธ Total time: {loading_time:.1f}s\n๐ Success rate: {(len(all_documents) / (len(all_documents) + len(all_failed)) * 100):.1f}%", | |
"step": "file_loading_complete", | |
"loaded_documents": all_documents, | |
"failed_files": all_failed, | |
"loading_time": loading_time, | |
"repo_name": repo_name, | |
"total_files": total_files, | |
"processed_files": total_files, | |
"successful_files": len(all_documents), | |
} | |
yield completion_progress | |
return completion_progress | |
except Exception as e: | |
total_time = time.time() - start_time | |
error_msg = f"โ File loading error after {total_time:.1f}s: {str(e)}" | |
print(error_msg) | |
error_progress = { | |
"status": "error", | |
"message": error_msg, | |
"progress": 0, | |
"phase": "Loading Failed", | |
"details": f"Critical error during file loading:\n{str(e)}", | |
"error": str(e), | |
"step": "file_loading", | |
} | |
yield error_progress | |
return error_progress | |
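

# Gradio treats generator callbacks as streaming handlers: each yielded dict is
# pushed to the bound outputs as soon as it is produced, which is what drives
# the live batch-by-batch progress display. A minimal sketch of the same
# pattern (names purely illustrative):
#
#   def slow_task():
#       for pct in range(0, 101, 25):
#           time.sleep(1)
#           yield f"{pct}% done"
#
#   # some_button.click(fn=slow_task, outputs=[some_textbox])
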
# Progress display component | |
def format_progress_display(progress_state: Dict) -> str: | |
"""Format progress state into readable display with enhanced details""" | |
if not progress_state: | |
return "๐ Ready to start ingestion...\n\n๐ **Two-Step Process:**\n1๏ธโฃ Load files from GitHub repository\n2๏ธโฃ Generate embeddings and store in vector database" | |
status = progress_state.get("status", "unknown") | |
message = progress_state.get("message", "") | |
progress = progress_state.get("progress", 0) | |
phase = progress_state.get("phase", "") | |
details = progress_state.get("details", "") | |
# Enhanced progress bar | |
filled = int(progress / 2.5) # 40 chars total | |
progress_bar = "โ" * filled + "โ" * (40 - filled) | |
# Status emoji mapping | |
status_emoji = { | |
"loading": "โณ", | |
"loaded": "โ ", | |
"vectorizing": "๐ง ", | |
"complete": "๐", | |
"error": "โ", | |
} | |
emoji = status_emoji.get(status, "๐") | |
output = f"{emoji} **{message}**\n\n" | |
# Phase and progress section | |
output += f"๐ **Current Phase:** {phase}\n" | |
output += f"๐ **Progress:** {progress:.1f}%\n" | |
output += f"[{progress_bar}] {progress:.1f}%\n\n" | |
# Step-specific details for file loading | |
if progress_state.get("step") == "file_loading": | |
processed = progress_state.get("processed_files", 0) | |
total = progress_state.get("total_files", 0) | |
successful = progress_state.get("successful_files", 0) | |
failed = progress_state.get("failed_files", 0) | |
if total > 0: | |
output += "๐ **File Processing Status:**\n" | |
output += f" โข Total files: {total}\n" | |
output += f" โข Processed: {processed}/{total}\n" | |
output += f" โข โ Successful: {successful}\n" | |
output += f" โข โ Failed: {failed}\n" | |
if "current_batch" in progress_state and "total_batches" in progress_state: | |
output += f" โข ๐ฆ Current batch: {progress_state['current_batch']}/{progress_state['total_batches']}\n" | |
output += "\n" | |
# Step-specific details for vector ingestion | |
elif progress_state.get("step") == "vector_ingestion": | |
docs_count = progress_state.get("documents_count", 0) | |
repo_name = progress_state.get("repo_name", "Unknown") | |
if docs_count > 0: | |
output += "๐ง **Vector Processing Status:**\n" | |
output += f" โข Repository: {repo_name}\n" | |
output += f" โข Documents: {docs_count:,}\n" | |
output += f" โข Stage: {phase}\n\n" | |
# Detailed information | |
output += f"๐ **Details:**\n{details}\n" | |
# Final summary for completion | |
if status == "complete": | |
total_time = progress_state.get("total_time", 0) | |
docs_processed = progress_state.get("documents_processed", 0) | |
failed_files = progress_state.get("failed_files", 0) | |
vector_time = progress_state.get("vector_time", 0) | |
loading_time = progress_state.get("loading_time", 0) | |
repo_name = progress_state.get("repo_name", "Unknown") | |
output += "\n๐ **INGESTION COMPLETED SUCCESSFULLY!**\n" | |
output += "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n" | |
output += f"๐ฏ **Repository:** {repo_name}\n" | |
output += f"๐ **Documents processed:** {docs_processed:,}\n" | |
output += f"โ **Failed files:** {len(failed_files) if isinstance(failed_files, list) else failed_files}\n" | |
output += f"โฑ๏ธ **Total time:** {total_time:.1f} seconds\n" | |
output += f" โโ File loading: {loading_time:.1f}s\n" | |
output += f" โโ Vector processing: {vector_time:.1f}s\n" | |
output += ( | |
f"๐ **Processing rate:** {docs_processed / total_time:.1f} docs/second\n\n" | |
) | |
output += "๐ **Next Step:** Go to the 'Query Interface' tab to start asking questions!" | |
elif status == "error": | |
error = progress_state.get("error", "Unknown error") | |
output += "\n๐ฅ **ERROR OCCURRED**\n" | |
output += "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n" | |
output += ( | |
f"โ **Error Details:** {error[:300]}{'...' if len(error) > 300 else ''}\n" | |
) | |
output += "\n๐ง **Troubleshooting Tips:**\n" | |
output += " โข Check your GitHub token permissions\n" | |
output += " โข Verify repository URL format\n" | |
output += " โข Ensure selected files exist\n" | |
output += " โข Check network connectivity\n" | |
return output | |
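

# Example of the kind of progress-state dict this formatter renders (all values
# illustrative; the keys mirror the ones set by the loading/ingestion steps above):
#
#   format_progress_display({
#       "status": "loading",
#       "message": "Loading batch 1/3",
#       "progress": 33.3,
#       "phase": "File Loading",
#       "details": "Processing batch 1",
#       "step": "file_loading",
#       "processed_files": 10,
#       "total_files": 30,
#       "successful_files": 9,
#       "failed_files": 1,
#   })
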
# Create the main Gradio interface
with gr.Blocks(title="Doc-MCP") as demo:
    gr.Markdown("# Doc-MCP: Documentation RAG System")
    gr.Markdown(
        "Transform GitHub documentation repositories into accessible MCP (Model Context Protocol) servers for AI agents. Upload documentation, generate vector embeddings, and query with intelligent context retrieval."
    )

    # State variables
    files_state = gr.State([])
    progress_state = gr.State({})
with gr.Tabs(): | |
with gr.TabItem("๐ฅ Documentation Ingestion"): | |
gr.Markdown("### ๐ Two-Step Documentation Processing Pipeline") | |
gr.Markdown( | |
"**Step 1:** Fetch markdown files from GitHub repository โ **Step 2:** Generate vector embeddings and store in MongoDB Atlas" | |
) | |
with gr.Row(): | |
with gr.Column(scale=2): | |
repo_input = gr.Textbox( | |
label="๐ GitHub Repository URL", | |
placeholder="Enter: owner/repo or https://github.com/owner/repo (e.g., gradio-app/gradio)", | |
value="", | |
info="Enter any GitHub repository containing markdown documentation", | |
) | |
load_btn = gr.Button( | |
"๐ Discover Documentation Files", variant="secondary" | |
) | |
with gr.Column(scale=1): | |
status_output = gr.Textbox( | |
label="Repository Discovery Status", | |
interactive=False, | |
lines=4, | |
placeholder="Repository scanning results will appear here...", | |
) | |
with gr.Row(): | |
select_all_btn = gr.Button( | |
"๐ Select All Documents", variant="secondary" | |
) | |
clear_all_btn = gr.Button("๐๏ธ Clear Selection", variant="secondary") | |
# File selection | |
with gr.Accordion(label="Available Documentation Files"): | |
file_selector = gr.CheckboxGroup( | |
choices=[], | |
label="Select Markdown Files for RAG Processing", | |
visible=False, | |
) | |
# Two-step ingestion controls | |
gr.Markdown("### ๐ RAG Pipeline Execution") | |
gr.Markdown( | |
"Process your documentation through our advanced RAG pipeline using Nebius AI embeddings and MongoDB Atlas vector storage." | |
) | |
with gr.Row(): | |
with gr.Column(): | |
step1_btn = gr.Button( | |
"๐ฅ Step 1: Load Files from GitHub", | |
variant="primary", | |
size="lg", | |
interactive=False, | |
) | |
with gr.Column(): | |
step2_btn = gr.Button( | |
"๐ Step 2: Start Ingestion", | |
variant="primary", | |
size="lg", | |
interactive=False, | |
) | |
with gr.Row(): | |
refresh_btn = gr.Button("๐ Refresh Progress", variant="secondary") | |
reset_btn = gr.Button("๐๏ธ Reset Progress", variant="secondary") | |
# Progress display | |
progress_display = gr.Textbox( | |
label="๐ Real-time Ingestion Progress", | |
interactive=False, | |
lines=25, | |
value="๐ Ready to start two-step ingestion process...\n\n๐ Steps:\n1๏ธโฃ Load files from GitHub repository\n2๏ธโฃ Generate embeddings and store in vector database", | |
max_lines=30, | |
) | |
# Event handlers | |
def load_files_handler(repo_url: str): | |
if not repo_url.strip(): | |
return ( | |
gr.CheckboxGroup(choices=[], visible=False), | |
"Please enter a repository URL", | |
[], | |
gr.Button(interactive=False), | |
gr.Button(interactive=False), | |
) | |
files, message = fetch_files_with_loader(repo_url) | |
if files: | |
return ( | |
gr.CheckboxGroup( | |
choices=files, | |
value=[], | |
label=f"Select Files from {repo_url} ({len(files)} files)", | |
visible=True, | |
), | |
message, | |
files, | |
gr.Button(interactive=True), # Enable step 1 button | |
gr.Button(interactive=False), # Keep step 2 disabled | |
) | |
else: | |
return ( | |
gr.CheckboxGroup(choices=[], visible=False), | |
message, | |
[], | |
gr.Button(interactive=False), | |
gr.Button(interactive=False), | |
) | |
def start_step1_generator( | |
repo_url: str, selected_files: List[str], current_progress: Dict | |
): | |
"""Start Step 1 with generator-based real-time progress updates""" | |
for progress_update in start_file_loading_generator( | |
repo_url, selected_files, current_progress.copy() | |
): | |
progress_text = format_progress_display(progress_update) | |
step2_enabled = ( | |
progress_update.get("step") == "file_loading_complete" | |
) | |
yield ( | |
progress_update, | |
progress_text, | |
gr.Button(interactive=step2_enabled), | |
) | |
def start_step2(current_progress: Dict): | |
"""Start Step 2: Vector Ingestion""" | |
new_progress = start_vector_ingestion(current_progress.copy()) | |
progress_text = format_progress_display(new_progress) | |
return new_progress, progress_text | |
def refresh_progress(current_progress: Dict): | |
"""Refresh the progress display""" | |
progress_text = format_progress_display(current_progress) | |
return progress_text | |
def reset_progress(): | |
"""Reset all progress""" | |
return ( | |
{}, | |
"Ready to start two-step ingestion process...", | |
gr.Button(interactive=False), | |
) | |
def select_all_handler(available_files): | |
if available_files: | |
return gr.CheckboxGroup(value=available_files) | |
return gr.CheckboxGroup(value=[]) | |
def clear_all_handler(): | |
return gr.CheckboxGroup(value=[]) | |
# Wire up events | |
load_btn.click( | |
fn=load_files_handler, | |
inputs=[repo_input], | |
outputs=[ | |
file_selector, | |
status_output, | |
files_state, | |
step1_btn, | |
step2_btn, | |
], | |
show_api=False, | |
) | |
select_all_btn.click( | |
fn=select_all_handler, | |
inputs=[files_state], | |
outputs=[file_selector], | |
show_api=False, | |
) | |
clear_all_btn.click( | |
fn=clear_all_handler, outputs=[file_selector], show_api=False | |
) | |
step1_btn.click( | |
fn=start_step1_generator, | |
inputs=[repo_input, file_selector, progress_state], | |
outputs=[progress_state, progress_display, step2_btn], | |
show_api=False, | |
) | |
step2_btn.click( | |
fn=start_step2, | |
inputs=[progress_state], | |
outputs=[progress_state, progress_display], | |
show_api=False, | |
) | |
refresh_btn.click( | |
fn=refresh_progress, | |
inputs=[progress_state], | |
outputs=[progress_display], | |
show_api=False, | |
) | |
reset_btn.click( | |
fn=reset_progress, | |
outputs=[progress_state, progress_display, step2_btn], | |
show_api=False, | |
) | |
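
            # The handlers above are wired with show_api=False, so they stay out of
            # the auto-generated API schema; only the handlers given an explicit
            # api_name elsewhere in this app (e.g. "query_documentation",
            # "list_available_docs") are intended to be exposed as callable
            # endpoints / MCP tools when demo.launch(mcp_server=True) runs.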
# ================================ | |
# Tab 2: Query Interface | |
# ================================ | |
with gr.TabItem("๐ค AI Documentation Assistant"): | |
gr.Markdown("### ๐ฌ Intelligent Documentation Q&A") | |
gr.Markdown( | |
"Query your processed documentation using advanced semantic search. Get contextual answers with source citations powered by Nebius LLM and vector similarity search." | |
) | |
with gr.Row(): | |
with gr.Column(scale=2): | |
# Repository selection - Dropdown that becomes textbox when selected | |
with gr.Row(): | |
repo_dropdown = gr.Dropdown( | |
choices=get_available_repositories() | |
or ["No repositories available"], | |
label="๐ Select Documentation Repository", | |
value=None, | |
interactive=True, | |
allow_custom_value=True, | |
info="Choose from available repositories", | |
) | |
# Hidden textbox that will become visible when repo is selected | |
selected_repo_textbox = gr.Textbox( | |
label="๐ฏ Selected Repository", | |
value="", | |
interactive=False, | |
visible=False, | |
info="Currently selected repository for querying", | |
) | |
refresh_repos_btn = gr.Button( | |
"๐ Refresh Repository List", variant="secondary", size="sm" | |
) | |
# Query mode selection | |
query_mode = gr.Radio( | |
choices=["default", "text_search", "hybrid"], | |
label="๐ Search Strategy", | |
value="default", | |
info="โข default: Semantic similarity (AI understanding)\nโข text_search: Keyword matching\nโข hybrid: Combined approach for best results", | |
) | |
# Query input | |
query_input = gr.Textbox( | |
label="๐ญ Ask About Your Documentation", | |
placeholder="How do I implement a custom component? What are the available API endpoints? How to configure the system?", | |
lines=3, | |
info="Ask natural language questions about your documentation", | |
) | |
query_btn = gr.Button( | |
"๐ Search Documentation", variant="primary", size="lg" | |
) | |
# Response display as text area | |
response_output = gr.Textbox( | |
label="๐ค AI Assistant Response", | |
value="Your AI-powered documentation response will appear here with contextual information and source citations...", | |
lines=10, | |
interactive=False, | |
info="Generated using Nebius LLM with retrieved documentation context", | |
) | |
with gr.Column(scale=2): | |
gr.Markdown("### ๐ Source References") | |
gr.Markdown( | |
"View the exact documentation sources used to generate the response, with relevance scores and GitHub links." | |
) | |
# Source nodes display as JSON | |
sources_output = gr.JSON( | |
label="๐ Source Citations & Metadata", | |
value={ | |
"message": "Source documentation excerpts with relevance scores will appear here after your query...", | |
"info": "Each source includes file path, relevance score, and content snippet", | |
}, | |
) | |
# Event handlers | |
def handle_repo_selection(selected_repo): | |
"""Handle repository selection from dropdown""" | |
if not selected_repo or selected_repo in [ | |
"No repositories available", | |
"", | |
]: | |
return ( | |
gr.Dropdown(visible=True), # Keep dropdown visible | |
gr.Textbox(visible=False, value=""), # Hide textbox | |
gr.Button(interactive=False), # Disable query button | |
) | |
else: | |
return ( | |
gr.Dropdown(visible=False), # Hide dropdown | |
gr.Textbox( | |
visible=True, value=selected_repo | |
), # Show textbox with selected repo | |
gr.Button(interactive=True), # Enable query button | |
) | |
def reset_repo_selection(): | |
"""Reset to show dropdown again""" | |
try: | |
repos = get_available_repositories() or [ | |
"No repositories available" | |
] | |
return ( | |
gr.Dropdown( | |
choices=repos, value=None, visible=True | |
), # Show dropdown with refreshed choices | |
gr.Textbox(visible=False, value=""), # Hide textbox | |
gr.Button(interactive=False), # Disable query button | |
) | |
except Exception as e: | |
print(f"Error refreshing repository list: {e}") | |
return ( | |
gr.Dropdown( | |
choices=["Error loading repositories"], | |
value=None, | |
visible=True, | |
), | |
gr.Textbox(visible=False, value=""), | |
gr.Button(interactive=False), | |
) | |
            def get_available_docs_repo():
                """
                List the repositories whose documentation has already been ingested.

                Call this first to discover which repository docs are available to chat with.

                Returns:
                    Updated dropdown populated with the available repositories
                """
                try:
                    repos = get_available_repositories()
                    if not repos:
                        repos = [
                            "No repositories available - Please ingest documentation first"
                        ]
                    return gr.Dropdown(choices=repos, value=None)
                except Exception as e:
                    print(f"Error refreshing repository list: {e}")
                    return gr.Dropdown(
                        choices=["Error loading repositories"], value=None
                    )
# Simple query handler | |
def handle_query(repo: str, mode: str, query: str): | |
""" | |
Handle query request - returns raw data from retriever | |
Args: | |
repo: Selected repository from textbox | |
mode: Query mode (default, text_search, hybrid) | |
query: User's query | |
Returns: | |
Raw result dict from QueryRetriever.make_query() | |
""" | |
if not query.strip(): | |
return {"error": "Please enter a query."} | |
if not repo or repo in [ | |
"No repositories available", | |
"Error loading repositories", | |
"", | |
]: | |
return {"error": "Please select a valid repository."} | |
try: | |
# Create query retriever for the selected repo | |
retriever = QueryRetriever(repo) | |
# Make the query and return raw result | |
result = retriever.make_query(query, mode) | |
return result | |
except Exception as e: | |
print(f"Query error: {e}") | |
traceback.print_exc() | |
return {"error": f"Query failed: {str(e)}"} | |
def make_query(repo: str, mode: str, query: str): | |
""" | |
Retrieve relevant documentation context for a given query using specified retrieval mode. | |
This function is designed to support Retrieval-Augmented Generation (RAG) by extracting | |
the most relevant context chunks from indexed documentation sources. | |
Args: | |
repo: Selected repository from the textbox input | |
mode: Query mode (default, text_search, hybrid) | |
query: User's query | |
Returns: | |
Tuple of (response_text, source_nodes_json) | |
""" | |
# Get raw result | |
result = handle_query(repo, mode, query) | |
# Extract response text | |
if "error" in result: | |
response_text = f"Error: {result['error']}" | |
source_nodes = {"error": result["error"]} | |
else: | |
response_text = result.get("response", "No response available") | |
source_nodes = result.get("source_nodes", []) | |
return response_text, source_nodes | |
# Wire up events | |
# Handle repository selection from dropdown | |
repo_dropdown.change( | |
fn=handle_repo_selection, | |
inputs=[repo_dropdown], | |
outputs=[repo_dropdown, selected_repo_textbox, query_btn], | |
show_api=False, | |
) | |
# Handle refresh button - resets to dropdown view | |
refresh_repos_btn.click( | |
fn=reset_repo_selection, | |
outputs=[repo_dropdown, selected_repo_textbox, query_btn], | |
show_api=False, | |
) | |
# Also provide API endpoint for listing repositories | |
refresh_repos_btn.click( | |
fn=get_available_docs_repo, | |
outputs=[repo_dropdown], | |
api_name="list_available_docs", | |
) | |
# Query button uses the textbox value (not dropdown) | |
query_btn.click( | |
fn=make_query, | |
inputs=[ | |
selected_repo_textbox, | |
query_mode, | |
query_input, | |
], # Use textbox, not dropdown | |
outputs=[response_output, sources_output], | |
api_name="query_documentation", | |
) | |
# Also allow Enter key to trigger query | |
query_input.submit( | |
fn=make_query, | |
inputs=[ | |
selected_repo_textbox, | |
query_mode, | |
query_input, | |
], # Use textbox, not dropdown | |
outputs=[response_output, sources_output], | |
show_api=False, | |
) | |
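
            # Illustrative sketch of calling the exposed endpoint from outside with
            # gradio_client (argument order mirrors the inputs wired above; the
            # Space URL is the one advertised on the About tab):
            #
            #   from gradio_client import Client
            #   client = Client("https://agents-mcp-hackathon-doc-mcp.hf.space/")
            #   answer, sources = client.predict(
            #       "gradio-app/gradio",
            #       "default",
            #       "How do I create a custom component?",
            #       api_name="/query_documentation",
            #   )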
# ================================ | |
# Tab 3: Repository Management | |
# ================================ | |
with gr.TabItem("๐๏ธ Repository Management", visible=ENABLE_REPO_MANAGEMENT): | |
gr.Markdown( | |
"Manage your ingested repositories - view details and delete repositories when needed." | |
) | |
with gr.Row(): | |
with gr.Column(scale=1): | |
gr.Markdown("### ๐ Repository Statistics") | |
stats_display = gr.JSON( | |
label="Database Statistics", | |
value={"message": "Click refresh to load statistics..."}, | |
) | |
refresh_stats_btn = gr.Button( | |
"๐ Refresh Statistics", variant="secondary" | |
) | |
with gr.Column(scale=2): | |
gr.Markdown("### ๐ Repository Details") | |
repos_table = gr.Dataframe( | |
headers=["Repository", "Files", "Last Updated"], | |
datatype=["str", "number", "str"], | |
label="Ingested Repositories", | |
interactive=False, | |
wrap=True, | |
) | |
refresh_repos_btn = gr.Button( | |
"๐ Refresh Repository List", variant="secondary" | |
) | |
gr.Markdown("### ๐๏ธ Delete Repository") | |
gr.Markdown( | |
"**โ ๏ธ Warning:** This will permanently delete all documents and metadata for the selected repository." | |
) | |
with gr.Row(): | |
with gr.Column(scale=2): | |
delete_repo_dropdown = gr.Dropdown( | |
choices=[], | |
label="Select Repository to Delete", | |
value=None, | |
interactive=True, | |
allow_custom_value=False, | |
) | |
# Confirmation checkbox | |
confirm_delete = gr.Checkbox( | |
label="I understand this action cannot be undone", value=False | |
) | |
delete_btn = gr.Button( | |
"๐๏ธ Delete Repository", | |
variant="stop", | |
size="lg", | |
interactive=False, | |
) | |
with gr.Column(scale=1): | |
deletion_status = gr.Textbox( | |
label="Deletion Status", | |
value="Select a repository and confirm to enable deletion.", | |
interactive=False, | |
lines=6, | |
) | |
# Management functions | |
def load_repository_stats(): | |
"""Load overall repository statistics""" | |
try: | |
stats = get_repository_stats() | |
return stats | |
except Exception as e: | |
return {"error": f"Failed to load statistics: {str(e)}"} | |
def load_repository_details(): | |
"""Load detailed repository information as a table""" | |
try: | |
details = get_repo_details() | |
if not details: | |
return [["No repositories found", 0, "N/A"]] | |
# Format for dataframe | |
table_data = [] | |
for repo in details: | |
last_updated = repo.get("last_updated", "Unknown") | |
if hasattr(last_updated, "strftime"): | |
last_updated = last_updated.strftime("%Y-%m-%d %H:%M") | |
elif last_updated != "Unknown": | |
last_updated = str(last_updated) | |
table_data.append( | |
[ | |
repo.get("repo_name", "Unknown"), | |
repo.get("file_count", 0), | |
last_updated, | |
] | |
) | |
return table_data | |
except Exception as e: | |
return [["Error loading repositories", 0, str(e)]] | |
def update_delete_dropdown(): | |
"""Update the dropdown with available repositories""" | |
try: | |
repos = get_available_repositories() | |
return gr.Dropdown(choices=repos, value=None) | |
except Exception as e: | |
print(f"Error updating delete dropdown: {e}") | |
return gr.Dropdown(choices=[], value=None) | |
def check_delete_button_state(repo_selected, confirmation_checked): | |
"""Enable/disable delete button based on selection and confirmation""" | |
if repo_selected and confirmation_checked: | |
return gr.Button(interactive=True) | |
else: | |
return gr.Button(interactive=False) | |
def delete_repository(repo_name: str, confirmed: bool): | |
"""Delete the selected repository""" | |
if not repo_name: | |
return ( | |
"โ No repository selected.", | |
gr.Dropdown(choices=[]), | |
gr.Checkbox(value=False), | |
) | |
if not confirmed: | |
return ( | |
"โ Please confirm deletion by checking the checkbox.", | |
gr.Dropdown(choices=[]), | |
gr.Checkbox(value=False), | |
) | |
try: | |
# Perform deletion | |
result = delete_repository_data(repo_name) | |
# Prepare status message | |
status_msg = result["message"] | |
if result["success"]: | |
status_msg += "\n\n๐ Deletion Summary:" | |
status_msg += f"\n- Vector documents removed: {result['vector_docs_deleted']}" | |
status_msg += f"\n- Repository record deleted: {'Yes' if result['repo_record_deleted'] else 'No'}" | |
status_msg += f"\n\nโ Repository '{repo_name}' has been completely removed." | |
# Update dropdown (remove deleted repo) | |
updated_dropdown = update_delete_dropdown() | |
# Reset confirmation checkbox | |
reset_checkbox = gr.Checkbox(value=False) | |
return status_msg, updated_dropdown, reset_checkbox | |
except Exception as e: | |
error_msg = f"โ Error deleting repository: {str(e)}" | |
return error_msg, gr.Dropdown(choices=[]), gr.Checkbox(value=False) | |
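
            # delete_repository_data() is expected to return a dict carrying at least
            # "success", "message", "vector_docs_deleted" and "repo_record_deleted";
            # the status summary above is assembled from exactly those keys.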
# Wire up management events | |
refresh_stats_btn.click( | |
fn=load_repository_stats, outputs=[stats_display], show_api=False | |
) | |
refresh_repos_btn.click( | |
fn=load_repository_details, outputs=[repos_table], show_api=False | |
) | |
# Update delete dropdown when refreshing repos | |
refresh_repos_btn.click( | |
fn=update_delete_dropdown, | |
outputs=[delete_repo_dropdown], | |
show_api=False, | |
) | |
# Enable/disable delete button based on selection and confirmation | |
delete_repo_dropdown.change( | |
fn=check_delete_button_state, | |
inputs=[delete_repo_dropdown, confirm_delete], | |
outputs=[delete_btn], | |
show_api=False, | |
) | |
confirm_delete.change( | |
fn=check_delete_button_state, | |
inputs=[delete_repo_dropdown, confirm_delete], | |
outputs=[delete_btn], | |
show_api=False, | |
) | |
# Delete repository | |
delete_btn.click( | |
fn=delete_repository, | |
inputs=[delete_repo_dropdown, confirm_delete], | |
outputs=[deletion_status, delete_repo_dropdown, confirm_delete], | |
show_api=False, | |
) | |
# Load data on tab load | |
demo.load(fn=load_repository_stats, outputs=[stats_display], show_api=False) | |
demo.load(fn=load_repository_details, outputs=[repos_table], show_api=False) | |
demo.load( | |
fn=update_delete_dropdown, | |
outputs=[delete_repo_dropdown], | |
show_api=False, | |
) | |
# ================================ | |
# Tab 4: GitHub File Search (Hidden API) | |
# ================================ | |
with gr.TabItem("๐ GitHub File Search", visible=False): | |
gr.Markdown("### ๐ง GitHub Repository File Search API") | |
gr.Markdown( | |
"Pure API endpoints for GitHub file operations - all responses in JSON format" | |
) | |
with gr.Row(): | |
with gr.Column(): | |
gr.Markdown("#### ๐ List Repository Files") | |
# Repository input for file operations | |
api_repo_input = gr.Textbox( | |
label="Repository URL", | |
placeholder="owner/repo or https://github.com/owner/repo", | |
value="", | |
info="GitHub repository to scan", | |
) | |
# Branch selection | |
api_branch_input = gr.Textbox( | |
label="Branch", | |
value="main", | |
placeholder="main", | |
info="Branch to search (default: main)", | |
) | |
# File extensions | |
api_extensions_input = gr.Textbox( | |
label="File Extensions (comma-separated)", | |
value=".md,.mdx", | |
placeholder=".md,.mdx,.txt", | |
info="File extensions to include", | |
) | |
# List files button | |
list_files_btn = gr.Button("๐ List Files", variant="primary") | |
with gr.Column(): | |
gr.Markdown("#### ๐ Get Single File") | |
# Single file inputs | |
single_repo_input = gr.Textbox( | |
label="Repository URL", | |
placeholder="owner/repo or https://github.com/owner/repo", | |
value="", | |
info="GitHub repository", | |
) | |
single_file_input = gr.Textbox( | |
label="File Path", | |
placeholder="docs/README.md", | |
value="", | |
info="Path to specific file in repository", | |
) | |
single_branch_input = gr.Textbox( | |
label="Branch", | |
value="main", | |
placeholder="main", | |
info="Branch name (default: main)", | |
) | |
# Get single file button | |
get_single_btn = gr.Button( | |
"๐ Get Single File", variant="secondary" | |
) | |
with gr.Row(): | |
with gr.Column(): | |
gr.Markdown("#### ๐ Get Multiple Files") | |
# Multiple files inputs | |
multiple_repo_input = gr.Textbox( | |
label="Repository URL", | |
placeholder="owner/repo or https://github.com/owner/repo", | |
value="", | |
info="GitHub repository", | |
) | |
multiple_files_input = gr.Textbox( | |
label="File Paths (comma-separated)", | |
placeholder="README.md,docs/guide.md,api/overview.md", | |
value="", | |
lines=3, | |
info="Comma-separated list of file paths", | |
) | |
multiple_branch_input = gr.Textbox( | |
label="Branch", | |
value="main", | |
placeholder="main", | |
info="Branch name (default: main)", | |
) | |
# Get multiple files button | |
get_multiple_btn = gr.Button( | |
"๐ Get Multiple Files", variant="secondary" | |
) | |
# Single JSON output for all operations | |
gr.Markdown("### ๐ API Response") | |
api_response_output = gr.JSON( | |
label="JSON Response", | |
value={ | |
"message": "API responses will appear here", | |
"info": "Use the buttons above to interact with GitHub repositories", | |
}, | |
) | |
# Pure API Functions (JSON only responses) | |
def list_repository_files( | |
repo_url: str, branch: str = "main", extensions: str = ".md,.mdx" | |
): | |
""" | |
List all files in a GitHub repository with specified extensions | |
Args: | |
repo_url: GitHub repository URL or owner/repo format | |
branch: Branch name to search (default: main) | |
extensions: Comma-separated file extensions (default: .md,.mdx) | |
Returns: | |
JSON response with file list and metadata | |
""" | |
try: | |
if not repo_url.strip(): | |
return {"success": False, "error": "Repository URL is required"} | |
# Parse extensions list | |
ext_list = [ | |
ext.strip() for ext in extensions.split(",") if ext.strip() | |
] | |
if not ext_list: | |
ext_list = [".md", ".mdx"] | |
# Get files list | |
files, status_message = fetch_repository_files( | |
repo_url=repo_url, | |
file_extensions=ext_list, | |
github_token=os.getenv("GITHUB_API_KEY"), | |
branch=branch, | |
) | |
if files: | |
return { | |
"success": True, | |
"repository": repo_url, | |
"branch": branch, | |
"extensions": ext_list, | |
"total_files": len(files), | |
"files": files, | |
"status": status_message, | |
} | |
else: | |
return { | |
"success": False, | |
"repository": repo_url, | |
"branch": branch, | |
"extensions": ext_list, | |
"total_files": 0, | |
"files": [], | |
"error": status_message or "No files found", | |
} | |
except Exception as e: | |
return { | |
"success": False, | |
"error": f"Failed to list files: {str(e)}", | |
"repository": repo_url, | |
"branch": branch, | |
} | |
def get_single_file(repo_url: str, file_path: str, branch: str = "main"): | |
""" | |
Retrieve a single file from GitHub repository | |
Args: | |
repo_url: GitHub repository URL or owner/repo format | |
file_path: Path to the file in the repository | |
branch: Branch name (default: main) | |
Returns: | |
JSON response with file content and metadata | |
""" | |
try: | |
if not repo_url.strip(): | |
return {"success": False, "error": "Repository URL is required"} | |
if not file_path.strip(): | |
return {"success": False, "error": "File path is required"} | |
# Parse repo name | |
if "github.com" in repo_url: | |
repo_name = ( | |
repo_url.replace("https://github.com/", "") | |
.replace("http://github.com/", "") | |
.strip("/") | |
) | |
else: | |
repo_name = repo_url.strip() | |
# Load single file | |
documents, failed = load_github_files( | |
repo_name=repo_name, | |
file_paths=[file_path.strip()], | |
branch=branch, | |
github_token=os.getenv("GITHUB_API_KEY"), | |
) | |
if documents and len(documents) > 0: | |
doc = documents[0] | |
return { | |
"success": True, | |
"repository": repo_name, | |
"branch": branch, | |
"file_path": file_path, | |
"file_name": doc.metadata.get("file_name", ""), | |
"file_size": len(doc.text), | |
"content": doc.text, | |
"metadata": doc.metadata, | |
"url": doc.metadata.get("url", ""), | |
"raw_url": doc.metadata.get("raw_url", ""), | |
} | |
else: | |
error_msg = f"Failed to retrieve file: {failed[0] if failed else 'File not found or access denied'}" | |
return { | |
"success": False, | |
"repository": repo_name, | |
"branch": branch, | |
"file_path": file_path, | |
"error": error_msg, | |
} | |
except Exception as e: | |
return { | |
"success": False, | |
"error": f"Failed to get single file: {str(e)}", | |
"repository": repo_url, | |
"file_path": file_path, | |
"branch": branch, | |
} | |
def get_multiple_files( | |
repo_url: str, file_paths_str: str, branch: str = "main" | |
): | |
""" | |
Retrieve multiple files from GitHub repository | |
Args: | |
repo_url: GitHub repository URL or owner/repo format | |
file_paths_str: Comma-separated string of file paths | |
branch: Branch name (default: main) | |
Returns: | |
JSON response with multiple file contents and metadata | |
""" | |
try: | |
if not repo_url.strip(): | |
return {"success": False, "error": "Repository URL is required"} | |
if not file_paths_str.strip(): | |
return {"success": False, "error": "File paths are required"} | |
# Parse file paths from comma-separated string | |
file_paths = [ | |
path.strip() | |
for path in file_paths_str.split(",") | |
if path.strip() | |
] | |
if not file_paths: | |
return { | |
"success": False, | |
"error": "No valid file paths provided", | |
} | |
# Parse repo name | |
if "github.com" in repo_url: | |
repo_name = ( | |
repo_url.replace("https://github.com/", "") | |
.replace("http://github.com/", "") | |
.strip("/") | |
) | |
else: | |
repo_name = repo_url.strip() | |
# Load multiple files | |
documents, failed = load_github_files( | |
repo_name=repo_name, | |
file_paths=file_paths, | |
branch=branch, | |
github_token=os.getenv("GITHUB_API_KEY"), | |
) | |
# Process successful documents | |
successful_files = [] | |
for doc in documents: | |
file_data = { | |
"file_path": doc.metadata.get("file_path", ""), | |
"file_name": doc.metadata.get("file_name", ""), | |
"file_size": len(doc.text), | |
"content": doc.text, | |
"metadata": doc.metadata, | |
"url": doc.metadata.get("url", ""), | |
"raw_url": doc.metadata.get("raw_url", ""), | |
} | |
successful_files.append(file_data) | |
return { | |
"success": True, | |
"repository": repo_name, | |
"branch": branch, | |
"requested_files": len(file_paths), | |
"successful_files": len(successful_files), | |
"failed_files": len(failed), | |
"files": successful_files, | |
"failed_file_paths": failed, | |
"total_content_size": sum(len(doc.text) for doc in documents), | |
"requested_file_paths": file_paths, | |
} | |
except Exception as e: | |
return { | |
"success": False, | |
"error": f"Failed to get multiple files: {str(e)}", | |
"repository": repo_url, | |
"file_paths": file_paths_str, | |
"branch": branch, | |
} | |
# Wire up the GitHub file search events - all output to single JSON component | |
list_files_btn.click( | |
fn=list_repository_files, | |
inputs=[api_repo_input, api_branch_input, api_extensions_input], | |
outputs=[api_response_output], | |
api_name="list_repository_files", | |
) | |
get_single_btn.click( | |
fn=get_single_file, | |
inputs=[single_repo_input, single_file_input, single_branch_input], | |
outputs=[api_response_output], | |
api_name="get_single_file", | |
) | |
get_multiple_btn.click( | |
fn=get_multiple_files, | |
inputs=[ | |
multiple_repo_input, | |
multiple_files_input, | |
multiple_branch_input, | |
], | |
outputs=[api_response_output], | |
api_name="get_multiple_files", | |
) | |
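
            # These three endpoints are also reachable over the HTTP API, e.g. with
            # gradio_client (values below are placeholders; see the client sketch in
            # the query tab above):
            #
            #   client.predict(
            #       "gradio-app/gradio", "main", ".md,.mdx",
            #       api_name="/list_repository_files",
            #   )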
# ================================ | |
# Tab 5: About & MCP Configuration | |
# ================================ | |
with gr.TabItem("โน๏ธ About & MCP Setup"): | |
gr.Markdown("# ๐ Doc-MCP: Documentation RAG System") | |
gr.Markdown( | |
"**Transform GitHub documentation repositories into accessible MCP servers for AI agents.**" | |
) | |
with gr.Row(): | |
with gr.Column(scale=2): | |
# Project Overview | |
with gr.Accordion("๐ฏ What is Doc-MCP?", open=True): | |
gr.Markdown(""" | |
**Doc-MCP** converts GitHub documentation into AI-queryable knowledge bases via the Model Context Protocol. | |
**๐ Key Features:** | |
- ๐ฅ **GitHub Integration** - Automatic markdown file extraction | |
- ๐ง **AI Embeddings** - Nebius AI-powered vector search | |
- ๐ **Smart Search** - Semantic, keyword & hybrid modes | |
- ๐ค **MCP Server** - Direct AI agent integration | |
- โก **Real-time** - Live processing progress | |
""") | |
# Quick Start Guide | |
with gr.Accordion("๐ Quick Start", open=False): | |
gr.Markdown(""" | |
**1. Ingest Documentation** โ Enter GitHub repo URL โ Select files โ Run 2-step pipeline | |
**2. Query with AI** โ Select repository โ Ask questions โ Get answers with sources | |
**3. Manage Repos** โ View stats โ Delete old repositories | |
**4. Use MCP Tools** โ Configure your AI agent โ Query docs directly from IDE | |
""") | |
with gr.Column(scale=2): | |
# MCP Server Configuration | |
with gr.Accordion("๐ง MCP Server Setup", open=True): | |
gr.Markdown("### ๐ Server URL") | |
# Server URL | |
gr.Textbox( | |
value="https://agents-mcp-hackathon-doc-mcp.hf.space/gradio_api/mcp/sse", | |
label="MCP Endpoint", | |
interactive=False, | |
info="Copy this URL for your MCP client configuration", | |
) | |
gr.Markdown("### โ๏ธ Configuration") | |
# SSE Configuration | |
with gr.Accordion("For Cursor, Windsurf, Cline", open=False): | |
sse_config = """{ | |
"mcpServers": { | |
"doc-mcp": { | |
"url": "https://agents-mcp-hackathon-doc-mcp.hf.space/gradio_api/mcp/sse" | |
} | |
} | |
}""" | |
gr.Code( | |
value=sse_config, | |
label="SSE Configuration", | |
language="json", | |
interactive=False, | |
) | |
# STDIO Configuration | |
with gr.Accordion( | |
"For STDIO Clients (Experimental)", open=False | |
): | |
stdio_config = """{ | |
"mcpServers": { | |
"doc-mcp": { | |
"command": "npx", | |
"args": ["mcp-remote", "https://agents-mcp-hackathon-doc-mcp.hf.space/gradio_api/mcp/sse", "--transport", "sse-only"] | |
} | |
} | |
}""" | |
gr.Code( | |
value=stdio_config, | |
label="STDIO Configuration", | |
language="json", | |
interactive=False, | |
) | |
# MCP Tools Overview | |
with gr.Row(): | |
with gr.Column(): | |
gr.Markdown("### ๐ ๏ธ Available MCP Tools") | |
with gr.Row(): | |
with gr.Column(): | |
gr.Markdown("**๐ Documentation Query Tools**") | |
gr.Markdown( | |
"โข `get_available_docs_repo` - List repositories" | |
) | |
gr.Markdown("โข `make_query` - Search documentation with AI") | |
with gr.Column(): | |
gr.Markdown("**๐ GitHub File Tools**") | |
gr.Markdown("โข `list_repository_files` - Scan repo files") | |
gr.Markdown("โข `get_single_file` - Fetch one file") | |
gr.Markdown("โข `get_multiple_files` - Fetch multiple files") | |
# Technology Stack & Project Info | |
with gr.Row(): | |
with gr.Column(): | |
with gr.Accordion("โ๏ธ Technology Stack", open=False): | |
gr.Markdown("**๐ฅ๏ธ Frontend & API**") | |
gr.Markdown("โข **Gradio** - Web interface & API framework") | |
gr.Markdown("โข **Hugging Face Spaces** - Cloud hosting") | |
gr.Markdown("**๐ค AI & ML**") | |
gr.Markdown("โข **Nebius AI** - LLM & embedding models") | |
gr.Markdown("โข **LlamaIndex** - RAG framework") | |
gr.Markdown("**๐พ Database & Storage**") | |
gr.Markdown("โข **MongoDB Atlas** - Vector database") | |
gr.Markdown("โข **GitHub API** - Source file access") | |
gr.Markdown("**๐ Integration**") | |
gr.Markdown("โข **Model Context Protocol** - AI agent standard") | |
gr.Markdown( | |
"โข **Server-Sent Events** - Real-time communication" | |
) | |
with gr.Column(): | |
with gr.Accordion("๐ฅ Project Information", open=False): | |
gr.Markdown("**๐ MCP Hackathon Project**") | |
gr.Markdown( | |
"Created to showcase AI agent integration with documentation systems." | |
) | |
gr.Markdown("**๐ก Inspiration**") | |
gr.Markdown("โข Making Gradio docs easily searchable") | |
gr.Markdown("โข Leveraging Hugging Face AI ecosystem") | |
gr.Markdown( | |
"โข Improving developer experience with AI assistants" | |
) | |
gr.Markdown("**๐ฎ Future Plans**") | |
gr.Markdown("โข Support for PDF, HTML files") | |
gr.Markdown("โข Multi-language documentation") | |
gr.Markdown("โข Custom embedding fine-tuning") | |
gr.Markdown("**๐ License:** MIT - Free to use and modify") | |
# Usage Examples | |
with gr.Row(): | |
with gr.Column(): | |
with gr.Accordion("๐ก Usage Examples", open=False): | |
gr.Markdown("### Example Workflow") | |
with gr.Row(): | |
with gr.Column(): | |
gr.Markdown("**๐ฅ Step 1: Ingest Docs**") | |
gr.Code( | |
value="1. Enter: gradio-app/gradio\n2. Select markdown files\n3. Run ingestion pipeline", | |
label="Ingestion Process", | |
interactive=False, | |
) | |
with gr.Column(): | |
gr.Markdown("**๐ค Step 2: Query with AI**") | |
gr.Code( | |
value='Query: "How to create custom components?"\nResponse: Detailed answer with source links', | |
label="AI Query Example", | |
interactive=False, | |
) | |
gr.Markdown("### MCP Tool Usage") | |
gr.Code( | |
value="""# In your AI agent: | |
1. Call: get_available_docs_repo() -> ["gradio-app/gradio", ...] | |
2. Call: make_query("gradio-app/gradio", "default", "custom components") | |
3. Get: AI response + source citations""", | |
label="MCP Integration Example", | |
language="python", | |
interactive=False, | |
) | |
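

# mcp_server=True serves the MCP endpoint (.../gradio_api/mcp/sse, the URL shown
# on the About & MCP Setup tab) alongside the regular web UI.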
if __name__ == "__main__":
    demo.launch(mcp_server=True)