mdabidhussain committed on
Commit
3fcc667
·
1 Parent(s): 63c6853

added tools for github file listing and retrieval

Browse files
Files changed (1) hide show
  1. app.py +345 -9
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import asyncio
2
  import os
3
  import time
 
4
  from typing import Dict, List
5
 
6
  import gradio as gr
@@ -8,10 +9,14 @@ from dotenv import load_dotenv
8
  from llama_index.core import Settings
9
  from llama_index.core.text_splitter import SentenceSplitter
10
 
11
- from rag.config import embed_model, get_available_repos, llm
 
 
12
  from rag.github_file_loader import \
13
- fetch_markdown_files as fetch_files_with_loader, load_github_files
 
14
  from rag.ingest import ingest_documents_async
 
15
 
16
  load_dotenv()
17
 
@@ -922,8 +927,6 @@ with gr.Blocks(title="Doc-MCP") as demo:
922
  return {"error": "Please select a valid repository."}
923
 
924
  try:
925
- # Import QueryRetriever here to avoid circular imports
926
- from rag.query import QueryRetriever
927
 
928
  # Create query retriever for the selected repo
929
  retriever = QueryRetriever(repo)
@@ -934,8 +937,6 @@ with gr.Blocks(title="Doc-MCP") as demo:
934
 
935
  except Exception as e:
936
  print(f"Query error: {e}")
937
- import traceback
938
-
939
  traceback.print_exc()
940
  return {"error": f"Query failed: {str(e)}"}
941
 
@@ -1069,7 +1070,7 @@ with gr.Blocks(title="Doc-MCP") as demo:
1069
  def load_repository_stats():
1070
  """Load overall repository statistics"""
1071
  try:
1072
- from rag.config import get_repository_stats
1073
  stats = get_repository_stats()
1074
  return stats
1075
  except Exception as e:
@@ -1078,7 +1079,7 @@ with gr.Blocks(title="Doc-MCP") as demo:
1078
  def load_repository_details():
1079
  """Load detailed repository information as a table"""
1080
  try:
1081
- from rag.config import get_repo_details
1082
  details = get_repo_details()
1083
 
1084
  if not details:
@@ -1129,7 +1130,7 @@ with gr.Blocks(title="Doc-MCP") as demo:
1129
  return "❌ Please confirm deletion by checking the checkbox.", gr.Dropdown(choices=[]), gr.Checkbox(value=False)
1130
 
1131
  try:
1132
- from rag.config import delete_repository_data
1133
 
1134
  # Perform deletion
1135
  result = delete_repository_data(repo_name)
@@ -1216,6 +1217,341 @@ with gr.Blocks(title="Doc-MCP") as demo:
1216
  show_api=False
1217
  )
1218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1220
  if __name__ == "__main__":
1221
  demo.launch(mcp_server=True)
 
1
  import asyncio
2
  import os
3
  import time
4
+ import traceback
5
  from typing import Dict, List
6
 
7
  import gradio as gr
 
9
  from llama_index.core import Settings
10
  from llama_index.core.text_splitter import SentenceSplitter
11
 
12
+ from rag.config import (delete_repository_data, embed_model,
13
+ get_available_repos, get_repo_details,
14
+ get_repository_stats, llm)
15
  from rag.github_file_loader import \
16
+ fetch_markdown_files as fetch_files_with_loader
17
+ from rag.github_file_loader import fetch_repository_files, load_github_files
18
  from rag.ingest import ingest_documents_async
19
+ from rag.query import QueryRetriever
20
 
21
  load_dotenv()
22
 
 
927
  return {"error": "Please select a valid repository."}
928
 
929
  try:
 
 
930
 
931
  # Create query retriever for the selected repo
932
  retriever = QueryRetriever(repo)
 
937
 
938
  except Exception as e:
939
  print(f"Query error: {e}")
 
 
940
  traceback.print_exc()
941
  return {"error": f"Query failed: {str(e)}"}
942
 
 
1070
  def load_repository_stats():
1071
  """Load overall repository statistics"""
1072
  try:
1073
+
1074
  stats = get_repository_stats()
1075
  return stats
1076
  except Exception as e:
 
1079
  def load_repository_details():
1080
  """Load detailed repository information as a table"""
1081
  try:
1082
+
1083
  details = get_repo_details()
1084
 
1085
  if not details:
 
1130
  return "❌ Please confirm deletion by checking the checkbox.", gr.Dropdown(choices=[]), gr.Checkbox(value=False)
1131
 
1132
  try:
1133
+
1134
 
1135
  # Perform deletion
1136
  result = delete_repository_data(repo_name)
 
1217
  show_api=False
1218
  )
1219
 
1220
+ # ================================
1221
+ # Tab 4: GitHub File Search (Hidden API)
1222
+ # ================================
1223
+ with gr.TabItem("🔍 GitHub File Search", visible=False):
1224
+ gr.Markdown("### 🔧 GitHub Repository File Search API")
1225
+ gr.Markdown("Pure API endpoints for GitHub file operations - all responses in JSON format")
1226
+
1227
+ with gr.Row():
1228
+ with gr.Column():
1229
+ gr.Markdown("#### 📋 List Repository Files")
1230
+
1231
+ # Repository input for file operations
1232
+ api_repo_input = gr.Textbox(
1233
+ label="Repository URL",
1234
+ placeholder="owner/repo or https://github.com/owner/repo",
1235
+ value="",
1236
+ info="GitHub repository to scan"
1237
+ )
1238
+
1239
+ # Branch selection
1240
+ api_branch_input = gr.Textbox(
1241
+ label="Branch",
1242
+ value="main",
1243
+ placeholder="main",
1244
+ info="Branch to search (default: main)"
1245
+ )
1246
+
1247
+ # File extensions
1248
+ api_extensions_input = gr.Textbox(
1249
+ label="File Extensions (comma-separated)",
1250
+ value=".md,.mdx",
1251
+ placeholder=".md,.mdx,.txt",
1252
+ info="File extensions to include"
1253
+ )
1254
+
1255
+ # List files button
1256
+ list_files_btn = gr.Button("📋 List Files", variant="primary")
1257
+
1258
+ with gr.Column():
1259
+ gr.Markdown("#### 📄 Get Single File")
1260
+
1261
+ # Single file inputs
1262
+ single_repo_input = gr.Textbox(
1263
+ label="Repository URL",
1264
+ placeholder="owner/repo or https://github.com/owner/repo",
1265
+ value="",
1266
+ info="GitHub repository"
1267
+ )
1268
+
1269
+ single_file_input = gr.Textbox(
1270
+ label="File Path",
1271
+ placeholder="docs/README.md",
1272
+ value="",
1273
+ info="Path to specific file in repository"
1274
+ )
1275
+
1276
+ single_branch_input = gr.Textbox(
1277
+ label="Branch",
1278
+ value="main",
1279
+ placeholder="main",
1280
+ info="Branch name (default: main)"
1281
+ )
1282
+
1283
+ # Get single file button
1284
+ get_single_btn = gr.Button("📄 Get Single File", variant="secondary")
1285
+
1286
+ with gr.Row():
1287
+ with gr.Column():
1288
+ gr.Markdown("#### 📚 Get Multiple Files")
1289
+
1290
+ # Multiple files inputs
1291
+ multiple_repo_input = gr.Textbox(
1292
+ label="Repository URL",
1293
+ placeholder="owner/repo or https://github.com/owner/repo",
1294
+ value="",
1295
+ info="GitHub repository"
1296
+ )
1297
+
1298
+ multiple_files_input = gr.Textbox(
1299
+ label="File Paths (comma-separated)",
1300
+ placeholder="README.md,docs/guide.md,api/overview.md",
1301
+ value="",
1302
+ lines=3,
1303
+ info="Comma-separated list of file paths"
1304
+ )
1305
+
1306
+ multiple_branch_input = gr.Textbox(
1307
+ label="Branch",
1308
+ value="main",
1309
+ placeholder="main",
1310
+ info="Branch name (default: main)"
1311
+ )
1312
+
1313
+ # Get multiple files button
1314
+ get_multiple_btn = gr.Button("📚 Get Multiple Files", variant="secondary")
1315
+
1316
+ # Single JSON output for all operations
1317
+ gr.Markdown("### 📊 API Response")
1318
+ api_response_output = gr.JSON(
1319
+ label="JSON Response",
1320
+ value={
1321
+ "message": "API responses will appear here",
1322
+ "info": "Use the buttons above to interact with GitHub repositories"
1323
+ }
1324
+ )
1325
 
1326
+ # Pure API Functions (JSON only responses)
1327
def list_repository_files(repo_url: str, branch: str = "main", extensions: str = ".md,.mdx"):
    """
    List all files in a GitHub repository with specified extensions

    Args:
        repo_url: GitHub repository URL or owner/repo format
        branch: Branch name to search (default: main)
        extensions: Comma-separated file extensions (default: .md,.mdx)

    Returns:
        JSON response with file list and metadata. "success" is True only
        when at least one file was found; otherwise an "error" message is
        included. Failures are reported in the response, never raised.
    """
    # Callers may send None, empty or whitespace-padded values. Normalise them
    # once so validation, the fetch call and the echoed response fields agree.
    repo_url = "" if repo_url is None else str(repo_url).strip()
    # A cleared branch field falls back to the documented default.
    branch = ("" if branch is None else str(branch).strip()) or "main"
    extensions = "" if extensions is None else str(extensions)

    if not repo_url:
        return {"success": False, "error": "Repository URL is required"}

    # Drop empty entries (e.g. "md,,mdx" or a trailing comma); if nothing
    # usable remains, use the same defaults as the signature.
    ext_list = [ext.strip() for ext in extensions.split(",") if ext.strip()]
    if not ext_list:
        ext_list = [".md", ".mdx"]

    try:
        # fetch_repository_files hands back the matching files plus a
        # human-readable status message.
        files, status_message = fetch_repository_files(
            repo_url=repo_url,
            file_extensions=ext_list,
            github_token=os.getenv("GITHUB_API_KEY"),
            branch=branch,
        )

        if files:
            return {
                "success": True,
                "repository": repo_url,
                "branch": branch,
                "extensions": ext_list,
                "total_files": len(files),
                "files": files,
                "status": status_message,
            }

        # Nothing matched (or the fetch reported a problem): surface the
        # status message as the error, with a generic fallback.
        return {
            "success": False,
            "repository": repo_url,
            "branch": branch,
            "extensions": ext_list,
            "total_files": 0,
            "files": [],
            "error": status_message or "No files found",
        }

    except Exception as e:
        # API boundary: convert any failure into a JSON error payload.
        return {
            "success": False,
            "error": f"Failed to list files: {str(e)}",
            "repository": repo_url,
            "branch": branch,
        }
1385
+
1386
def _text_or_empty(value) -> str:
    """Return value as stripped text, mapping None to an empty string."""
    return "" if value is None else str(value).strip()


def _normalize_github_repo_name(repo_url: str) -> str:
    """Reduce a GitHub URL or an owner/repo string to plain owner/repo."""
    cleaned = repo_url.strip()

    # Peel off an optional scheme and "www." so the host check below also
    # covers "github.com/owner/repo" and "https://www.github.com/owner/repo".
    remainder = cleaned
    for scheme in ("https://", "http://"):
        if remainder.lower().startswith(scheme):
            remainder = remainder[len(scheme):]
            break
    if remainder.lower().startswith("www."):
        remainder = remainder[len("www."):]

    host = "github.com/"
    if remainder.lower().startswith(host):
        # URL form: ignore any query/fragment and keep only owner and repo,
        # dropping trailing parts such as /tree/<branch>/...
        path = remainder[len(host):].split("?", 1)[0].split("#", 1)[0]
        segments = [part for part in path.split("/") if part]
        repo_name = "/".join(segments[:2])
    else:
        # Not a github.com URL: treat the input as owner/repo and pass it on.
        repo_name = cleaned.strip("/")

    # Clone URLs end in ".git", which is not part of the repository name.
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
    return repo_name


def get_single_file(repo_url: str, file_path: str, branch: str = "main"):
    """
    Retrieve a single file from GitHub repository

    Args:
        repo_url: GitHub repository URL or owner/repo format
        file_path: Path to the file in the repository
        branch: Branch name (default: main)

    Returns:
        JSON response with file content and metadata. "success" is False
        (with an "error" message) when an input is missing or the file could
        not be loaded. Failures are reported in the response, never raised.
    """
    # Normalise once so validation, the loader call and the echoed response
    # fields all use the same values.
    repo_url = _text_or_empty(repo_url)
    file_path = _text_or_empty(file_path)
    # A cleared branch field falls back to the documented default.
    branch = _text_or_empty(branch) or "main"

    if not repo_url:
        return {"success": False, "error": "Repository URL is required"}

    if not file_path:
        return {"success": False, "error": "File path is required"}

    repo_name = _normalize_github_repo_name(repo_url)

    try:
        # The loader returns the loaded documents and the entries that failed.
        documents, failed = load_github_files(
            repo_name=repo_name,
            file_paths=[file_path],
            branch=branch,
            github_token=os.getenv("GITHUB_API_KEY"),
        )

        if documents:
            doc = documents[0]
            return {
                "success": True,
                "repository": repo_name,
                "branch": branch,
                "file_path": file_path,
                "file_name": doc.metadata.get("file_name", ""),
                "file_size": len(doc.text),
                "content": doc.text,
                "metadata": doc.metadata,
                "url": doc.metadata.get("url", ""),
                "raw_url": doc.metadata.get("raw_url", ""),
            }

        # Prefer the loader's own failure entry; otherwise use a generic reason.
        error_msg = f"Failed to retrieve file: {failed[0] if failed else 'File not found or access denied'}"
        return {
            "success": False,
            "repository": repo_name,
            "branch": branch,
            "file_path": file_path,
            "error": error_msg,
        }

    except Exception as e:
        # API boundary: convert any failure into a JSON error payload.
        return {
            "success": False,
            "error": f"Failed to get single file: {str(e)}",
            "repository": repo_name,
            "file_path": file_path,
            "branch": branch,
        }


def get_multiple_files(repo_url: str, file_paths_str: str, branch: str = "main"):
    """
    Retrieve multiple files from GitHub repository

    Args:
        repo_url: GitHub repository URL or owner/repo format
        file_paths_str: Comma-separated string of file paths
        branch: Branch name (default: main)

    Returns:
        JSON response with multiple file contents and metadata. "success" is
        True when at least one requested file was retrieved; per-file
        failures are listed under "failed_file_paths". Failures are reported
        in the response, never raised.
    """
    repo_url = _text_or_empty(repo_url)
    file_paths_str = _text_or_empty(file_paths_str)
    # A cleared branch field falls back to the documented default.
    branch = _text_or_empty(branch) or "main"

    if not repo_url:
        return {"success": False, "error": "Repository URL is required"}

    if not file_paths_str:
        return {"success": False, "error": "File paths are required"}

    # Split on commas and discard blank entries (e.g. "a.md,,b.md").
    file_paths = [path.strip() for path in file_paths_str.split(",") if path.strip()]

    if not file_paths:
        return {"success": False, "error": "No valid file paths provided"}

    repo_name = _normalize_github_repo_name(repo_url)

    try:
        # The loader returns the loaded documents and the entries that failed.
        documents, failed = load_github_files(
            repo_name=repo_name,
            file_paths=file_paths,
            branch=branch,
            github_token=os.getenv("GITHUB_API_KEY"),
        )

        successful_files = [
            {
                "file_path": doc.metadata.get("file_path", ""),
                "file_name": doc.metadata.get("file_name", ""),
                "file_size": len(doc.text),
                "content": doc.text,
                "metadata": doc.metadata,
                "url": doc.metadata.get("url", ""),
                "raw_url": doc.metadata.get("raw_url", ""),
            }
            for doc in documents
        ]

        response = {
            # Matches the sibling endpoints: retrieving nothing is not success.
            "success": bool(successful_files),
            "repository": repo_name,
            "branch": branch,
            "requested_files": len(file_paths),
            "successful_files": len(successful_files),
            "failed_files": len(failed),
            "files": successful_files,
            "failed_file_paths": failed,
            "total_content_size": sum(item["file_size"] for item in successful_files),
            "requested_file_paths": file_paths,
        }
        if not successful_files:
            response["error"] = "None of the requested files could be retrieved"
        return response

    except Exception as e:
        # API boundary: convert any failure into a JSON error payload.
        return {
            "success": False,
            "error": f"Failed to get multiple files: {str(e)}",
            "repository": repo_name,
            "file_paths": file_paths_str,
            "branch": branch,
        }
1534
+
1535
# Connect each GitHub file-search button to its handler. Every handler writes
# into the same JSON component, and each registration gets its own api_name.
list_files_btn.click(
    list_repository_files,
    [api_repo_input, api_branch_input, api_extensions_input],
    [api_response_output],
    api_name="list_repository_files",
)

get_single_btn.click(
    get_single_file,
    [single_repo_input, single_file_input, single_branch_input],
    [api_response_output],
    api_name="get_single_file",
)

get_multiple_btn.click(
    get_multiple_files,
    [multiple_repo_input, multiple_files_input, multiple_branch_input],
    [api_response_output],
    api_name="get_multiple_files",
)
1556
  if __name__ == "__main__":
1557
  demo.launch(mcp_server=True)