import asyncio import base64 import logging import re from pathlib import Path from typing import Dict, List, Optional, Tuple import aiohttp import requests from llama_index.core.schema import Document logger = logging.getLogger(__name__) class GithubFileLoader: """ GitHub file loader that fetches specific files asynchronously. Returns LlamaIndex Document objects for each successfully loaded file. """ def __init__( self, github_token: Optional[str] = None, concurrent_requests: int = 10, timeout: int = 30, retries: int = 3, ): """ Initialize GitHub file loader. Args: github_token: GitHub API token for higher rate limits concurrent_requests: Number of concurrent requests timeout: Request timeout in seconds retries: Number of retry attempts for failed requests """ self.github_token = github_token self.concurrent_requests = concurrent_requests self.timeout = timeout self.retries = retries # Setup headers self.headers = { "Accept": "application/vnd.github.v3+json", "User-Agent": "LlamaIndex-GitHub-Loader/1.0", } if self.github_token: self.headers["Authorization"] = f"token {self.github_token}" def fetch_repository_files( self, repo_url: str, file_extensions: List[str] = [".md", ".mdx"], branch: str = "main", ) -> Tuple[List[str], str]: """ Fetch files from GitHub repository using GitHub API Args: repo_url: GitHub repository URL or owner/repo format file_extensions: List of file extensions to filter (e.g., [".md", ".mdx", ".txt"]) branch: Branch name to fetch from Returns: Tuple of (list_of_file_paths, status_message) """ try: # Parse GitHub URL to extract owner and repo repo_name = self._parse_repo_name(repo_url) if not repo_name: return ( [], "Invalid GitHub URL format. Use: https://github.com/owner/repo or owner/repo", ) # GitHub API endpoint for repository tree api_url = f"https://api.github.com/repos/{repo_name}/git/trees/{branch}?recursive=1" # Make request with authentication if token is available response = requests.get(api_url, headers=self.headers, timeout=self.timeout) if response.status_code == 200: data = response.json() filtered_files = [] # Filter for specified file extensions for item in data.get("tree", []): if item["type"] == "blob": file_path = item["path"] # Check if file has any of the specified extensions if any( file_path.lower().endswith(ext.lower()) for ext in file_extensions ): filtered_files.append(file_path) if filtered_files: ext_str = ", ".join(file_extensions) return ( filtered_files, f"Found {len(filtered_files)} files with extensions ({ext_str}) in {repo_name}/{branch}", ) else: ext_str = ", ".join(file_extensions) return ( [], f"No files with extensions ({ext_str}) found in repository {repo_name}/{branch}", ) elif response.status_code == 404: return ( [], f"Repository '{repo_name}' not found or branch '{branch}' doesn't exist", ) elif response.status_code == 403: if "rate limit" in response.text.lower(): return ( [], "GitHub API rate limit exceeded. Consider using a GitHub token.", ) else: return ( [], "Access denied. Repository may be private or require authentication.", ) else: return ( [], f"GitHub API Error: {response.status_code} - {response.text[:200]}", ) except requests.exceptions.Timeout: return [], f"Request timeout after {self.timeout} seconds" except requests.exceptions.RequestException as e: return [], f"Network error: {str(e)}" except Exception as e: return [], f"Unexpected error: {str(e)}" def _parse_repo_name(self, repo_url: str) -> Optional[str]: """ Parse repository URL to extract owner/repo format Args: repo_url: GitHub repository URL or owner/repo format Returns: Repository name in "owner/repo" format or None if invalid """ if "github.com" in repo_url: # Extract from full URL parts = ( repo_url.replace("https://github.com/", "") .replace("http://github.com/", "") .strip("/") .split("/") ) if len(parts) >= 2: return f"{parts[0]}/{parts[1]}" else: # Assume format is owner/repo parts = repo_url.strip().split("/") if len(parts) == 2 and all(part.strip() for part in parts): return repo_url.strip() return None def fetch_markdown_files( self, repo_url: str, branch: str = "main" ) -> Tuple[List[str], str]: """ Fetch markdown files from GitHub repository (backward compatibility method) Args: repo_url: GitHub repository URL or owner/repo format branch: Branch name to fetch from Returns: Tuple of (list_of_markdown_files, status_message) """ return self.fetch_repository_files( repo_url=repo_url, file_extensions=[".md", ".mdx"], branch=branch ) async def load_files( self, repo_name: str, file_paths: List[str], branch: str = "main" ) -> Tuple[List[Document], List[str]]: """ Load files from GitHub repository asynchronously. Args: repo_name: Repository name in format "owner/repo" file_paths: List of file paths to load branch: Branch name to load from Returns: Tuple of (successfully_loaded_documents, failed_file_paths) """ if not file_paths: return [], [] # Validate repo name format if not re.match(r"^[^/]+/[^/]+$", repo_name): raise ValueError(f"Invalid repo format: {repo_name}. Expected 'owner/repo'") # Create semaphore to limit concurrent requests semaphore = asyncio.Semaphore(self.concurrent_requests) # Create session connector = aiohttp.TCPConnector(limit=self.concurrent_requests) timeout_config = aiohttp.ClientTimeout(total=self.timeout) async with aiohttp.ClientSession( headers=self.headers, connector=connector, timeout=timeout_config ) as session: # Create tasks for all files tasks = [] for file_path in file_paths: task = asyncio.create_task( self._fetch_file_with_retry( session, semaphore, repo_name, file_path, branch ) ) tasks.append(task) # Wait for all tasks to complete results = await asyncio.gather(*tasks, return_exceptions=True) # Process results documents = [] failed_files = [] for i, result in enumerate(results): file_path = file_paths[i] if isinstance(result, Exception): logger.error(f"Failed to load {file_path}: {result}") failed_files.append(file_path) elif result is None: logger.warning(f"No content returned for {file_path}") failed_files.append(file_path) else: documents.append(result) logger.info( f"Successfully loaded {len(documents)} files, failed: {len(failed_files)}" ) return documents, failed_files async def _fetch_file_with_retry( self, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, repo_name: str, file_path: str, branch: str, ) -> Optional[Document]: """Fetch a single file with retry logic.""" async with semaphore: for attempt in range(self.retries + 1): try: return await self._fetch_single_file( session, repo_name, file_path, branch ) except Exception as e: if attempt == self.retries: logger.error( f"Failed to fetch {file_path} after {self.retries + 1} attempts: {e}" ) raise else: logger.warning( f"Attempt {attempt + 1} failed for {file_path}: {e}" ) await asyncio.sleep(2**attempt) # Exponential backoff return None async def _fetch_single_file( self, session: aiohttp.ClientSession, repo_name: str, file_path: str, branch: str, ) -> Document: """Fetch a single file from GitHub API.""" # Clean file path clean_path = file_path.strip("/") # Build API URL api_url = f"https://api.github.com/repos/{repo_name}/contents/{clean_path}" params = {"ref": branch} logger.debug(f"Fetching: {api_url}") async with session.get(api_url, params=params) as response: if response.status == 404: raise FileNotFoundError(f"File not found: {file_path}") elif response.status == 403: raise PermissionError("API rate limit exceeded or access denied") elif response.status != 200: raise Exception(f"HTTP {response.status}: {await response.text()}") data = await response.json() # Handle directory case if isinstance(data, list): raise ValueError(f"Path {file_path} is a directory, not a file") # Decode file content if data.get("encoding") == "base64": try: content_bytes = base64.b64decode(data["content"]) content_text = content_bytes.decode("utf-8") except Exception as e: logger.warning(f"Failed to decode {file_path}: {e}") # Try to decode as latin-1 as fallback content_text = content_bytes.decode("latin-1", errors="ignore") else: raise ValueError(f"Unsupported encoding: {data.get('encoding')}") # Create Document document = self._create_document( content=content_text, file_path=clean_path, repo_name=repo_name, branch=branch, file_data=data, ) return document def _create_document( self, content: str, file_path: str, repo_name: str, branch: str, file_data: Dict ) -> Document: """Create a LlamaIndex Document from file content and metadata.""" # Extract file info filename = Path(file_path).name file_extension = Path(file_path).suffix.lower() directory = ( str(Path(file_path).parent) if Path(file_path).parent != Path(".") else "" ) # Build URLs html_url = f"https://github.com/{repo_name}/blob/{branch}/{file_path}" raw_url = file_data.get("download_url", "") # Create metadata metadata = { "file_path": file_path, "file_name": filename, "file_extension": file_extension, "directory": directory, "repo": repo_name, "branch": branch, "sha": file_data.get("sha", ""), "size": file_data.get("size", 0), "url": html_url, "raw_url": raw_url, "type": file_data.get("type", "file"), } # Create document with unique ID doc_id = f"{repo_name}:{branch}:{file_path}" document = Document( text=content, doc_id=doc_id, metadata=metadata, # For backward compatibility ) return document def load_files_sync( self, repo_name: str, file_paths: List[str], branch: str = "main" ) -> Tuple[List[Document], List[str]]: """ Synchronous wrapper for load_files. Args: repo_name: Repository name in format "owner/repo" file_paths: List of file paths to load branch: Branch name to load from Returns: Tuple of (successfully_loaded_documents, failed_file_paths) """ return asyncio.run(self.load_files(repo_name, file_paths, branch)) # Convenience functions async def load_github_files_async( repo_name: str, file_paths: List[str], branch: str = "main", github_token: Optional[str] = None, concurrent_requests: int = 10, ) -> Tuple[List[Document], List[str]]: """ Convenience function to load GitHub files asynchronously. Args: repo_name: Repository name in format "owner/repo" file_paths: List of file paths to load branch: Branch name to load from github_token: GitHub API token concurrent_requests: Number of concurrent requests Returns: Tuple of (documents, failed_files) """ loader = GithubFileLoader( github_token=github_token, concurrent_requests=concurrent_requests ) return await loader.load_files(repo_name, file_paths, branch) def load_github_files( repo_name: str, file_paths: List[str], branch: str = "main", github_token: Optional[str] = None, concurrent_requests: int = 10, ) -> Tuple[List[Document], List[str]]: """ Convenience function to load GitHub files synchronously. Args: repo_name: Repository name in format "owner/repo" file_paths: List of file paths to load branch: Branch name to load from github_token: GitHub API token concurrent_requests: Number of concurrent requests Returns: Tuple of (documents, failed_files) """ loader = GithubFileLoader( github_token=github_token, concurrent_requests=concurrent_requests ) return loader.load_files_sync(repo_name, file_paths, branch) def fetch_markdown_files( repo_url: str, github_token: Optional[str] = None, branch: str = "main" ) -> Tuple[List[str], str]: """ Convenience function to fetch markdown files from GitHub repository Args: repo_url: GitHub repository URL or owner/repo format github_token: GitHub API token for higher rate limits branch: Branch name to fetch from Returns: Tuple of (list_of_files, status_message) """ loader = GithubFileLoader(github_token=github_token) return loader.fetch_markdown_files(repo_url, branch) def fetch_repository_files( repo_url: str, file_extensions: List[str] = [".md", ".mdx"], github_token: Optional[str] = None, branch: str = "main", ) -> Tuple[List[str], str]: """ Convenience function to fetch files with specific extensions from GitHub repository Args: repo_url: GitHub repository URL or owner/repo format file_extensions: List of file extensions to filter github_token: GitHub API token for higher rate limits branch: Branch name to fetch from Returns: Tuple of (list_of_files, status_message) """ loader = GithubFileLoader(github_token=github_token) return loader.fetch_repository_files(repo_url, file_extensions, branch) # Example usage if __name__ == "__main__": # Example file paths file_paths = [ "docs/contribute/docs.mdx", "docs/contribute/ml-handlers.mdx", "docs/contribute/community.mdx", "docs/contribute/python-coding-standards.mdx", "docs/features/data-integrations.mdx", "docs/features/ai-integrations.mdx", "docs/integrations/ai-engines/langchain_embedding.mdx", "docs/integrations/ai-engines/langchain.mdx", "docs/integrations/ai-engines/google_gemini.mdx", "docs/integrations/ai-engines/anomaly.mdx", "docs/integrations/ai-engines/amazon-bedrock.mdx", ] # Load files synchronously documents, failed = load_github_files( repo_name="mindsdb/mindsdb", file_paths=file_paths, branch="main", # Optional ) print(f"Loaded {len(documents)} documents") print(f"Failed to load {len(failed)} files: {failed}") # Print first document info if documents: doc = documents[0] print("\nFirst document:") print(f"ID: {doc.doc_id}") print(f"File: {doc.metadata['file_path']}") print(f"Size: {len(doc.text)} characters") print(f"Content preview: {doc.text[:200]}...")