import os
import pathlib
import shutil
import subprocess
import tempfile

from loguru import logger

from llm_engineering.domain.documents import RepositoryDocument

from .base import BaseCrawler


class GithubCrawler(BaseCrawler):
    """Crawler that clones a Git repository and stores its text files.

    The repository is cloned into a temporary directory, walked, and every
    file that passes the include/ignore filters is collected into a
    ``{relative_path: content}`` mapping. The mapping is persisted through
    ``self.model`` in one or more documents, each kept under a size budget.
    """

    model = RepositoryDocument

    def __init__(
        self,
        include=(
            ".txt",
            ".md",
            ".rst",
            ".json",
            ".yml",
            ".yaml",
            ".xml",
            ".html",
            ".csv",
            ".py",
            ".sh",
            ".cfg",
            ".conf",
            ".js",
            ".css",
            ".scss",
            ".cpp",
            ".hpp",
            ".h",
            ".cc",
            ".hh",
            ".cmake",
            ".bat",
            ".rb",
            ".bash",
            ".qml",
            ".proto",
            ".properties",
            ".template",
            ".in",
            ".inc",
            ".pyi",
            ".typed",
        ),
        ignore=(
            ".git",
            ".toml",
            ".lock",
            ".png",
            ".gitignore",
            ".ico",
            ".jpg",
            ".jpeg",
            ".webp",
            ".svg",
            ".gif",
            ".stl",
            ".dae",
            ".jar",
            ".pdf",
        ),
    ) -> None:
        """Configure which files are collected.

        Args:
            include: File-name suffixes a file must end with to be collected.
            ignore: Strings used twice: as file-name suffixes to skip, and as
                prefixes of the repository-relative directory path to skip.
        """
        super().__init__()

        self._ignore = ignore
        self._include = include

    def extract(self, link: str, **kwargs) -> None:
        """Clone the repository at ``link`` and save its text files.

        Does nothing (besides logging) when ``self.model.find(link=link)``
        already returns a document.

        Args:
            link: URL passed verbatim to ``git clone``. Its last path segment
                is used as the repository name.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            Exception: Any error raised while cloning or walking the
                repository is logged and re-raised. Per-file read errors and
                save errors are logged and swallowed.
        """
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Repository already exists in the database: {link}")

            return

        logger.info(f"Starting scrapping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        # str.startswith / str.endswith need real tuples; build them once
        # instead of once per directory and per file.
        ignore = tuple(self._ignore)
        include = tuple(self._include)

        local_temp = tempfile.mkdtemp()

        # Extensions seen among the collected files (value is always 1).
        # NOTE(review): this looks like it may have been meant as a counter - confirm.
        file_types = {}

        try:
            # Run git inside the temp dir via ``cwd`` rather than os.chdir():
            # chdir would change the working directory of the whole process
            # and leave it pointing at a directory that is deleted below.
            subprocess.run(["git", "clone", link], check=True, cwd=local_temp)

            # The clone is the only entry in the freshly created temp dir.
            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118

            tree = {}
            current_size = 0
            # Per-document size budget: roughly 16 MB with 100000 bytes of
            # headroom. NOTE(review): presumably tied to the storage
            # backend's document size limit - confirm.
            max_size = 16793598 - 100000  # 16 MB in bytes

            for root, dirs, files in os.walk(repo_path):
                # Path of the current directory relative to the repo root;
                # the root itself is represented as "" so top-level files are
                # stored under their bare file name.
                rel_dir = os.path.relpath(root, repo_path)
                if rel_dir == os.curdir:
                    rel_dir = ""

                if rel_dir.startswith(ignore):
                    # Every descendant shares this ignored prefix, so stop
                    # os.walk from descending (e.g. into .git objects).
                    dirs.clear()
                    continue

                for file in files:
                    if file.endswith(ignore) or file.startswith("."):
                        continue
                    if not file.endswith(include):
                        continue

                    file_path = os.path.join(rel_dir, file)  # noqa: PTH118
                    full_file_path = os.path.join(root, file)  # noqa: PTH118

                    try:
                        # Decode explicitly as UTF-8 (undecodable bytes are
                        # dropped) so the result does not depend on the
                        # platform's default encoding.
                        with open(full_file_path, "r", encoding="utf-8", errors="ignore") as f:  # noqa: PTH123
                            file_extension = pathlib.Path(full_file_path).suffix
                            file_types[file_extension] = 1
                            # All space characters are removed from the
                            # stored content.
                            content = f.read().replace(" ", "")

                        file_size = len(content.encode("utf-8"))

                        # If this file would push the batch over the budget,
                        # flush what has been collected so far. Skip the
                        # flush when nothing has been collected yet, so a
                        # single oversized file never produces an empty
                        # document.
                        if tree and current_size + file_size > max_size:
                            self.save_tree(tree, repo_name, link)
                            tree.clear()
                            current_size = 0

                        # Add file to tree
                        tree[file_path] = content
                        current_size += file_size
                    except Exception as e:
                        # Best effort: one unreadable file must not abort the
                        # whole crawl.
                        logger.error(f"Failed to process file {file_path}: {e}")

            # Save any remaining files in the tree
            if tree:
                self.save_tree(tree, repo_name, link)

        except Exception as e:
            logger.error(f"Error while processing repository: {e}")
            raise
        finally:
            shutil.rmtree(local_temp, ignore_errors=True)

        logger.info(f"Finished scrapping GitHub repository: {link}")
        logger.info(file_types)

    def save_tree(self, tree, repo_name, link):
        """Helper method to save the current tree.

        Builds a ``self.model`` instance from the collected files and saves
        it. Failures are logged and not propagated.

        Args:
            tree: Mapping of repository-relative file path to file content.
            repo_name: Name stored on the document.
            link: Repository URL stored on the document.
        """
        try:
            # Author fields are fixed values in this crawler rather than
            # being derived from the caller.
            instance = self.model(
                content=tree,
                name=repo_name,
                link=link,
                platform="github",
                author_id="46648381-8bf3-4877-b6b4-d48c9de9d870",
                author_full_name="CS370 Project",
            )
            instance.save()
        except Exception as e:
            logger.error(f"Failed to save tree: {e}")