SkazuHD's picture
init space
d660b02
import os
import pathlib
import shutil
import subprocess
import tempfile
from loguru import logger
from llm_engineering.domain.documents import RepositoryDocument
from .base import BaseCrawler
class GithubCrawler(BaseCrawler):
    """Crawler that clones a Git repository and stores its text files.

    The repository is cloned into a temporary directory, walked, and every
    file whose name passes the include/ignore filters is read into a
    ``{relative_path: content}`` mapping. The mapping is persisted through
    ``self.model`` in one or more documents, split so that the accumulated
    content of each document stays under ``_MAX_TREE_BYTES``.
    """

    model = RepositoryDocument

    # Byte budget for the file contents of a single saved document:
    # 16793598 bytes minus 100000 bytes of headroom.
    # NOTE(review): presumably derived from the storage backend's maximum
    # document size -- confirm against the database in use.
    _MAX_TREE_BYTES = 16793598 - 100000

    def __init__(
        self,
        include=(
            ".txt",
            ".md",
            ".rst",
            ".json",
            ".yml",
            ".yaml",
            ".xml",
            ".html",
            ".csv",
            ".py",
            ".sh",
            ".cfg",
            ".conf",
            ".js",
            ".css",
            ".scss",
            ".cpp",
            ".hpp",
            ".h",
            ".cc",
            ".hh",
            ".cmake",
            ".bat",
            ".rb",
            ".bash",
            ".qml",
            ".proto",
            ".properties",
            ".template",
            ".in",
            ".inc",
            ".pyi",
            ".typed",
        ),
        ignore=(
            ".git",
            ".toml",
            ".lock",
            ".png",
            ".gitignore",
            ".ico",
            ".jpg",
            ".jpeg",
            ".webp",
            ".svg",
            ".gif",
            ".stl",
            ".dae",
            ".jar",
            ".pdf",
        ),
    ) -> None:
        """Configure which files the crawler keeps.

        Args:
            include: File-name suffixes a file must end with to be stored.
            ignore: Patterns to skip. A file is skipped when its name ends
                with one of them; a directory is skipped when its path
                relative to the repository root starts with one of them.
        """
        super().__init__()

        self._ignore = ignore
        self._include = include

    def extract(self, link: str, **kwargs) -> None:
        """Clone the repository at ``link`` and persist its text files.

        Does nothing if ``self.model.find(link=link)`` already returns a
        document. Files that cannot be read are logged and skipped.

        Args:
            link: URL of the repository to clone; its last path segment is
                used as the repository name.
            **kwargs: Accepted for interface compatibility; unused here.

        Raises:
            Exception: Any error from cloning or walking the repository is
                logged and re-raised (e.g. ``subprocess.CalledProcessError``
                when ``git clone`` fails).
        """
        if self.model.find(link=link) is not None:
            logger.info(f"Repository already exists in the database: {link}")

            return

        logger.info(f"Starting scrapping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]
        # str.startswith/endswith need tuples; build them once, not per file.
        ignored = tuple(self._ignore)
        included = tuple(self._include)
        # Number of stored files per extension, logged as a summary at the end.
        file_types = {}

        local_temp = tempfile.mkdtemp()

        try:
            # Run git inside the temp dir via ``cwd`` instead of os.chdir():
            # chdir would change the working directory of the whole process
            # and leave it pointing at a directory that is deleted below.
            subprocess.run(["git", "clone", link], check=True, cwd=local_temp)

            # The clone is the only entry in the freshly created temp dir.
            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118

            tree = {}
            current_size = 0

            for root, dirs, files in os.walk(repo_path):
                rel_dir = os.path.relpath(root, repo_path)
                if rel_dir == os.curdir:
                    # Repository root: keep keys free of a leading "./".
                    rel_dir = ""

                if rel_dir.startswith(ignored):
                    # Every descendant shares this ignored prefix, so prune
                    # the walk here rather than visiting and skipping each one.
                    dirs.clear()
                    continue

                for file_name in files:
                    if file_name.startswith(".") or file_name.endswith(ignored):
                        continue
                    if not file_name.endswith(included):
                        continue

                    file_path = os.path.join(rel_dir, file_name)  # noqa: PTH118
                    full_file_path = os.path.join(root, file_name)  # noqa: PTH118

                    try:
                        with open(  # noqa: PTH123
                            full_file_path, "r", encoding="utf-8", errors="ignore"
                        ) as f:
                            # All space characters are removed from the stored
                            # content (tabs and newlines are kept).
                            content = f.read().replace(" ", "")

                        extension = pathlib.Path(full_file_path).suffix
                        file_types[extension] = file_types.get(extension, 0) + 1

                        file_size = len(content.encode("utf-8"))

                        # Flush the current batch before it would exceed the
                        # budget. Skip the flush when the batch is empty so an
                        # oversized first file does not produce an empty
                        # document.
                        if tree and current_size + file_size > self._MAX_TREE_BYTES:
                            self.save_tree(tree, repo_name, link)
                            tree.clear()
                            current_size = 0

                        tree[file_path] = content
                        current_size += file_size
                    except Exception as e:
                        # Best effort: one unreadable file must not abort the crawl.
                        logger.error(f"Failed to process file {file_path}: {e}")

            # Save any remaining files in the tree
            if tree:
                self.save_tree(tree, repo_name, link)
        except Exception as e:
            logger.error(f"Error while processing repository: {e}")
            raise
        finally:
            shutil.rmtree(local_temp, ignore_errors=True)

        logger.info(f"Finished scrapping GitHub repository: {link}")
        logger.info(file_types)

    def save_tree(self, tree, repo_name, link):
        """Persist one batch of files as a single ``self.model`` document.

        Failures are logged and swallowed, so a batch that cannot be saved
        does not stop the crawl.

        Args:
            tree: Mapping of repository-relative file path to file content.
            repo_name: Name stored on the document.
            link: Repository URL stored on the document.
        """
        try:
            instance = self.model(
                content=tree,
                name=repo_name,
                link=link,
                platform="github",
                author_id="46648381-8bf3-4877-b6b4-d48c9de9d870",
                author_full_name="CS370 Project",
            )
            instance.save()
        except Exception as e:
            logger.error(f"Failed to save tree: {e}")