Spaces:

SkazuHD
/

docker-test

Build error

App Files Files Community

docker-test / llm_engineering /application /crawlers /github.py

SkazuHD

init space

d660b02 7 months ago

raw

history blame contribute delete

4.89 kB

	import os
	import pathlib
	import shutil
	import subprocess
	import tempfile

	from loguru import logger

	from llm_engineering.domain.documents import RepositoryDocument

	from .base import BaseCrawler


	class GithubCrawler(BaseCrawler):
	    """Crawler that clones a Git repository and stores its text files.

	    The repository is cloned into a temporary directory, every file whose
	    name matches the ``include`` suffixes (and none of the ``ignore``
	    rules) is read, and the contents are saved as one or more ``model``
	    documents mapping relative file path -> file content.
	    """

	    model = RepositoryDocument

	    def __init__(
	        self,
	        include=(
	            ".txt",
	            ".md",
	            ".rst",
	            ".json",
	            ".yml",
	            ".yaml",
	            ".xml",
	            ".html",
	            ".csv",
	            ".py",
	            ".sh",
	            ".cfg",
	            ".conf",
	            ".js",
	            ".css",
	            ".scss",
	            ".cpp",
	            ".hpp",
	            ".h",
	            ".cc",
	            ".hh",
	            ".cmake",
	            ".bat",
	            ".rb",
	            ".bash",
	            ".qml",
	            ".proto",
	            ".properties",
	            ".template",
	            ".in",
	            ".inc",
	            ".pyi",
	            ".typed",
	        ),
	        ignore=(
	            ".git",
	            ".toml",
	            ".lock",
	            ".png",
	            ".gitignore",
	            ".ico",
	            ".jpg",
	            ".jpeg",
	            ".webp",
	            ".svg",
	            ".gif",
	            ".stl",
	            ".dae",
	            ".jar",
	            ".pdf",
	        ),
	    ) -> None:
	        """Configure which files are crawled.

	        Args:
	            include: File-name suffixes a file must end with to be read.
	            ignore: Strings used both as directory-path prefixes to skip
	                and as file-name suffixes to skip.
	        """
	        super().__init__()
	        self._ignore = ignore
	        self._include = include

	    def extract(self, link: str, **kwargs) -> None:
	        """Clone the repository at ``link`` and persist its text files.

	        Does nothing (beyond logging) when a document with the same link is
	        already stored. Files are accumulated into a ``{relative_path:
	        content}`` mapping that is flushed through ``save_tree`` whenever the
	        next file would push it past the size budget, and once more at the
	        end for the remainder.

	        Args:
	            link: URL of the repository; its last path segment is used as
	                the repository name.
	            **kwargs: Accepted for interface compatibility; unused here.

	        Raises:
	            Exception: Any error raised while cloning or walking the
	                repository is logged and re-raised. Failures on individual
	                files are logged and skipped instead.
	        """
	        old_model = self.model.find(link=link)
	        if old_model is not None:
	            logger.info(f"Repository already exists in the database: {link}")

	            return

	        logger.info(f"Starting scrapping GitHub repository: {link}")

	        repo_name = link.rstrip("/").split("/")[-1]

	        local_temp = tempfile.mkdtemp()
	        file_types = {}
	        try:
	            # Run git inside the temp dir via cwd= rather than os.chdir():
	            # chdir would change the working directory of the whole process
	            # and leave it pointing at a directory that is deleted below.
	            subprocess.run(["git", "clone", link], check=True, cwd=local_temp)

	            # The clone is the only entry in the freshly created temp dir.
	            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118

	            tree = {}
	            current_size = 0
	            # Size budget per saved document, with a 100000-byte margin.
	            # NOTE(review): presumably tied to the storage backend's ~16 MB
	            # document limit - confirm against the database in use.
	            max_size = 16793598 - 100000

	            for file_path, full_file_path in self._iter_repo_files(repo_path):
	                try:
	                    # Explicit UTF-8 so decoding does not depend on the
	                    # platform locale; undecodable bytes are dropped.
	                    with open(full_file_path, "r", encoding="utf-8", errors="ignore") as f:  # noqa: PTH123
	                        # All space characters are removed from the content.
	                        content = f.read().replace(" ", "")
	                except Exception as e:
	                    # Best effort: one unreadable file must not abort the crawl.
	                    logger.error(f"Failed to process file {file_path}: {e}")
	                    continue

	                # Presence flag per extension seen; only used for the final log.
	                file_types[pathlib.Path(full_file_path).suffix] = 1
	                file_size = len(content.encode("utf-8"))

	                # Flush before adding a file that would exceed the budget.
	                # Skipped while the tree is empty so that a single oversized
	                # file never causes an empty document to be saved.
	                if tree and current_size + file_size > max_size:
	                    self.save_tree(tree, repo_name, link)
	                    # Start a fresh dict instead of clearing the one that
	                    # was just handed to the saved document.
	                    tree = {}
	                    current_size = 0

	                tree[file_path] = content
	                current_size += file_size

	            # Save any remaining files in the tree
	            if tree:
	                self.save_tree(tree, repo_name, link)

	        except Exception as e:
	            logger.error(f"Error while processing repository: {e}")
	            raise
	        finally:
	            shutil.rmtree(local_temp, ignore_errors=True)

	        logger.info(f"Finished scrapping GitHub repository: {link}")
	        logger.info(file_types)

	    def _iter_repo_files(self, repo_path):
	        """Yield ``(relative_path, absolute_path)`` for each crawlable file.

	        A directory is skipped, together with everything below it, when its
	        path relative to ``repo_path`` starts with one of the ignore
	        entries. A file is skipped when its name starts with ``"."``, ends
	        with an ignore entry, or does not end with an include entry.
	        """
	        ignored = tuple(self._ignore)
	        included = tuple(self._include)

	        for root, dirnames, files in os.walk(repo_path):
	            rel_dir = os.path.relpath(root, repo_path)
	            if rel_dir == os.curdir:
	                # Files in the repository root get a bare file name as key.
	                rel_dir = ""

	            if rel_dir.startswith(ignored):
	                # Every descendant shares this ignored prefix, so prune the
	                # walk here instead of visiting (e.g.) all of .git/.
	                dirnames.clear()
	                continue

	            for file in files:
	                if file.startswith(".") or file.endswith(ignored):
	                    continue
	                if not file.endswith(included):
	                    continue
	                yield os.path.join(rel_dir, file), os.path.join(root, file)  # noqa: PTH118

	    def save_tree(self, tree, repo_name, link):
	        """Persist one ``{relative_path: content}`` mapping as a document.

	        Builds a ``model`` instance with fixed platform and author fields and
	        saves it. Any exception raised while building or saving is logged
	        and swallowed, so a failed save does not interrupt the crawl.

	        Args:
	            tree: Mapping of relative file path to file content.
	            repo_name: Name stored on the document.
	            link: Repository URL stored on the document.
	        """
	        try:
	            instance = self.model(
	                content=tree,
	                name=repo_name,
	                link=link,
	                platform="github",
	                author_id="46648381-8bf3-4877-b6b4-d48c9de9d870",
	                author_full_name="CS370 Project",
	            )
	            instance.save()
	        except Exception as e:
	            logger.error(f"Failed to save tree: {e}")