Repository_Scaner / core /file_scanner.py
DeL-TaiseiOzaki
ss
560aacd
raw
history blame
2.29 kB
from pathlib import Path
from typing import List, Dict, Optional
from dataclasses import dataclass
import chardet
@dataclass
class FileInfo:
    """Record describing a single file: location, size and optional text."""

    # Path to the file.
    path: Path
    # File size in bytes (rendered by ``formatted_size``).
    size: int
    # File extension, e.g. ".py".
    extension: str
    # Text content of the file, when it has been loaded.
    content: Optional[str] = None
    # Name of the encoding associated with ``content``, when known.
    encoding: Optional[str] = None

    @property
    def formatted_size(self) -> str:
        """Return ``size`` as a human-readable string in B, KB or MB.

        Sizes below 1024 are shown as whole bytes; larger sizes are shown
        with one decimal place, using 1024-based units.
        """
        kib = 1024
        mib = kib * kib

        # Check the largest unit first so each branch can return directly.
        if self.size >= mib:
            return f"{self.size / mib:.1f} MB"
        if self.size >= kib:
            return f"{self.size / kib:.1f} KB"
        return f"{self.size} B"
class FileScanner:
    """Recursively collect readable text/source files below a directory.

    A file is collected when its (lower-cased) suffix is listed in
    ``TARGET_EXTENSIONS``, none of the directories between ``base_dir`` and
    the file is named in ``EXCLUDED_DIRS``, and it is no larger than
    ``MAX_FILE_SIZE`` bytes.
    """

    TARGET_EXTENSIONS = {'.py', '.sh', '.rb', '.js', '.ts', '.java', '.cpp',
                         '.hpp', '.c', '.h', '.go', '.rs', '.php', '.json',
                         '.yml', '.yaml', '.toml', '.ini', '.md', '.txt'}
    EXCLUDED_DIRS = {'.git', '__pycache__', 'node_modules', 'venv', '.env'}
    MAX_FILE_SIZE = 1 * 1024 * 1024
    # Number of leading bytes handed to chardet for encoding detection.
    _DETECTION_SAMPLE_SIZE = 4096

    def __init__(self, base_dir: Path):
        """Store the directory to scan.

        Args:
            base_dir: Root directory of the scan. A plain string path is
                accepted as well and converted to ``Path``.
        """
        self.base_dir = Path(base_dir)

    def scan_files(self) -> "List[FileInfo]":
        """Scan ``base_dir`` and return the matching files with their text.

        Files that cannot be inspected, read or decoded are skipped so a
        single bad file does not abort the scan.

        Returns:
            A list of ``FileInfo`` objects (absolute path, size, lower-cased
            extension, decoded content, encoding used), sorted by path string.

        Raises:
            FileNotFoundError: If ``base_dir`` does not exist.
        """
        if not self.base_dir.exists():
            raise FileNotFoundError(f"ディレクトリが見つかりません: {self.base_dir}")

        files = []
        for entry in self.base_dir.glob("**/*"):
            # Cheap, purely name-based filters first (no filesystem access).
            if entry.suffix.lower() not in self.TARGET_EXTENSIONS:
                continue
            if self._is_excluded(entry):
                continue

            try:
                if not entry.is_file():
                    continue
                # stat() once; the file may disappear while we are scanning,
                # which must not abort the whole scan.
                size = entry.stat().st_size
                if size > self.MAX_FILE_SIZE:
                    continue
                content, encoding = self._read_text(entry)
            except (OSError, UnicodeError, LookupError):
                # Unreadable, undecodable, or reported with an encoding name
                # Python does not know: best-effort scan, skip this file.
                continue

            files.append(FileInfo(
                path=entry.absolute(),
                size=size,
                extension=entry.suffix.lower(),
                content=content,
                encoding=encoding,
            ))

        return sorted(files, key=lambda x: str(x.path))

    def _is_excluded(self, entry: Path) -> bool:
        """Return True if ``entry`` lies inside an excluded directory.

        Only directory components *below* ``base_dir`` are considered, so a
        scan root that itself lives under e.g. a ``venv`` directory is not
        rejected wholesale.
        """
        parent_dirs = entry.relative_to(self.base_dir).parts[:-1]
        return any(part in self.EXCLUDED_DIRS for part in parent_dirs)

    def _read_text(self, path: Path) -> tuple:
        """Read ``path`` as text and return ``(content, encoding_used)``.

        The encoding is guessed by chardet from the first
        ``_DETECTION_SAMPLE_SIZE`` bytes, defaulting to UTF-8 when chardet
        reports none. Because the guess is based on a prefix only, a failed
        decode is retried once with UTF-8.

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeError: If the content cannot be decoded.
            LookupError: If UTF-8 was detected but is somehow unknown.
        """
        with path.open('rb') as f:
            sample = f.read(self._DETECTION_SAMPLE_SIZE)
        detected = chardet.detect(sample)['encoding'] or 'utf-8'

        try:
            with path.open('r', encoding=detected) as f:
                return f.read(), detected
        except (UnicodeError, LookupError):
            if detected.lower().replace('_', '-') in ('utf-8', 'utf8'):
                # Already tried UTF-8; nothing else to fall back to.
                raise

        with path.open('r', encoding='utf-8') as f:
            return f.read(), 'utf-8'