import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterator, List, Optional

import chardet


@dataclass
class FileInfo:
    """Metadata and (optionally) decoded text content of a single file."""

    # Location of the file; FileScanner stores the absolute path here.
    path: Path
    # File size in bytes.
    size: int
    # File suffix including the leading dot; FileScanner stores it lower-cased.
    extension: str
    # Decoded text of the file, or None when no content was loaded.
    content: Optional[str] = None
    # Name of the encoding used to decode ``content``, or None.
    encoding: Optional[str] = None

    @property
    def formatted_size(self) -> str:
        """Return ``size`` as a human-readable string in B, KB or MB.

        Values below 1024 are shown as whole bytes; larger values are shown
        with one decimal place using 1024-based units.
        """
        if self.size < 1024:
            return f"{self.size} B"
        if self.size < 1024 * 1024:
            return f"{self.size / 1024:.1f} KB"
        return f"{self.size / (1024 * 1024):.1f} MB"


class FileScanner:
    """Recursively collect decodable text files below a base directory.

    Only files whose lower-cased suffix is in ``TARGET_EXTENSIONS``, whose
    size does not exceed ``MAX_FILE_SIZE`` and which are not located inside
    a directory named in ``EXCLUDED_DIRS`` are returned.
    """

    TARGET_EXTENSIONS = {
        '.py', '.sh', '.rb', '.js', '.ts', '.java', '.cpp', '.hpp', '.c',
        '.h', '.go', '.rs', '.php', '.json', '.yml', '.yaml', '.toml',
        '.ini', '.md', '.txt',
    }
    EXCLUDED_DIRS = {'.git', '__pycache__', 'node_modules', 'venv', '.env'}
    MAX_FILE_SIZE = 1 * 1024 * 1024

    # Number of leading bytes handed to chardet for encoding detection.
    _SNIFF_BYTES = 4096

    def __init__(self, base_dir: Path):
        """Remember the directory to scan.

        Args:
            base_dir: Root of the tree to scan. Wrapped in ``Path`` so that
                a plain string path works as well.
        """
        self.base_dir = Path(base_dir)

    def scan_files(self) -> List[FileInfo]:
        """Scan ``base_dir`` and return the matching files with their content.

        Files that cannot be read or decoded are skipped rather than
        aborting the whole scan.

        Returns:
            ``FileInfo`` objects sorted by the string form of their path.

        Raises:
            FileNotFoundError: If ``base_dir`` does not exist.
        """
        if not self.base_dir.exists():
            raise FileNotFoundError(f"ディレクトリが見つかりません: {self.base_dir}")

        files = []
        for entry in self._iter_candidates():
            info = self._load(entry)
            if info is not None:
                files.append(info)
        return sorted(files, key=lambda info: str(info.path))

    def _iter_candidates(self) -> Iterator[Path]:
        """Yield regular files below ``base_dir`` that have a target suffix."""
        for root, dirnames, filenames in os.walk(self.base_dir):
            # Prune in place so os.walk never descends into excluded trees.
            # Only directories *below* base_dir are tested, so a base_dir
            # that itself lives under e.g. "venv" is still scanned.
            dirnames[:] = [d for d in dirnames if d not in self.EXCLUDED_DIRS]
            for filename in filenames:
                entry = Path(root, filename)
                if (entry.suffix.lower() in self.TARGET_EXTENSIONS
                        and entry.is_file()):
                    yield entry

    def _load(self, entry: Path) -> Optional[FileInfo]:
        """Read and decode ``entry``; return None if it must be skipped."""
        try:
            size = entry.stat().st_size
            if size > self.MAX_FILE_SIZE:
                return None
            with entry.open('rb') as f:
                sample = f.read(self._SNIFF_BYTES)
            # chardet reports None when it cannot decide (e.g. empty file).
            encoding = chardet.detect(sample)['encoding'] or 'utf-8'
            with entry.open('r', encoding=encoding) as f:
                content = f.read()
        except (OSError, UnicodeError, LookupError):
            # OSError: unreadable or vanished file; UnicodeError: content
            # does not match the detected encoding; LookupError: detected
            # encoding name is unknown to Python. Best effort: skip the file.
            return None

        return FileInfo(
            path=entry.absolute(),
            size=size,
            extension=entry.suffix.lower(),
            content=content,
            encoding=encoding,
        )