|
import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Set, Tuple

import chardet
|
|
|
@dataclass
class FileInfo:
    """Metadata, and optionally the decoded text, of one scanned file."""

    # Location of the file.
    path: Path
    # File size in bytes.
    size: int
    # File extension string.
    extension: str
    # Decoded text of the file, or None when it was not loaded.
    content: Optional[str] = None
    # Name of the encoding used to decode ``content``, or None.
    encoding: Optional[str] = None

    @property
    def formatted_size(self) -> str:
        """Return the file size as a human-readable string.

        Sizes below 1024 bytes are shown as whole bytes ("B"); sizes below
        1024 * 1024 bytes are shown in "KB" with one decimal place; anything
        larger is shown in "MB" with one decimal place.
        """
        kib = 1024
        mib = kib * kib
        byte_count = self.size

        if byte_count < kib:
            return f"{byte_count} B"
        if byte_count < mib:
            return f"{byte_count / kib:.1f} KB"
        return f"{byte_count / mib:.1f} MB"
|
|
|
|
|
class FileScanner:
    """Recursively collect files with selected extensions below a directory.

    Directories whose name appears in ``EXCLUDED_DIRS`` are skipped entirely.
    """

    # Directory (or file) names that are never scanned.
    EXCLUDED_DIRS = {
        '.git', '__pycache__', 'node_modules', 'venv',
        '.env', 'build', 'dist', 'target', 'bin', 'obj'
    }

    # Number of leading bytes handed to chardet for encoding detection.
    _DETECT_SAMPLE_BYTES = 4096
    # chardet guesses at or below this confidence are ignored.
    _MIN_CONFIDENCE = 0.7
    # Encodings tried, in order, after the detected one fails to decode.
    _FALLBACK_ENCODINGS = ('utf-8', 'cp932')

    def __init__(self, base_dir: Path, target_extensions: Set[str]):
        """Create a scanner.

        Args:
            base_dir: Directory where the scan starts.
            target_extensions: Extensions to collect, e.g.
                ``{'.py', '.js', '.md'}``. Matching is case-insensitive and
                a missing leading dot is added (``'py'`` becomes ``'.py'``).
        """
        self.base_dir = Path(base_dir)
        self.target_extensions = {
            self._normalize_extension(ext) for ext in target_extensions
        }

    @staticmethod
    def _normalize_extension(ext: str) -> str:
        """Lower-case *ext* and make sure a non-empty value starts with a dot."""
        ext = ext.lower()
        if ext and not ext.startswith('.'):
            return f".{ext}"
        return ext

    def _should_scan_file(self, path: Path) -> bool:
        """Return True when *path* has a target extension and is not excluded.

        Exclusion is judged on the components of *path* below ``base_dir``,
        so an ancestor of ``base_dir`` that happens to be called e.g.
        ``build`` does not exclude the whole tree. For a path that is not
        located under ``base_dir`` every component is checked.
        """
        if path.suffix.lower() not in self.target_extensions:
            return False

        try:
            parts = path.relative_to(self.base_dir).parts
        except ValueError:
            # Not below base_dir: fall back to checking the full path.
            parts = path.parts
        return self.EXCLUDED_DIRS.isdisjoint(parts)

    def _read_file_content(
        self, file_path: Path
    ) -> Tuple[Optional[str], Optional[str]]:
        """Read *file_path* as text, guessing its encoding.

        The first ``_DETECT_SAMPLE_BYTES`` bytes are passed to chardet. A
        guess with confidence above ``_MIN_CONFIDENCE`` is tried first
        (otherwise UTF-8), followed by the ``_FALLBACK_ENCODINGS`` in order.

        Returns:
            ``(text, encoding_name)`` for the first encoding that decodes the
            whole file, or ``(None, None)`` when the file cannot be opened or
            no candidate encoding works.
        """
        try:
            with file_path.open('rb') as raw:
                sample = raw.read(self._DETECT_SAMPLE_BYTES)

            guess = chardet.detect(sample)
            detected = guess.get('encoding')
            confidence = guess.get('confidence') or 0.0
            trusted = bool(detected) and confidence > self._MIN_CONFIDENCE
            candidates = [detected if trusted else 'utf-8']

            # Only a sample was inspected, so the guess can be wrong for the
            # rest of the file (e.g. "ascii" for a UTF-8 file whose first
            # non-ASCII character comes later). Strict UTF-8 is tried before
            # the permissive cp932 to avoid mojibake.
            for fallback in self._FALLBACK_ENCODINGS:
                if all(fallback != known.lower() for known in candidates):
                    candidates.append(fallback)

            for encoding in candidates:
                try:
                    with file_path.open('r', encoding=encoding) as f:
                        return f.read(), encoding
                except (UnicodeError, LookupError):
                    # Undecodable with this codec, or the codec name reported
                    # by chardet is unknown to Python: try the next one.
                    continue
        except OSError:
            # Unreadable file (missing, permission denied, ...): best effort.
            pass
        return None, None

    def scan_files(self) -> List["FileInfo"]:
        """Collect every matching file below ``base_dir``.

        Excluded directories are pruned from the walk, so their contents are
        never visited. Files that disappear while being scanned are skipped.

        Returns:
            ``FileInfo`` objects sorted by the string form of their resolved
            path.

        Raises:
            FileNotFoundError: If ``base_dir`` does not exist.
        """
        if not self.base_dir.exists():
            raise FileNotFoundError(f"指定ディレクトリが見つかりません: {self.base_dir}")

        collected_files = []
        for root, dirnames, filenames in os.walk(self.base_dir):
            # In-place edit tells os.walk not to descend into these.
            dirnames[:] = [
                name for name in dirnames if name not in self.EXCLUDED_DIRS
            ]
            for filename in filenames:
                entry = Path(root, filename)
                # Cheap name-based filter first, filesystem check second.
                if not (self._should_scan_file(entry) and entry.is_file()):
                    continue
                try:
                    size = entry.stat().st_size
                except OSError:
                    # File vanished or became unreadable mid-scan.
                    continue

                content, encoding = self._read_file_content(entry)
                collected_files.append(
                    FileInfo(
                        path=entry.resolve(),
                        size=size,
                        extension=entry.suffix.lower(),
                        content=content,
                        encoding=encoding,
                    )
                )

        return sorted(collected_files, key=lambda info: str(info.path))
|
|