Spaces:
Sleeping
Sleeping
Update core/file_scanner.py
Browse files- core/file_scanner.py +49 -34
core/file_scanner.py
CHANGED
@@ -1,7 +1,7 @@
|
|
|
|
1 |
from pathlib import Path
|
2 |
-
from typing import List,
|
3 |
from dataclasses import dataclass
|
4 |
-
import chardet
|
5 |
|
6 |
@dataclass
|
7 |
class FileInfo:
|
@@ -10,63 +10,78 @@ class FileInfo:
|
|
10 |
extension: str
|
11 |
content: Optional[str] = None
|
12 |
encoding: Optional[str] = None
|
13 |
-
|
14 |
@property
|
15 |
def formatted_size(self) -> str:
|
|
|
16 |
if self.size < 1024:
|
17 |
return f"{self.size} B"
|
18 |
elif self.size < 1024 * 1024:
|
19 |
-
return f"{self.size/1024:.1f} KB"
|
20 |
else:
|
21 |
-
return f"{self.size/(1024*1024):.1f} MB"
|
|
|
22 |
|
23 |
class FileScanner:
|
24 |
EXCLUDED_DIRS = {
|
25 |
-
'.git', '__pycache__', 'node_modules', 'venv',
|
26 |
'.env', 'build', 'dist', 'target', 'bin', 'obj'
|
27 |
}
|
28 |
|
29 |
def __init__(self, base_dir: Path, target_extensions: Set[str]):
|
|
|
|
|
|
|
|
|
30 |
self.base_dir = base_dir
|
31 |
-
self.target_extensions = target_extensions
|
32 |
-
|
33 |
def _should_scan_file(self, path: Path) -> bool:
|
|
|
|
|
34 |
if any(excluded in path.parts for excluded in self.EXCLUDED_DIRS):
|
35 |
return False
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
39 |
try:
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
44 |
try:
|
45 |
with file_path.open('r', encoding=encoding) as f:
|
46 |
return f.read(), encoding
|
47 |
except UnicodeDecodeError:
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
return None, None
|
53 |
-
except (OSError, ValueError):
|
54 |
return None, None
|
55 |
-
|
56 |
def scan_files(self) -> List[FileInfo]:
|
|
|
57 |
if not self.base_dir.exists():
|
58 |
-
raise FileNotFoundError(f"
|
59 |
-
|
60 |
-
|
61 |
for entry in self.base_dir.glob("**/*"):
|
62 |
if entry.is_file() and self._should_scan_file(entry):
|
63 |
content, encoding = self._read_file_content(entry)
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
1 |
+
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Set, Tuple

import chardet
|
|
|
5 |
|
6 |
@dataclass
|
7 |
class FileInfo:
|
|
|
10 |
extension: str
|
11 |
content: Optional[str] = None
|
12 |
encoding: Optional[str] = None
|
13 |
+
|
14 |
@property
def formatted_size(self) -> str:
    """Return ``self.size`` as a short human-readable string.

    Sizes below 1024 are shown in bytes, sizes below 1024 * 1024 in KB
    with one decimal place, and everything else in MB with one decimal
    place.
    """
    kib = 1024
    mib = kib * kib
    size = self.size

    # Guard clauses, smallest unit first.
    if size < kib:
        return f"{size} B"
    if size < mib:
        return f"{size / kib:.1f} KB"
    return f"{size / mib:.1f} MB"
|
23 |
+
|
24 |
|
25 |
class FileScanner:
    """Collect files below a base directory, filtered by extension.

    Files located inside a directory whose name appears in
    ``EXCLUDED_DIRS`` are skipped. For every remaining file whose suffix
    is one of the target extensions, the content is read (with encoding
    detection) and returned as a ``FileInfo``.
    """

    # Directory names whose contents are filtered out of the results.
    EXCLUDED_DIRS = {
        '.git', '__pycache__', 'node_modules', 'venv',
        '.env', 'build', 'dist', 'target', 'bin', 'obj'
    }

    def __init__(self, base_dir: Path, target_extensions: Set[str]):
        """Create a scanner.

        Args:
            base_dir: Directory at which the scan starts. Coerced to
                ``Path`` so a plain string also works.
            target_extensions: Extensions to include, with leading dot
                (e.g. ``{'.py', '.js', '.md'}``). Matching is
                case-insensitive.
        """
        self.base_dir = Path(base_dir)
        self.target_extensions = {ext.lower() for ext in target_extensions}

    def _should_scan_file(self, path: Path) -> bool:
        """Return True if *path* is outside excluded dirs and has a target suffix."""
        path = Path(path)
        try:
            # Only look at directories *below* base_dir. Otherwise a project
            # that itself lives under e.g. ".../build/" would be rejected
            # wholesale. The final component (the file name) is not a
            # directory and is left out of the check.
            dir_parts = path.relative_to(self.base_dir).parts[:-1]
        except ValueError:
            # path is not under base_dir: fall back to all of its directories.
            dir_parts = path.parts[:-1]

        if not self.EXCLUDED_DIRS.isdisjoint(dir_parts):
            return False
        return path.suffix.lower() in self.target_extensions

    def _read_file_content(self, file_path: Path) -> Tuple[Optional[str], Optional[str]]:
        """Read *file_path* as text, guessing its encoding.

        The first 4096 bytes are passed to ``chardet``. If the guess has a
        confidence above 0.7 it is tried first, otherwise UTF-8 is. On a
        decode failure the remaining candidates (UTF-8, then cp932) are
        tried in turn.

        Returns:
            ``(content, encoding)`` for the first encoding that decodes the
            whole file, or ``(None, None)`` if the file cannot be opened or
            no candidate decodes it.
        """
        try:
            # Sample only the head of the file for detection.
            with file_path.open('rb') as raw:
                sample = raw.read(4096)
            guess = chardet.detect(sample)
            detected = guess.get('encoding')
            confidence = guess.get('confidence') or 0.0
            encoding = detected if detected and confidence > 0.7 else 'utf-8'

            # An ASCII-only sample is reported as 'ascii' even when later
            # parts of the file contain UTF-8, so UTF-8 must be tried before
            # falling back to cp932.
            candidates = [encoding]
            for fallback in ('utf-8', 'cp932'):
                if fallback.lower() != encoding.lower():
                    candidates.append(fallback)

            for candidate in candidates:
                try:
                    with file_path.open('r', encoding=candidate) as f:
                        return f.read(), candidate
                except (UnicodeDecodeError, LookupError):
                    # Wrong guess or a codec name Python does not know.
                    continue
            return None, None
        except (OSError, ValueError, LookupError):
            # Unreadable file: report "no content" instead of failing the scan.
            return None, None

    def scan_files(self) -> "List[FileInfo]":
        """Recursively find matching files and return them as ``FileInfo`` objects.

        Returns:
            The collected ``FileInfo`` entries, sorted by the string form of
            their resolved path.

        Raises:
            FileNotFoundError: If ``base_dir`` does not exist.
        """
        if not self.base_dir.exists():
            raise FileNotFoundError(f"指定ディレクトリが見つかりません: {self.base_dir}")

        collected_files = []
        for entry in self.base_dir.glob("**/*"):
            # Name-based filter first: it is pure path arithmetic, whereas
            # is_file() has to hit the filesystem.
            if not (self._should_scan_file(entry) and entry.is_file()):
                continue
            content, encoding = self._read_file_content(entry)
            collected_files.append(
                FileInfo(
                    path=entry.resolve(),
                    size=entry.stat().st_size,
                    extension=entry.suffix.lower(),
                    content=content,
                    encoding=encoding,
                )
            )
        # Sort by the string representation of the path.
        return sorted(collected_files, key=lambda info: str(info.path))
|