DeL-TaiseiOzaki committed on
Commit
681ede6
·
verified ·
1 Parent(s): 5bdd26d

Update core/file_scanner.py

Browse files
Files changed (1) hide show
  1. core/file_scanner.py +49 -34
core/file_scanner.py CHANGED
@@ -1,7 +1,7 @@
 
1
  from pathlib import Path
2
- from typing import List, Dict, Optional, Set
3
  from dataclasses import dataclass
4
- import chardet
5
 
6
  @dataclass
7
  class FileInfo:
@@ -10,63 +10,78 @@ class FileInfo:
10
  extension: str
11
  content: Optional[str] = None
12
  encoding: Optional[str] = None
13
-
14
  @property
15
  def formatted_size(self) -> str:
 
16
  if self.size < 1024:
17
  return f"{self.size} B"
18
  elif self.size < 1024 * 1024:
19
- return f"{self.size/1024:.1f} KB"
20
  else:
21
- return f"{self.size/(1024*1024):.1f} MB"
 
22
 
23
  class FileScanner:
24
  EXCLUDED_DIRS = {
25
- '.git', '__pycache__', 'node_modules', 'venv',
26
  '.env', 'build', 'dist', 'target', 'bin', 'obj'
27
  }
28
 
29
  def __init__(self, base_dir: Path, target_extensions: Set[str]):
 
 
 
 
30
  self.base_dir = base_dir
31
- self.target_extensions = target_extensions
32
-
33
  def _should_scan_file(self, path: Path) -> bool:
 
 
34
  if any(excluded in path.parts for excluded in self.EXCLUDED_DIRS):
35
  return False
36
- return path.suffix.lower() in self.target_extensions
37
-
38
- def _read_file_content(self, file_path: Path) -> tuple[Optional[str], Optional[str]]:
 
 
 
 
39
  try:
40
- with file_path.open('rb') as f:
41
- raw_data = f.read(4096)
42
- result = chardet.detect(raw_data)
43
- encoding = result['encoding'] if result['confidence'] > 0.7 else 'utf-8'
 
 
 
44
  try:
45
  with file_path.open('r', encoding=encoding) as f:
46
  return f.read(), encoding
47
  except UnicodeDecodeError:
48
- try:
49
- with file_path.open('r', encoding='cp932') as f:
50
- return f.read(), 'cp932'
51
- except UnicodeDecodeError:
52
- return None, None
53
- except (OSError, ValueError):
54
  return None, None
55
-
56
  def scan_files(self) -> List[FileInfo]:
 
57
  if not self.base_dir.exists():
58
- raise FileNotFoundError(f"ディレクトリが見つかりません: {self.base_dir}")
59
-
60
- files = []
61
  for entry in self.base_dir.glob("**/*"):
62
  if entry.is_file() and self._should_scan_file(entry):
63
  content, encoding = self._read_file_content(entry)
64
- if content is not None:
65
- files.append(FileInfo(
66
- path=entry.absolute(),
67
- size=entry.stat().st_size,
68
- extension=entry.suffix.lower(),
69
- content=content,
70
- encoding=encoding
71
- ))
72
- return sorted(files, key=lambda x: str(x.path))
 
 
1
+ import chardet
2
  from pathlib import Path
3
+ from typing import List, Optional, Set
4
  from dataclasses import dataclass
 
5
 
6
  @dataclass
7
  class FileInfo:
 
10
  extension: str
11
  content: Optional[str] = None
12
  encoding: Optional[str] = None
13
+
@property
def formatted_size(self) -> str:
    """Return the file size as a human-readable string in B, KB or MB.

    Sizes below 1024 are shown as whole bytes; larger sizes are divided
    by 1024 (KB) or 1024 * 1024 (MB) and shown with one decimal place.
    """
    kb_threshold = 1024
    mb_threshold = 1024 * 1024

    # Guard clauses, smallest unit first.
    if self.size < kb_threshold:
        return f"{self.size} B"
    if self.size < mb_threshold:
        return f"{self.size / 1024:.1f} KB"
    return f"{self.size / (1024 * 1024):.1f} MB"
23
+
24
 
class FileScanner:
    """Recursively collect files under a base directory, filtered by extension.

    Files located inside a directory whose name is listed in
    ``EXCLUDED_DIRS`` are ignored.
    """

    # Directory names that are skipped wherever they appear below base_dir.
    EXCLUDED_DIRS = {
        '.git', '__pycache__', 'node_modules', 'venv',
        '.env', 'build', 'dist', 'target', 'bin', 'obj'
    }

    def __init__(self, base_dir: Path, target_extensions: Set[str]):
        """Store the scan root and the set of extensions to collect.

        Args:
            base_dir: Directory where the scan starts.
            target_extensions: Extensions to include, with the leading dot
                (e.g. ``{'.py', '.js', '.md'}``). They are lower-cased so
                that matching is case-insensitive.
        """
        self.base_dir = base_dir
        self.target_extensions = {ext.lower() for ext in target_extensions}

    def _should_scan_file(self, path: Path) -> bool:
        """Return True if *path* is outside excluded directories and has a target extension.

        Only the directory components *below* ``base_dir`` are compared with
        ``EXCLUDED_DIRS``; otherwise a base directory that itself lives under
        e.g. ``build/`` or ``bin/`` would cause every file to be rejected.
        """
        try:
            relative = path.relative_to(self.base_dir)
        except ValueError:
            # path is not located under base_dir: fall back to the full path.
            relative = path

        # Drop the last component (the file name); only directories count.
        if not self.EXCLUDED_DIRS.isdisjoint(relative.parts[:-1]):
            return False
        return path.suffix.lower() in self.target_extensions

    def _read_file_content(self, file_path: Path) -> tuple[Optional[str], Optional[str]]:
        """Read a text file, guessing its encoding with chardet.

        The first 4096 bytes are passed to ``chardet.detect``. The detected
        encoding is used when its confidence is above 0.7, otherwise UTF-8
        is assumed. If decoding fails, cp932 is tried as a fallback.

        Returns:
            ``(content, encoding)`` on success, ``(None, None)`` if the file
            cannot be opened or decoded.
        """
        try:
            # Sample the head of the file to estimate the encoding.
            with file_path.open('rb') as rb:
                raw_data = rb.read(4096)
            detect_result = chardet.detect(raw_data)
            detected = detect_result.get('encoding')
            confidence = detect_result.get('confidence') or 0.0
            # Guard against a missing encoding name as well as low confidence.
            encoding = detected if detected and confidence > 0.7 else 'utf-8'

            try:
                with file_path.open('r', encoding=encoding) as f:
                    return f.read(), encoding
            except UnicodeDecodeError:
                # Fall back to cp932 when the first guess cannot decode the file.
                with file_path.open('r', encoding='cp932') as f:
                    return f.read(), 'cp932'
        except (OSError, ValueError, LookupError):
            # OSError: file cannot be opened/read.
            # ValueError: includes UnicodeDecodeError from the cp932 fallback.
            # LookupError: the detected encoding name is unknown to Python.
            return None, None

    def scan_files(self) -> "List[FileInfo]":
        """Walk ``base_dir`` recursively and build a FileInfo for each matching file.

        Files whose content could not be read are still listed, with
        ``content`` and ``encoding`` set to ``None``.

        Returns:
            FileInfo objects sorted by the string form of their resolved path.

        Raises:
            FileNotFoundError: If ``base_dir`` does not exist.
        """
        if not self.base_dir.exists():
            raise FileNotFoundError(f"指定ディレクトリが見つかりません: {self.base_dir}")

        collected_files = []
        for entry in self.base_dir.glob("**/*"):
            # Cheap path-based filter first, file-system check second.
            if not (self._should_scan_file(entry) and entry.is_file()):
                continue
            content, encoding = self._read_file_content(entry)
            collected_files.append(FileInfo(
                path=entry.resolve(),
                size=entry.stat().st_size,
                extension=entry.suffix.lower(),
                content=content,
                encoding=encoding,
            ))
        # Sort by the string representation of the path.
        return sorted(collected_files, key=lambda info: str(info.path))