Spaces:

DeL-TaiseiOzaki
/

Repository_Scaner

Running

App Files Files Community

DeL-TaiseiOzaki committed on Oct 30, 2024

Commit

230b1a5

1 Parent(s): bc9366d

first commit

Browse files

Files changed (16) hide show

.gitignore +13 -0
README.md +96 -1
app.py +182 -0
config/__init__.py +0 -0
config/llm_setting.py +24 -0
config/settings.py +18 -0
core/__init__.py +0 -0
core/file_scanner.py +60 -0
core/git_manager.py +34 -0
main.py +69 -0
requirements.txt +6 -0
scan.sh +49 -0
services/llm_service.py +110 -0
utils/__init__.py +0 -0
utils/file_writer.py +24 -0
utils/logger.py +26 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,13 @@

+__pycache__/
+*.py[cod]
+*$py.class
+.env
+.venv
+env/
+venv/
+ENV/
+.streamlit/secrets.toml
+output/
+.idea/
+.vscode/
+*.log

README.md CHANGED Viewed

@@ -11,4 +11,99 @@ license: apache-2.0
 short_description: プログラミング関連ファイルを再帰的にスキャンし、内容を単一のテキストファイルにエクスポートするツールです。GitHubリ
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: プログラミング関連ファイルを再帰的にスキャンし、内容を単一のテキストファイルにエクスポートするツールです。GitHubリ
 ---
+# get_repository_info_by_llm
+プログラミング関連ファイルを再帰的にスキャンし、内容を単一のテキストファイルにエクスポートするツールです。GitHubリポジトリまたはローカルディレクトリに対応しています。
+## 機能
+- GitHubリポジトリのクローンとスキャン
+- ローカルディレクトリのスキャン
+- 再帰的なファイル検索
+- 主要なプログラミング言語ファイルの検出
+- UTF-8/CP932エンコーディングの自動検出
+- 結果のテキストファイル出力
+## 必要条件
+- Python 3.9以上（`list[...]` などの組み込みジェネリック型注釈を使用しているため）
+- Git（GitHubリポジトリをスキャンする場合）
+## インストール
+1. リポジトリをクローン
+```bash
+git clone [このリポジトリのURL]
+cd directory-scanner
+```
+2. 依存パッケージをインストール
+```bash
+pip install -r requirements.txt
+```
+3. 必要なディレクトリを作成
+```bash
+mkdir output
+```
+## 使用方法
+### コマンドライン
+```bash
+# GitHubリポジトリをスキャン
+python main.py https://github.com/username/repository.git
+# ローカルディレクトリをスキャン
+python main.py /path/to/directory
+```
+### シェルスクリプトを使用
+```bash
+# スクリプトに実行権限を付与
+chmod +x scan.sh
+# GitHubリポジトリをスキャン
+./scan.sh https://github.com/username/repository.git
+# ローカルディレクトリをスキャン
+./scan.sh /path/to/directory
+```
+## 出力形式
+スキャン結果は `output` ディレクトリに保存され、以下の形式で出力されます：
+```
+#ファイルパス
+path/to/file.py
+------------
+ファイルの内容
+```
+## スキャン対象
+### 対象となるファイル拡張子
+- Python (.py)
+- JavaScript (.js)
+- Java (.java)
+- C/C++ (.c, .h, .cpp, .hpp)
+- Go (.go)
+- Rust (.rs)
+- PHP (.php)
+- Ruby (.rb)
+- TypeScript (.ts)
+- その他 (.scala, .kt, .cs, .swift, .m, .sh, .pl, .r)
+### 除外されるディレクトリ
+- .git
+- __pycache__
+- node_modules
+- venv
+- .env
+- build
+- dist
+- target
+- bin
+- obj
+## 注意事項
+- GitHubリポジトリをスキャンする場合、一時的にローカルにクローンされます
+- スキャン完了後、クローンされたリポジトリは自動的に削除されます
+- 大きなファイルや特殊なエンコーディングのファイルは読み取れない場合があります

app.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import streamlit as st
+import tempfile
+import git
+from core.file_scanner import FileScanner
+from pathlib import Path
+from datetime import datetime
+from services.llm_service import LLMService
+from core.file_scanner import FileInfo
+from typing import List
+# Page configuration: browser tab title, icon and wide layout.
+st.set_page_config(
+    page_title="Repository Code Analysis",
+    page_icon="🔍",
+    layout="wide"
+)
+# Dark theme: inject custom CSS for the app background, chat messages,
+# buttons and text areas.
+st.markdown("""
+<style>
+    .stApp {
+        background-color: #0e1117;
+        color: #ffffff;
+    }
+    .chat-message {
+        padding: 1rem;
+        margin: 1rem 0;
+        border-radius: 0.5rem;
+    }
+    .assistant-message {
+        background-color: #1e2329;
+        color: #ffffff;
+    }
+    .stButton button {
+        background-color: #2ea44f;
+        color: #ffffff;
+    }
+    .stTextArea textarea {
+        background-color: #1e2329;
+        color: #ffffff;
+    }
+</style>
+""", unsafe_allow_html=True)
+def clone_repository(repo_url: str) -> Path:
+    """Clone ``repo_url`` into a fresh temporary directory and return its path.
+    The caller owns the returned directory and is responsible for deleting it.
+    If the clone fails, the temporary directory is removed before the
+    exception is re-raised, so failed attempts do not leak directories.
+    """
+    import shutil
+    temp_dir = Path(tempfile.mkdtemp())
+    try:
+        git.Repo.clone_from(repo_url, temp_dir)
+    except Exception:
+        # Best-effort removal of the partially populated directory.
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        raise
+    return temp_dir
+def create_download_content(files: List[FileInfo]) -> str:
+    """Render the scanned files as a single Markdown document.
+    Each file gets a heading with its path, a size line, an encoding line
+    and, when it has content, a fenced code block tagged with the file
+    extension (without the leading dot).
+    """
+    sections = ["# スキャン結果\n\n"]
+    for file in files:
+        text = file.content
+        # FileInfo in core.file_scanner declares only `path` and `content`,
+        # so derive the remaining values when the attributes are absent.
+        suffix = getattr(file, "extension", None)
+        if suffix is None:
+            suffix = Path(file.path).suffix
+        size_label = getattr(file, "formatted_size", None)
+        if size_label is None:
+            size_label = f"{len((text or '').encode('utf-8'))} bytes"
+        encoding = getattr(file, "encoding", None)
+        sections.append(f"## {file.path}\n")
+        sections.append(f"サイズ: {size_label}\n")
+        sections.append(f"エンコーディング: {encoding or '不明'}\n\n")
+        if text:
+            sections.append(f"```{suffix[1:] if suffix else ''}\n")
+            sections.append(text)
+            sections.append("\n```\n\n")
+    # Join once instead of growing a string with repeated `+=`.
+    return "".join(sections)
+# Initialise session state: each key is created only when it is not
+# already present in st.session_state.
+if 'repo_content' not in st.session_state:
+    st.session_state.repo_content = None
+if 'temp_dir' not in st.session_state:
+    st.session_state.temp_dir = None
+if 'llm_service' not in st.session_state:
+    try:
+        st.session_state.llm_service = LLMService()
+    except ValueError as e:
+        # Show the error raised while constructing the service and halt.
+        st.error(str(e))
+        st.stop()
+# Main UI layout
+st.title("🔍 リポジトリ解析・質問システム")
+# Model selection in the sidebar (only shown when more than one model is available)
+available_models = st.session_state.llm_service.settings.get_available_models()
+if len(available_models) > 1:
+    selected_model = st.sidebar.selectbox(
+        "使用するモデル",
+        available_models,
+        index=available_models.index(st.session_state.llm_service.current_model)
+    )
+    st.session_state.llm_service.switch_model(selected_model)
+# Repository URL input
+repo_url = st.text_input(
+    "GitHubリポジトリのURLを入力",
+    placeholder="https://github.com/username/repository.git"
+)
+# Scan button: clone the repository, scan it and cache the results.
+if st.button("スキャン開始", disabled=not repo_url):
+    try:
+        with st.spinner('リポジトリをクローン中...'):
+            temp_dir = clone_repository(repo_url)
+            st.session_state.temp_dir = temp_dir
+        with st.spinner('ファイルをスキャン中...'):
+            scanner = FileScanner(temp_dir)
+            files = scanner.scan_files()  # List[FileInfo]
+            # Keep the scan results in session state. The question section
+            # below also runs on reruns where this button branch is skipped,
+            # so a local variable would be undefined there.
+            st.session_state.scanned_files = files
+            st.session_state.repo_content = LLMService.format_code_content(files)
+        st.success(f"スキャン完了: {len(files)}個のファイルを検出")
+        # Start a fresh conversation for the newly scanned repository.
+        st.session_state.llm_service.clear_history()
+    except Exception as e:
+        st.error(f"エラーが発生しました: {str(e)}")
+# Question section, shown once a scan has produced content.
+if st.session_state.repo_content:
+    st.divider()
+    st.subheader("💭 コードについて質問する")
+    # Download button for the scan results, built from the cached file list.
+    scanned_files = st.session_state.scanned_files if 'scanned_files' in st.session_state else []
+    scan_result = create_download_content(scanned_files)
+    st.download_button(
+        label="スキャン結果をダウンロード",
+        data=scan_result,
+        file_name=f"scan_result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md",
+        mime="text/markdown"
+    )
+    # Conversation history: only the assistant's answers are displayed.
+    # NOTE(review): the answer text is inserted into raw HTML with
+    # unsafe_allow_html=True; model output derived from an untrusted
+    # repository could therefore inject markup. Consider sanitising.
+    for message in st.session_state.llm_service.conversation_history:
+        if message.role == "assistant":
+            st.markdown(f'<div class="chat-message assistant-message">{message.content}</div>',
+                       unsafe_allow_html=True)
+    query = st.text_area(
+        "質問を入力してください",
+        placeholder="例: このコードの主な機能は何ですか？"
+    )
+    col1, col2 = st.columns([1, 5])
+    with col1:
+        if st.button("履歴クリア"):
+            st.session_state.llm_service.clear_history()
+            st.rerun()
+    with col2:
+        if st.button("質問する", disabled=not query):
+            with st.spinner('回答を生成中...'):
+                response, error = st.session_state.llm_service.get_response(
+                    st.session_state.repo_content,
+                    query
+                )
+                if error:
+                    st.error(error)
+                else:
+                    st.rerun()  # rerun so the updated history is rendered
+# Remove the cloned repository at the end of a script run. The scan has
+# already stored the formatted contents in st.session_state.repo_content,
+# so the clone on disk is no longer read after this point.
+if st.session_state.temp_dir and Path(st.session_state.temp_dir).exists():
+    import shutil
+    try:
+        shutil.rmtree(st.session_state.temp_dir)
+    except OSError:
+        # Best effort: a leftover temporary directory must not break the UI.
+        pass
+    else:
+        st.session_state.temp_dir = None
+# Sidebar help text
+with st.sidebar:
+    st.subheader("📌 使い方")
+    st.markdown("""
+    1. GitHubリポジトリのURLを入力
+    2. スキャンを実行
+    3. コードについて質問（最大5ターンの会話が可能）
+    """)
+    st.subheader("🔍 スキャン対象")
+    st.markdown("""
+    - Python (.py)
+    - JavaScript (.js)
+    - Java (.java)
+    - C/C++ (.c, .h, .cpp, .hpp)
+    - その他の主要なプログラミング言語
+    """)

config/__init__.py ADDED Viewed

File without changes

config/llm_setting.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import os
+from dotenv import load_dotenv
+from typing import Literal
+class LLMSettings:
+    """API keys and default model choice, read from the environment.
+    ``load_dotenv()`` is called first, then ``OPENAI_API_KEY``,
+    ``ANTHROPIC_API_KEY`` and ``DEFAULT_LLM`` (default ``'claude'``) are read.
+    """
+    def __init__(self):
+        """Load the settings.
+        Raises:
+            ValueError: if neither API key is set.
+        """
+        load_dotenv()
+        self.openai_api_key = os.getenv('OPENAI_API_KEY')
+        self.anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
+        self.default_llm = os.getenv('DEFAULT_LLM', 'claude')
+        # At least one API key is required.
+        if not self.openai_api_key and not self.anthropic_api_key:
+            raise ValueError("少なくとも1つのAPIキーが必要です。")
+        # The configured default may name a provider whose key is missing
+        # (e.g. the 'claude' default with only OPENAI_API_KEY set); fall back
+        # to a model that can actually be used.
+        available = self.get_available_models()
+        if self.default_llm not in available:
+            self.default_llm = available[0]
+    def get_available_models(self) -> list[Literal['claude', 'gpt']]:
+        """Return the models whose API key is configured, 'claude' first."""
+        models = []
+        if self.anthropic_api_key:
+            models.append('claude')
+        if self.openai_api_key:
+            models.append('gpt')
+        return models

config/settings.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from pathlib import Path
+from datetime import datetime
+class Settings:
+    """Output locations and the timestamp format used to name them."""
+    DEFAULT_OUTPUT_DIR = Path("output")
+    TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"
+    @classmethod
+    def get_timestamp(cls) -> str:
+        """Return the current local time formatted with ``TIMESTAMP_FORMAT``."""
+        now = datetime.now()
+        return now.strftime(cls.TIMESTAMP_FORMAT)
+    @classmethod
+    def get_clone_dir(cls, timestamp: str) -> Path:
+        """Return the clone directory path for ``timestamp`` under the output dir."""
+        dir_name = "repo_clone_{}".format(timestamp)
+        return cls.DEFAULT_OUTPUT_DIR.joinpath(dir_name)
+    @classmethod
+    def get_output_file(cls, timestamp: str) -> Path:
+        """Return the result file path for ``timestamp`` under the output dir."""
+        file_name = "scan_result_{}.txt".format(timestamp)
+        return cls.DEFAULT_OUTPUT_DIR.joinpath(file_name)

core/__init__.py ADDED Viewed

File without changes

core/file_scanner.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from pathlib import Path
+from typing import List, Dict, Optional
+from dataclasses import dataclass
+@dataclass
+class FileInfo:
+    """A scanned source file.
+    ``path`` is relative to the scan root and ``content`` is the decoded text.
+    ``size`` and ``encoding`` have defaults so that existing
+    ``FileInfo(path=..., content=...)`` calls keep working.
+    """
+    path: Path
+    content: Optional[str] = None
+    size: int = 0  # size on disk in bytes
+    encoding: Optional[str] = None  # codec that decoded `content`; None if unknown
+    @property
+    def extension(self) -> str:
+        """File suffix including the leading dot ('' when there is none)."""
+        return self.path.suffix
+    @property
+    def formatted_size(self) -> str:
+        """Human-readable size, e.g. '12 B', '2.0 KB'."""
+        size = float(self.size)
+        for unit in ("B", "KB", "MB"):
+            if size < 1024:
+                return f"{int(size)} {unit}" if unit == "B" else f"{size:.1f} {unit}"
+            size /= 1024
+        return f"{size:.1f} GB"
+class FileScanner:
+    """Recursively collect source files below ``base_dir``."""
+    # File extensions that are scanned
+    TARGET_EXTENSIONS = {
+        '.py', '.js', '.java', '.cpp', '.hpp', '.c', '.h',
+        '.go', '.rs', '.php', '.rb', '.ts', '.scala', '.kt',
+        '.cs', '.swift', '.m', '.sh', '.pl', '.r'
+    }
+    # Directory names that are skipped
+    EXCLUDED_DIRS = {
+        '.git', '__pycache__', 'node_modules', 'venv', '.env',
+        'build', 'dist', 'target', 'bin', 'obj'
+    }
+    # Encodings tried in order when reading a file
+    CANDIDATE_ENCODINGS = ('utf-8', 'cp932')
+    def __init__(self, base_dir: Path):
+        self.base_dir = base_dir
+    def _should_scan_file(self, path: Path) -> bool:
+        """Return True if ``path`` has a target extension and is not excluded.
+        Exclusion is checked on the path relative to ``base_dir``; otherwise a
+        scan root that itself lives under e.g. ``build/`` or ``bin/`` would
+        have every file rejected.
+        """
+        try:
+            relative = path.relative_to(self.base_dir)
+        except ValueError:
+            # Not under base_dir: fall back to checking the whole path.
+            relative = path
+        if any(part in self.EXCLUDED_DIRS for part in relative.parts):
+            return False
+        return path.suffix.lower() in self.TARGET_EXTENSIONS
+    def _decode_file(self, file_path: Path) -> Optional[tuple]:
+        """Return ``(text, encoding)`` for the first codec that decodes the file, else None."""
+        for encoding in self.CANDIDATE_ENCODINGS:
+            try:
+                with file_path.open('r', encoding=encoding) as f:
+                    return f.read(), encoding
+            except UnicodeDecodeError:
+                continue
+            except OSError:
+                return None
+        return None
+    def _read_file_content(self, file_path: Path) -> Optional[str]:
+        """Return the decoded text of ``file_path``, or None if it cannot be read."""
+        decoded = self._decode_file(file_path)
+        return None if decoded is None else decoded[0]
+    def scan_files(self) -> List[FileInfo]:
+        """Scan ``base_dir`` and return readable target files sorted by path.
+        Raises:
+            FileNotFoundError: if ``base_dir`` does not exist.
+        """
+        if not self.base_dir.exists():
+            raise FileNotFoundError(f"Directory not found: {self.base_dir}")
+        files = []
+        for entry in self.base_dir.rglob('*'):
+            if not (entry.is_file() and self._should_scan_file(entry)):
+                continue
+            decoded = self._decode_file(entry)
+            if decoded is None:
+                continue
+            content, encoding = decoded
+            try:
+                size = entry.stat().st_size
+            except OSError:
+                # File vanished or became unreadable after decoding; use the text length.
+                size = len(content.encode(encoding))
+            files.append(FileInfo(
+                path=entry.relative_to(self.base_dir),
+                content=content,
+                size=size,
+                encoding=encoding
+            ))
+        return sorted(files, key=lambda x: str(x.path))

core/git_manager.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import subprocess
+from pathlib import Path
+class GitManager:
+    """Clone a Git repository into ``target_dir`` and remove it afterwards."""
+    def __init__(self, repo_url: str, target_dir: Path):
+        self.repo_url = repo_url
+        self.target_dir = target_dir
+    def clone_repository(self) -> bool:
+        """Run ``git clone`` of ``repo_url`` into ``target_dir``.
+        Returns:
+            True when the clone succeeds.
+        Raises:
+            FileExistsError: if ``target_dir`` already exists.
+            RuntimeError: if ``git clone`` exits with a non-zero status.
+        """
+        if self.target_dir.exists():
+            raise FileExistsError(f"Directory already exists: {self.target_dir}")
+        self.target_dir.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            subprocess.run(
+                # "--" ends option parsing so a URL starting with "-" cannot
+                # be interpreted as a git option.
+                ["git", "clone", "--", self.repo_url, str(self.target_dir)],
+                check=True,
+                capture_output=True,
+                text=True
+            )
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"Clone error: {e.stderr}") from e
+        return True
+    def cleanup(self):
+        """Delete ``target_dir`` and everything below it, if it exists."""
+        # shutil.rmtree instead of spawning `rm -rf`, which is unavailable on Windows.
+        import shutil
+        if self.target_dir.exists():
+            shutil.rmtree(self.target_dir)

main.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import sys
+from pathlib import Path
+from config.settings import Settings
+from core.git_manager import GitManager
+from core.file_scanner import FileScanner
+from utils.file_writer import FileWriter
+def main():
+    """Scan a GitHub repository or a local directory and write the result file.
+    The single target is read from ``sys.argv``. Returns 0 on success and 1
+    on a usage error or any failure.
+    """
+    # Read the target from the command line.
+    if len(sys.argv) != 2:
+        print("Usage: python main.py <github_url or directory_path>")
+        return 1
+    target_path = sys.argv[1]
+    timestamp = Settings.get_timestamp()
+    output_file = Settings.get_output_file(timestamp)
+    # Decide whether the target is a GitHub URL or a local path.
+    is_github = target_path.startswith(('http://', 'https://')) and 'github.com' in target_path
+    # Initialise before `try` so the `finally` block can always read them,
+    # even when cloning raises before they would otherwise be assigned.
+    git_manager = None
+    cleanup_needed = False
+    try:
+        if is_github:
+            # GitHub repository: clone it first.
+            clone_dir = Settings.get_clone_dir(timestamp)
+            print(f"Cloning repository: {target_path}")
+            git_manager = GitManager(target_path, clone_dir)
+            git_manager.clone_repository()
+            scanner = FileScanner(clone_dir)
+            cleanup_needed = True
+        else:
+            # Local directory.
+            target_dir = Path(target_path)
+            if not target_dir.exists():
+                print(f"Error: Directory not found: {target_dir}")
+                return 1
+            scanner = FileScanner(target_dir)
+        # Scan and save.
+        print("Scanning files...")
+        files = scanner.scan_files()
+        print(f"Writing contents to {output_file}")
+        writer = FileWriter(output_file)
+        writer.write_contents(files)
+        print(f"Found {len(files)} files")
+        print(f"Results saved to {output_file}")
+    except Exception as e:
+        print(f"Error: {e}")
+        return 1
+    finally:
+        # Remove the clone only after a successful clone.
+        if cleanup_needed and git_manager is not None:
+            try:
+                git_manager.cleanup()
+                print("Cleanup completed")
+            except Exception as e:
+                print(f"Cleanup error: {e}")
+    return 0
+if __name__ == "__main__":
+    # sys.exit rather than the site-provided `exit`, which is not
+    # guaranteed to exist (e.g. under `python -S`).
+    sys.exit(main())

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit
+openai
+anthropic
+gitpython
+python-dotenv
+pathlib
+chardet

scan.sh ADDED Viewed

	@@ -0,0 +1,49 @@

+#!/bin/bash
+# Scan a GitHub repository or a local directory with main.py.
+# Usage: ./scan.sh [github_url | directory_path]
+# Stop on unexpected errors.
+set -e
+# Target to scan: the first argument if given, otherwise the default below.
+DEFAULT_TARGET_PATH="https://github.com/DeL-TaiseiOzaki/idebate_scraping.git"  # example GitHub repository
+# DEFAULT_TARGET_PATH="/path/to/your/directory"  # example local directory
+TARGET_PATH="${1:-$DEFAULT_TARGET_PATH}"
+# Make sure the output directory exists.
+if [ ! -d "output" ]; then
+    mkdir output
+fi
+# Check that Python is available.
+if ! command -v python3 &> /dev/null; then
+    echo "Error: Python3 is not installed"
+    exit 1
+fi
+# For a GitHub repository, check that Git is available.
+if [[ "$TARGET_PATH" == http* ]] && [[ "$TARGET_PATH" == *github.com* ]]; then
+    if ! command -v git &> /dev/null; then
+        echo "Error: Git is not installed"
+        exit 1
+    fi
+    echo "Scanning GitHub repository: $TARGET_PATH"
+else
+    if [ ! -d "$TARGET_PATH" ]; then
+        echo "Error: Directory not found: $TARGET_PATH"
+        exit 1
+    fi
+    echo "Scanning local directory: $TARGET_PATH"
+fi
+# Run the scan. The command is the `if` condition so that `set -e` does not
+# abort the script before the failure can be reported.
+echo "Starting directory scan..."
+if python3 main.py "$TARGET_PATH"; then
+    echo "Scan completed successfully!"
+    echo "Results are saved in the 'output' directory"
+else
+    exit_code=$?
+    echo "Scan failed with exit code: $exit_code"
+    exit $exit_code
+fi

services/llm_service.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from typing import Optional, List, Dict, Any
+import openai
+import anthropic
+from dataclasses import dataclass
+# The settings module added in this commit is config/llm_setting.py (singular).
+from config.llm_setting import LLMSettings
+from core.file_scanner import FileInfo
+@dataclass
+class Message:
+    """One conversation entry; ``role`` is "user" or "assistant"."""
+    role: str
+    content: str
+class LLMService:
+    """Answer questions about scanned code with Claude or GPT, keeping a short history."""
+    MAX_TURNS = 5
+    CLAUDE_MODEL = "claude-3-sonnet-20240229"
+    GPT_MODEL = "gpt-4o"
+    def __init__(self):
+        """Load settings and create a client for every provider that has an API key."""
+        self.settings = LLMSettings()
+        available = self.settings.get_available_models()
+        self.current_model = self.settings.default_llm
+        # The configured default may name a provider without a key; use one
+        # that has a client instead.
+        if self.current_model not in available:
+            self.current_model = available[0]
+        # Initialise the API clients.
+        self.claude_client = None
+        self.openai_client = None
+        if self.settings.anthropic_api_key:
+            self.claude_client = anthropic.Anthropic(api_key=self.settings.anthropic_api_key)
+        if self.settings.openai_api_key:
+            openai.api_key = self.settings.openai_api_key
+            # openai>=1.0 removed openai.ChatCompletion; requests go through a client object.
+            self.openai_client = openai.OpenAI(api_key=self.settings.openai_api_key)
+        self.conversation_history: List[Message] = []
+    def switch_model(self, model: str):
+        """Switch the active model.
+        Raises:
+            ValueError: if ``model`` is not among the available models.
+        """
+        if model not in self.settings.get_available_models():
+            raise ValueError(f"モデル {model} は利用できません")
+        self.current_model = model
+    def create_prompt(self, content: str, query: str) -> str:
+        """Build the user prompt from the scanned code and the question."""
+        return f"""以下はGitHubリポジトリのコード解析結果です。このコードについて質問に答えてください。
+コード解析結果:
+{content}
+質問: {query}
+できるだけ具体的に、コードの内容を参照しながら回答してください。"""
+    def _add_to_history(self, role: str, content: str):
+        """Append a message and keep at most ``MAX_TURNS`` turns of history."""
+        self.conversation_history.append(Message(role=role, content=content))
+        limit = self.MAX_TURNS * 2  # each turn is a question plus an answer
+        if len(self.conversation_history) > limit:
+            trimmed = self.conversation_history[-limit:]
+            # Drop whole turns only: a plain slice can leave an assistant
+            # message first, and the history is sent to the API as-is.
+            while trimmed and trimmed[0].role != "user":
+                trimmed.pop(0)
+            self.conversation_history = trimmed
+    def _format_messages_for_claude(self) -> List[Dict[str, str]]:
+        """Return the history as role/content dicts for the Claude API."""
+        return [{"role": msg.role, "content": msg.content}
+                for msg in self.conversation_history]
+    def _format_messages_for_gpt(self) -> List[Dict[str, str]]:
+        """Return the history for the GPT API, preceded by a system message."""
+        return [
+            {"role": "system", "content": "あなたはコードアナリストとして、リポジトリの解析と質問への回答を行います。"},
+            *[{"role": msg.role, "content": msg.content}
+              for msg in self.conversation_history]
+        ]
+    def get_conversation_history(self) -> List[Dict[str, str]]:
+        """Return the conversation history as a list of role/content dicts."""
+        return [{"role": msg.role, "content": msg.content}
+                for msg in self.conversation_history]
+    def clear_history(self):
+        """Discard the conversation history."""
+        self.conversation_history = []
+    def get_response(self, content: str, query: str) -> tuple[Optional[str], Optional[str]]:
+        """Ask the active model ``query`` about ``content``.
+        Returns:
+            ``(answer, None)`` on success or ``(None, error_message)`` on failure.
+        """
+        # NOTE(review): every user message embeds the full scanned content,
+        # so multi-turn requests resend it once per stored turn.
+        prompt = self.create_prompt(content, query)
+        self._add_to_history("user", prompt)
+        try:
+            if self.current_model == 'claude':
+                response = self.claude_client.messages.create(
+                    model=self.CLAUDE_MODEL,
+                    max_tokens=4000,
+                    messages=self._format_messages_for_claude()
+                )
+                answer = response.content[0].text
+            else:  # gpt
+                response = self.openai_client.chat.completions.create(
+                    model=self.GPT_MODEL,
+                    messages=self._format_messages_for_gpt()
+                )
+                answer = response.choices[0].message.content
+        except Exception as e:
+            # Remove the unanswered question so the history keeps alternating
+            # user/assistant and a retry does not send two user messages in a row.
+            if self.conversation_history and self.conversation_history[-1].role == "user":
+                self.conversation_history.pop()
+            return None, f"エラーが発生しました: {str(e)}"
+        self._add_to_history("assistant", answer)
+        return answer, None
+    @staticmethod
+    def format_code_content(files: List[FileInfo]) -> str:
+        """Format the scanned files into the text block used in prompts."""
+        formatted_content = []
+        for file_info in files:
+            formatted_content.append(
+                f"#ファイルパス\n{file_info.path}\n------------\n{file_info.content}\n"
+            )
+        return "\n".join(formatted_content)

utils/__init__.py ADDED Viewed

File without changes

utils/file_writer.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from pathlib import Path
+from typing import List
+from core.file_scanner import FileInfo
+class FileWriter:
+    """Write scanned files into a single UTF-8 text file."""
+    def __init__(self, output_file: Path):
+        self.output_file = output_file
+    def write_contents(self, files: List[FileInfo]) -> None:
+        """Write one section per file: a path header, a separator, then the content.
+        The parent directory of the output file is created if needed. A file
+        whose ``content`` is None gets a placeholder line instead.
+        """
+        self.output_file.parent.mkdir(parents=True, exist_ok=True)
+        with self.output_file.open('w', encoding='utf-8') as out:
+            for entry in files:
+                body = "# Failed to read content" if entry.content is None else entry.content
+                # Path header, separator, content, then a blank line between sections.
+                out.write(f"#ファイルパス\n{entry.path}\n------------\n{body}\n\n")

utils/logger.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from pathlib import Path
+from typing import List
+from datetime import datetime
+from core.file_scanner import FileInfo
+class ScanLogger:
+    """Write a summary of a scan to a log file."""
+    def __init__(self, log_file: Path):
+        self.log_file = log_file
+    def write_log(self, repo_url: str, files: List[FileInfo], stats: dict):
+        """Write the scan time, repository, file count, stats and file list.
+        Args:
+            repo_url: value written on the repository line.
+            files: scanned files; each is listed with its path and size.
+            stats: mapping written as "<key>: <value>個" lines
+                (presumably extension -> count; confirm against the caller).
+        """
+        self.log_file.parent.mkdir(parents=True, exist_ok=True)
+        with self.log_file.open('w', encoding='utf-8') as f:
+            f.write(f"スキャン日時: {datetime.now()}\n")
+            f.write(f"リポジトリ: {repo_url}\n")
+            f.write(f"ファイル数: {len(files)}\n\n")
+            f.write("=== ファイル種類の統計 ===\n")
+            for ext, count in stats.items():
+                f.write(f"{ext}: {count}個\n")
+            f.write("\n")
+            f.write("=== ファイルパス一覧 ===\n")
+            for file_info in files:
+                # FileInfo in core.file_scanner declares only `path` and
+                # `content`; fall back to the UTF-8 byte length of the content
+                # when no `formatted_size` attribute is present.
+                size_label = getattr(file_info, "formatted_size", None)
+                if size_label is None:
+                    size_label = f"{len((file_info.content or '').encode('utf-8'))} bytes"
+                f.write(f"{file_info.path} ({size_label})\n")