#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Language Detector

This module provides functionality for detecting programming languages in a repository.
"""

import os
import logging
from collections import Counter

logger = logging.getLogger(__name__)

# File extension to language mapping
EXTENSION_TO_LANGUAGE = {
    '.py': 'Python',
    '.js': 'JavaScript',
    '.jsx': 'JavaScript',
    '.ts': 'TypeScript',
    '.tsx': 'TypeScript',
    '.java': 'Java',
    '.go': 'Go',
    '.rs': 'Rust',
    '.cpp': 'C++',
    '.cc': 'C++',
    '.cxx': 'C++',
    '.c': 'C',
    '.h': 'C',
    '.hpp': 'C++',
    '.cs': 'C#',
    '.php': 'PHP',
    '.rb': 'Ruby',
    '.swift': 'Swift',
    '.kt': 'Kotlin',
    '.scala': 'Scala',
    '.r': 'R',
    '.sh': 'Shell',
    '.bash': 'Shell',
    '.zsh': 'Shell',
    '.html': 'HTML',
    '.htm': 'HTML',
    '.css': 'CSS',
    '.scss': 'SCSS',
    '.sass': 'SCSS',
    '.less': 'Less',
    '.md': 'Markdown',
    '.json': 'JSON',
    '.xml': 'XML',
    '.yaml': 'YAML',
    '.yml': 'YAML',
    '.sql': 'SQL',
    '.graphql': 'GraphQL',
    '.gql': 'GraphQL',
}

# Special files to language mapping
SPECIAL_FILES_TO_LANGUAGE = {
    'Dockerfile': 'Docker',
    'docker-compose.yml': 'Docker',
    'docker-compose.yaml': 'Docker',
    'Makefile': 'Make',
    'CMakeLists.txt': 'CMake',
    'package.json': 'JavaScript',
    'tsconfig.json': 'TypeScript',
    'requirements.txt': 'Python',
    'setup.py': 'Python',
    'pom.xml': 'Java',
    'build.gradle': 'Java',
    'Cargo.toml': 'Rust',
    'go.mod': 'Go',
}

# Non-hidden directories that are never descended into. Hidden directories
# (names starting with '.') are skipped separately, which also covers '.git'.
_SKIPPED_DIRS = frozenset({'node_modules', 'venv', '__pycache__', 'dist', 'build'})

# Languages that detect_languages() is allowed to report.
_SUPPORTED_LANGUAGES = frozenset({
    "Python", "JavaScript", "TypeScript", "Java", "Go", "Rust", "C++", "C#",
    "PHP", "Ruby", "Swift", "Kotlin", "Scala", "R", "Shell",
})


def _language_for_file(filename):
    """
    Map a bare file name to a language name.

    Special file names (exact match) take precedence over extensions.

    Args:
        filename (str): The file name without any directory component.

    Returns:
        str or None: The language name, or None if the file is not recognised.
    """
    special = SPECIAL_FILES_TO_LANGUAGE.get(filename)
    if special is not None:
        return special

    _, ext = os.path.splitext(filename)
    language = EXTENSION_TO_LANGUAGE.get(ext)
    if language is None:
        # Fall back to a case-insensitive match so that names such as
        # 'model.R' or 'MAIN.PY' are recognised; an exact match always wins.
        language = EXTENSION_TO_LANGUAGE.get(ext.lower())
    return language


def _iter_classified_files(repo_path):
    """
    Walk a repository and yield every file whose language is recognised.

    Args:
        repo_path (str): The path to the repository.

    Yields:
        tuple: (file_path, language) for each recognised file.
    """
    for root, dirs, files in os.walk(repo_path):
        # Prune in place so os.walk does not descend into hidden directories
        # or common non-code directories.
        dirs[:] = [
            d for d in dirs
            if not d.startswith('.') and d not in _SKIPPED_DIRS
        ]
        for name in files:
            language = _language_for_file(name)
            if language is not None:
                yield os.path.join(root, name), language


class LanguageDetector:
    """
    Detects programming languages in a repository.
    """

    def __init__(self):
        """
        Initialize the LanguageDetector.
        """
        logger.info("Initialized LanguageDetector")

    def detect_languages(self, repo_path):
        """
        Detect programming languages in a repository.

        Each recognised file counts once towards its language. Only languages
        in the supported set are reported.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            list: A list of detected programming languages, sorted by prevalence
                (most files first). Languages with equal file counts are
                ordered alphabetically so the result is deterministic.
        """
        logger.info("Detecting languages in repository: %s", repo_path)

        language_counter = Counter(
            language for _, language in _iter_classified_files(repo_path)
        )

        # Sort by descending file count, then by name to break ties.
        ranked = sorted(
            language_counter.items(),
            key=lambda item: (-item[1], item[0]),
        )
        detected_languages = [
            lang for lang, _ in ranked if lang in _SUPPORTED_LANGUAGES
        ]

        logger.info("Detected languages: %s", detected_languages)
        return detected_languages

    def get_language_breakdown(self, repo_path):
        """
        Get a breakdown of programming languages in a repository by lines of code.

        Every line of a recognised file is counted, including blank lines and
        comments. Files that cannot be read are logged and skipped.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            dict: A dictionary mapping languages to lines of code.
        """
        logger.info("Getting language breakdown for repository: %s", repo_path)

        language_loc = {}
        for file_path, language in _iter_classified_files(repo_path):
            try:
                # errors='ignore' drops undecodable bytes instead of raising.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    line_count = sum(1 for _ in f)
            except (OSError, ValueError) as e:
                logger.warning("Error counting lines in %s: %s", file_path, e)
                continue
            language_loc[language] = language_loc.get(language, 0) + line_count

        logger.info("Language breakdown: %s", language_loc)
        return language_loc