CodeReviewAgent / src /core /language_detector.py
c1r3x's picture
Review Agent: first commit
88d205f
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Language Detector
This module provides functionality for detecting programming languages in a repository.
"""
import os
import logging
from collections import Counter
logger = logging.getLogger(__name__)
# Maps a file extension (leading dot included, lowercase) to the language
# name reported for files carrying that extension.
EXTENSION_TO_LANGUAGE = {
    # General-purpose programming languages
    '.py': 'Python',
    '.js': 'JavaScript',
    '.jsx': 'JavaScript',
    '.ts': 'TypeScript',
    '.tsx': 'TypeScript',
    '.java': 'Java',
    '.go': 'Go',
    '.rs': 'Rust',

    # C family ('.h' is attributed to C, '.hpp' to C++)
    '.cpp': 'C++',
    '.cc': 'C++',
    '.cxx': 'C++',
    '.c': 'C',
    '.h': 'C',
    '.hpp': 'C++',
    '.cs': 'C#',

    # Other programming languages
    '.php': 'PHP',
    '.rb': 'Ruby',
    '.swift': 'Swift',
    '.kt': 'Kotlin',
    '.scala': 'Scala',
    '.r': 'R',

    # Shell scripts
    '.sh': 'Shell',
    '.bash': 'Shell',
    '.zsh': 'Shell',

    # Markup and stylesheets
    '.html': 'HTML',
    '.htm': 'HTML',
    '.css': 'CSS',
    '.scss': 'SCSS',
    '.sass': 'SCSS',
    '.less': 'Less',
    '.md': 'Markdown',

    # Data and configuration formats
    '.json': 'JSON',
    '.xml': 'XML',
    '.yaml': 'YAML',
    '.yml': 'YAML',

    # Query languages
    '.sql': 'SQL',
    '.graphql': 'GraphQL',
    '.gql': 'GraphQL',
}
# Maps an exact file name (no directory part) to the language or tool it is
# attributed to; intended for well-known files whose extension alone would be
# missing or misleading.
SPECIAL_FILES_TO_LANGUAGE = {
    # Container tooling
    'Dockerfile': 'Docker',
    'docker-compose.yml': 'Docker',
    'docker-compose.yaml': 'Docker',

    # Build systems
    'Makefile': 'Make',
    'CMakeLists.txt': 'CMake',

    # Project manifests, attributed to the ecosystem they describe
    'package.json': 'JavaScript',
    'tsconfig.json': 'TypeScript',
    'requirements.txt': 'Python',
    'setup.py': 'Python',
    'pom.xml': 'Java',
    'build.gradle': 'Java',
    'Cargo.toml': 'Rust',
    'go.mod': 'Go',
}
class LanguageDetector:
    """
    Detects programming languages in a repository.

    Files are classified by exact file name first (SPECIAL_FILES_TO_LANGUAGE)
    and then by extension (EXTENSION_TO_LANGUAGE). Hidden directories and a
    fixed set of dependency/build directories are never descended into.
    """

    # Directory names pruned from the walk, in addition to any directory whose
    # name starts with '.' (which already covers '.git').
    _SKIPPED_DIRS = frozenset({'node_modules', 'venv', '__pycache__', 'dist', 'build'})

    # Languages that detect_languages() is allowed to report.
    _SUPPORTED_LANGUAGES = frozenset({
        "Python", "JavaScript", "TypeScript", "Java",
        "Go", "Rust", "C++", "C#", "PHP", "Ruby",
        "Swift", "Kotlin", "Scala", "R", "Shell",
    })

    def __init__(self):
        """
        Initialize the LanguageDetector.
        """
        logger.info("Initialized LanguageDetector")

    @staticmethod
    def _language_for_file(file_name):
        """
        Classify a single file by its name.

        Args:
            file_name (str): The file name without any directory part.

        Returns:
            str or None: The mapped language, or None if the file is not recognized.
        """
        special = SPECIAL_FILES_TO_LANGUAGE.get(file_name)
        if special is not None:
            return special
        _, ext = os.path.splitext(file_name)
        # Extensions are matched case-insensitively so that e.g. 'analysis.R'
        # or 'SCHEMA.SQL' are recognized; the mapping keys are all lowercase.
        return EXTENSION_TO_LANGUAGE.get(ext.lower())

    def _iter_recognized_files(self, repo_path):
        """
        Walk the repository and yield every file with a recognized language.

        Args:
            repo_path (str): The path to the repository.

        Yields:
            tuple: (file_path, language) for each recognized file.
        """
        for root, dirs, files in os.walk(repo_path):
            # Assigning to dirs[:] prunes the walk in place so os.walk never
            # descends into hidden or dependency/build directories.
            dirs[:] = [
                d for d in dirs
                if not d.startswith('.') and d not in self._SKIPPED_DIRS
            ]
            for file_name in files:
                language = self._language_for_file(file_name)
                if language is not None:
                    yield os.path.join(root, file_name), language

    def detect_languages(self, repo_path):
        """
        Detect programming languages in a repository.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            list: A list of detected programming languages, sorted by prevalence
                (number of recognized files, most common first). Only languages
                in the supported set are reported.
        """
        logger.info("Detecting languages in repository: %s", repo_path)
        language_counter = Counter(
            language for _, language in self._iter_recognized_files(repo_path)
        )
        detected_languages = [
            lang for lang, _ in language_counter.most_common()
            if lang in self._SUPPORTED_LANGUAGES
        ]
        logger.info("Detected languages: %s", detected_languages)
        return detected_languages

    def get_language_breakdown(self, repo_path):
        """
        Get a breakdown of programming languages in a repository by lines of code.

        Every physical line is counted, including blank lines and comments.
        Files that cannot be read are logged and skipped.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            dict: A dictionary mapping languages to lines of code.
        """
        logger.info("Getting language breakdown for repository: %s", repo_path)
        language_loc = {}
        for file_path, language in self._iter_recognized_files(repo_path):
            try:
                # errors='ignore' lets non-UTF-8 files be line-counted
                # instead of failing on undecodable bytes.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    line_count = sum(1 for _ in f)
            except OSError as e:
                logger.warning("Error counting lines in %s: %s", file_path, e)
                continue
            language_loc[language] = language_loc.get(language, 0) + line_count
        logger.info("Language breakdown: %s", language_loc)
        return language_loc