"""
Language Detector

This module provides functionality for detecting programming languages in a repository.
"""

import logging
import os
from collections import Counter

# Module-level logger, named after this module so applications can configure
# its verbosity independently.
logger = logging.getLogger(__name__)


# Maps a file extension to the language it is counted as.  Keys are
# lower-case and include the leading dot, i.e. the form returned by
# os.path.splitext() for a lower-case file name.
EXTENSION_TO_LANGUAGE = {
    # General-purpose programming languages
    '.py': 'Python',
    '.js': 'JavaScript',
    '.jsx': 'JavaScript',
    '.ts': 'TypeScript',
    '.tsx': 'TypeScript',
    '.java': 'Java',
    '.go': 'Go',
    '.rs': 'Rust',
    '.cpp': 'C++',
    '.cc': 'C++',
    '.cxx': 'C++',
    '.c': 'C',
    '.h': 'C',  # headers are attributed to C even when used from C++
    '.hpp': 'C++',
    '.cs': 'C#',
    '.php': 'PHP',
    '.rb': 'Ruby',
    '.swift': 'Swift',
    '.kt': 'Kotlin',
    '.scala': 'Scala',
    '.r': 'R',
    # Shell scripts
    '.sh': 'Shell',
    '.bash': 'Shell',
    '.zsh': 'Shell',
    # Markup and stylesheets
    '.html': 'HTML',
    '.htm': 'HTML',
    '.css': 'CSS',
    '.scss': 'SCSS',
    '.sass': 'SCSS',
    '.less': 'Less',
    # Documentation and data formats
    '.md': 'Markdown',
    '.json': 'JSON',
    '.xml': 'XML',
    '.yaml': 'YAML',
    '.yml': 'YAML',
    # Query languages
    '.sql': 'SQL',
    '.graphql': 'GraphQL',
    '.gql': 'GraphQL',
}


# Maps an exact (case-sensitive) file name to the language or tool it
# indicates.  These are well-known build, packaging and configuration files
# whose name says more about the project than their extension does.
SPECIAL_FILES_TO_LANGUAGE = {
    'Dockerfile': 'Docker',
    'docker-compose.yml': 'Docker',
    'docker-compose.yaml': 'Docker',
    'Makefile': 'Make',
    'CMakeLists.txt': 'CMake',
    'package.json': 'JavaScript',
    'tsconfig.json': 'TypeScript',
    'requirements.txt': 'Python',
    'setup.py': 'Python',
    'pom.xml': 'Java',
    'build.gradle': 'Java',
    'Cargo.toml': 'Rust',
    'go.mod': 'Go',
}


class LanguageDetector:
    """
    Detects programming languages in a repository.

    Every file below the repository root is classified, first by its exact
    file name (``SPECIAL_FILES_TO_LANGUAGE``) and otherwise by its extension
    (``EXTENSION_TO_LANGUAGE``).  Hidden directories and well-known
    dependency/build directories are never descended into.
    """

    # Directory names pruned from the walk, in addition to every hidden
    # (dot-prefixed) directory.
    SKIPPED_DIRECTORIES = frozenset(
        {'node_modules', 'venv', '.git', '__pycache__', 'dist', 'build'}
    )

    # Only these languages are reported by detect_languages(); everything
    # else that gets classified (markup, data formats, build tools) is
    # counted but filtered out of that result.
    # NOTE(review): 'C' is mapped by extension but is not listed here, so
    # pure-C repositories report nothing -- confirm whether that is intended.
    SUPPORTED_LANGUAGES = frozenset({
        "Python", "JavaScript", "TypeScript", "Java",
        "Go", "Rust", "C++", "C#", "PHP", "Ruby",
        "Swift", "Kotlin", "Scala", "R", "Shell",
    })

    def __init__(self):
        """
        Initialize the LanguageDetector.
        """
        logger.info("Initialized LanguageDetector")

    @staticmethod
    def _classify(file_name):
        """Return the language for *file_name*, or ``None`` if unrecognised."""
        language = SPECIAL_FILES_TO_LANGUAGE.get(file_name)
        if language is not None:
            return language
        _, ext = os.path.splitext(file_name)
        # Extensions are matched case-insensitively so that e.g. "model.R"
        # and "MAIN.PY" are recognised.
        return EXTENSION_TO_LANGUAGE.get(ext.lower())

    def _iter_language_files(self, repo_path):
        """Yield ``(file_path, language)`` for each recognised file in the repo."""
        if not os.path.isdir(repo_path):
            # os.walk() silently yields nothing for a bad path; make the
            # reason for an empty result visible in the logs.
            logger.warning("Repository path is not a directory: %s", repo_path)
            return

        for root, dirs, files in os.walk(repo_path):
            # Assigning to dirs[:] prunes the walk in place.  Sorting makes
            # the traversal (and therefore tie order in the results)
            # independent of the filesystem's listing order.
            dirs[:] = sorted(
                d for d in dirs
                if not d.startswith('.') and d not in self.SKIPPED_DIRECTORIES
            )
            for file_name in sorted(files):
                language = self._classify(file_name)
                if language is not None:
                    yield os.path.join(root, file_name), language

    def detect_languages(self, repo_path):
        """
        Detect programming languages in a repository.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            list: A list of detected programming languages, sorted by
                prevalence (number of files, most common first).  Only
                languages in ``SUPPORTED_LANGUAGES`` are included.  The list
                is empty when nothing is recognised or the path is not a
                directory.
        """
        logger.info("Detecting languages in repository: %s", repo_path)

        language_counter = Counter(
            language for _, language in self._iter_language_files(repo_path)
        )
        detected_languages = [
            lang for lang, _ in language_counter.most_common()
            if lang in self.SUPPORTED_LANGUAGES
        ]

        logger.info("Detected languages: %s", detected_languages)
        return detected_languages

    def get_language_breakdown(self, repo_path):
        """
        Get a breakdown of programming languages in a repository by lines of code.

        Unlike ``detect_languages``, the result is not restricted to
        ``SUPPORTED_LANGUAGES``: every classified language is included.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            dict: A dictionary mapping languages to lines of code.  Files
                that cannot be read are skipped with a warning.
        """
        logger.info("Getting language breakdown for repository: %s", repo_path)

        language_loc = {}
        for file_path, language in self._iter_language_files(repo_path):
            try:
                # Undecodable bytes are dropped rather than raised so that
                # files in other encodings still contribute a line count.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    line_count = sum(1 for _ in f)
            except OSError as e:
                logger.warning("Error counting lines in %s: %s", file_path, e)
                continue
            language_loc[language] = language_loc.get(language, 0) + line_count

        logger.info("Language breakdown: %s", language_loc)
        return language_loc