File size: 5,754 Bytes
88d205f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Language Detector

This module provides functionality for detecting programming languages in a repository.
"""

import os
import logging
from collections import Counter

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Maps a file extension (leading dot included) to the language it denotes.
# Every key is lowercase.
EXTENSION_TO_LANGUAGE = {
    # General-purpose and systems languages
    ".py": "Python",
    ".js": "JavaScript",
    ".jsx": "JavaScript",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".java": "Java",
    ".go": "Go",
    ".rs": "Rust",
    ".cpp": "C++",
    ".cc": "C++",
    ".cxx": "C++",
    ".c": "C",
    ".h": "C",
    ".hpp": "C++",
    ".cs": "C#",
    ".php": "PHP",
    ".rb": "Ruby",
    ".swift": "Swift",
    ".kt": "Kotlin",
    ".scala": "Scala",
    ".r": "R",
    # Shell scripts
    ".sh": "Shell",
    ".bash": "Shell",
    ".zsh": "Shell",
    # Web markup and stylesheets
    ".html": "HTML",
    ".htm": "HTML",
    ".css": "CSS",
    ".scss": "SCSS",
    ".sass": "SCSS",
    ".less": "Less",
    # Documentation, data and query formats
    ".md": "Markdown",
    ".json": "JSON",
    ".xml": "XML",
    ".yaml": "YAML",
    ".yml": "YAML",
    ".sql": "SQL",
    ".graphql": "GraphQL",
    ".gql": "GraphQL",
}

# Maps an exact file name (no directory part) to the language or tool it
# indicates.
SPECIAL_FILES_TO_LANGUAGE = {
    # Container and build tooling
    "Dockerfile": "Docker",
    "docker-compose.yml": "Docker",
    "docker-compose.yaml": "Docker",
    "Makefile": "Make",
    "CMakeLists.txt": "CMake",
    # Project / dependency manifests
    "package.json": "JavaScript",
    "tsconfig.json": "TypeScript",
    "requirements.txt": "Python",
    "setup.py": "Python",
    "pom.xml": "Java",
    "build.gradle": "Java",
    "Cargo.toml": "Rust",
    "go.mod": "Go",
}


class LanguageDetector:
    """
    Detects programming languages in a repository.

    Files are classified by exact file name first (SPECIAL_FILES_TO_LANGUAGE)
    and then by extension (EXTENSION_TO_LANGUAGE). Hidden directories and a
    small set of dependency/build directories are never visited.
    """

    # Non-hidden directory names pruned from every walk. Hidden directories
    # (such as '.git') are already excluded by the dot-prefix check.
    _SKIPPED_DIRS = frozenset({'node_modules', 'venv', '__pycache__', 'dist', 'build'})

    # Languages that detect_languages() may report; anything else the mappings
    # recognise (Markdown, JSON, Docker, ...) is filtered out of its result.
    # NOTE(review): 'C' is produced by EXTENSION_TO_LANGUAGE but is absent
    # here -- confirm whether that omission is intentional.
    _SUPPORTED_LANGUAGES = frozenset({
        "Python", "JavaScript", "TypeScript", "Java",
        "Go", "Rust", "C++", "C#", "PHP", "Ruby",
        "Swift", "Kotlin", "Scala", "R", "Shell",
    })

    def __init__(self):
        """
        Initialize the LanguageDetector.
        """
        logger.info("Initialized LanguageDetector")

    @staticmethod
    def _language_for(filename):
        """
        Map a bare file name to a language.

        Args:
            filename (str): The file name without any directory part.

        Returns:
            str or None: The language name, or None if the file is not recognised.
        """
        language = SPECIAL_FILES_TO_LANGUAGE.get(filename)
        if language is not None:
            return language

        ext = os.path.splitext(filename)[1]
        # Try the extension as written first so that a case-specific entry
        # could take precedence, then fall back to lowercase so files such as
        # 'analysis.R' or 'README.MD' are not silently ignored.
        language = EXTENSION_TO_LANGUAGE.get(ext)
        if language is None:
            language = EXTENSION_TO_LANGUAGE.get(ext.lower())
        return language

    def _iter_recognised_files(self, repo_path):
        """
        Walk a repository and yield every file whose language is recognised.

        Args:
            repo_path (str): The path to the repository.

        Yields:
            tuple: (file_path, language) for each recognised file.
        """
        for root, dirs, files in os.walk(repo_path):
            # Assigning through dirs[:] prunes the walk in place, so skipped
            # directories are never descended into.
            dirs[:] = [d for d in dirs
                       if not d.startswith('.') and d not in self._SKIPPED_DIRS]

            for name in files:
                language = self._language_for(name)
                if language is not None:
                    yield os.path.join(root, name), language

    def detect_languages(self, repo_path):
        """
        Detect programming languages in a repository.

        Each recognised file counts once towards its language. Only languages
        in the supported set are reported.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            list: The detected languages, most files first. Languages with the
                same file count are ordered alphabetically so the result does
                not depend on filesystem enumeration order.
        """
        logger.info("Detecting languages in repository: %s", repo_path)

        language_counter = Counter(
            language for _, language in self._iter_recognised_files(repo_path)
        )

        ranked = sorted(language_counter.items(),
                        key=lambda item: (-item[1], item[0]))
        detected_languages = [lang for lang, _ in ranked
                              if lang in self._SUPPORTED_LANGUAGES]

        logger.info("Detected languages: %s", detected_languages)
        return detected_languages

    def get_language_breakdown(self, repo_path):
        """
        Get a breakdown of languages in a repository by number of lines.

        Every line of a recognised file is counted, including blank lines and
        comments. Unlike detect_languages(), the result is not restricted to
        the supported set. Files that cannot be read are logged and skipped.

        Args:
            repo_path (str): The path to the repository.

        Returns:
            dict: A dictionary mapping each language to its total line count.
        """
        logger.info("Getting language breakdown for repository: %s", repo_path)

        language_loc = {}

        for file_path, language in self._iter_recognised_files(repo_path):
            try:
                # errors='ignore' drops undecodable bytes, so only I/O
                # failures (permissions, vanished files, ...) can occur here.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    line_count = sum(1 for _ in f)
            except OSError as e:
                logger.warning("Error counting lines in %s: %s", file_path, e)
                continue

            language_loc[language] = language_loc.get(language, 0) + line_count

        logger.info("Language breakdown: %s", language_loc)
        return language_loc