code-summarizer / code_summarizer /language_parsers.py
Vishisht16's picture
Add application code
188e720
raw
history blame
3.46 kB
import os
from pathlib import Path
from typing import List, Tuple, Dict, Optional
import re
import ast
import logging
log = logging.getLogger(__name__)
# Note: ast.get_source_segment requires Python 3.8+
SUPPORTED_EXTENSIONS: Dict[str, str] = {
".py": "python", ".js": "javascript", ".java": "java", ".cpp": "cpp",
".c": "c", ".cs": "csharp", ".ts": "typescript", ".go": "go"
}
# Regex patterns (simplified, may need adjustment per language)
# WARNING: Regex-based parsing is fragile.
patterns = {
"javascript": r"^(?:async\s+)?function\s+\w+\s*\([^)]*\)\s*\{[\s\S]*?^\}|(?:export\s+)?(?:const|let|var)\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>\s*\{[\s\S]*?^\}",
"typescript": r"^(?:async\s+)?function\s+\w+\s*\([^)]*\)\s*\{[\s\S]*?^\}|(?:export\s+)?(?:const|let|var)\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>\s*\{[\s\S]*?^\}",
"java": r"^(?:public|private|protected|static|\s)*\s*[\w<>\[\]]+\s+\w+\s*\([^)]*\)\s*(?:throws\s+[\w,\s]+)?\s*\{[\s\S]*?^\}",
"cpp": r"^(?:[\w:]+)\s+\**\s*[\w:]+\s*\([^)]*\)\s*(?:const)?\s*\{[\s\S]*?^\}",
"c": r"^(?:[\w:]+)\s+\**\s*[\w:]+\s*\([^)]*\)\s*(?:const)?\s*\{[\s\S]*?^\}",
"csharp": r"^(?:public|private|protected|internal|static|virtual|async|override|\s)*\s*[\w<>\[\],?]+\s+\w+\s*\([^)]*\)\s*\{[\s\S]*?^\}",
"go": r"^func(?:\s*\(\s*\w+\s+\*?\w+\s*\))?\s+\w+\s*\([^)]*\)\s*(?:[\w\s,()]+)?\s*\{[\s\S]*?^\}"
}
def get_language_by_extension(file_path: Path) -> Optional[str]:
return SUPPORTED_EXTENSIONS.get(file_path.suffix.lower())
def extract_python_functions(file_path: Path) -> List[str]:
functions = []
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
source = f.read()
tree = ast.parse(source, filename=str(file_path))
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef):
try:
segment = ast.get_source_segment(source, node)
if segment:
functions.append(segment)
except Exception: # Ignore segment extraction errors
pass
except (FileNotFoundError, SyntaxError, UnicodeDecodeError) as e:
log.warning(f"Skipping file {file_path} due to parsing error: {e}")
except Exception as e:
log.error(f"Unexpected error parsing Python file {file_path}: {e}", exc_info=True)
return functions
def extract_functions_by_regex(file_path: Path, pattern: str) -> List[str]:
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
code = f.read()
return re.findall(pattern, code, re.DOTALL | re.MULTILINE)
except (FileNotFoundError, UnicodeDecodeError) as e:
log.warning(f"Skipping file {file_path} due to read error: {e}")
return []
except Exception as e:
log.error(f"Failed regex extraction on {file_path}: {e}", exc_info=True)
return []
def extract_code_snippets(file_path: Path) -> Tuple[Optional[str], List[str]]:
language = get_language_by_extension(file_path)
if language is None:
return None, []
if language == "python":
return language, extract_python_functions(file_path)
pattern = patterns.get(language)
if pattern:
return language, extract_functions_by_regex(file_path, pattern)
else:
log.debug(f"No regex pattern defined for language: {language} in file {file_path}")
return language, []