File size: 3,455 Bytes
188e720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
from pathlib import Path
from typing import List, Tuple, Dict, Optional
import re
import ast
import logging

# Module-level logger named after this module; the extraction helpers in this
# file use it to report skipped files and unexpected errors.
log = logging.getLogger(__name__)

# Note: ast.get_source_segment requires Python 3.8+
# Maps a lowercase file suffix (leading dot included) to the language name
# used to choose an extraction strategy.
SUPPORTED_EXTENSIONS: Dict[str, str] = {
    ".py": "python",
    ".js": "javascript",
    ".java": "java",
    ".cpp": "cpp",
    ".c": "c",
    ".cs": "csharp",
    ".ts": "typescript",
    ".go": "go",
}

# Regex patterns (simplified, may need adjustment per language)
# WARNING: Regex-based parsing is fragile.
# JavaScript and TypeScript share one pattern: a named ``function``
# declaration, or a const/let/var bound to an arrow function with a braced
# body. Every pattern here ends the match at the first ``}`` found at the
# start of a line (callers must pass re.MULTILINE for ``^`` to work per line).
_JS_TS_FUNCTION = r"^(?:async\s+)?function\s+\w+\s*\([^)]*\)\s*\{[\s\S]*?^\}|(?:export\s+)?(?:const|let|var)\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>\s*\{[\s\S]*?^\}"

# C and C++ share one pattern: return type, optional pointer stars, name,
# parameter list, optional ``const`` qualifier, braced body.
_C_FAMILY_FUNCTION = r"^(?:[\w:]+)\s+\**\s*[\w:]+\s*\([^)]*\)\s*(?:const)?\s*\{[\s\S]*?^\}"

# Language name -> regex used to pull whole function definitions out of a file.
patterns = {
    "javascript": _JS_TS_FUNCTION,
    "typescript": _JS_TS_FUNCTION,
    "java": r"^(?:public|private|protected|static|\s)*\s*[\w<>\[\]]+\s+\w+\s*\([^)]*\)\s*(?:throws\s+[\w,\s]+)?\s*\{[\s\S]*?^\}",
    "cpp": _C_FAMILY_FUNCTION,
    "c": _C_FAMILY_FUNCTION,
    "csharp": r"^(?:public|private|protected|internal|static|virtual|async|override|\s)*\s*[\w<>\[\],?]+\s+\w+\s*\([^)]*\)\s*\{[\s\S]*?^\}",
    "go": r"^func(?:\s*\(\s*\w+\s+\*?\w+\s*\))?\s+\w+\s*\([^)]*\)\s*(?:[\w\s,()]+)?\s*\{[\s\S]*?^\}",
}

def get_language_by_extension(file_path: Path) -> Optional[str]:
    """Return the language registered for the file's suffix, or None.

    The suffix is lowercased before the lookup in ``SUPPORTED_EXTENSIONS``,
    so the match is case-insensitive.
    """
    extension = file_path.suffix.lower()
    return SUPPORTED_EXTENSIONS.get(extension)

def extract_python_functions(file_path: Path) -> List[str]:
    """Return the source text of every function defined in a Python file.

    The file is parsed with ``ast``; each ``def`` and ``async def`` found
    anywhere in the tree (module level, methods, nested functions) contributes
    one entry, in ``ast.walk`` order.

    Args:
        file_path: Path of the Python source file to read.

    Returns:
        The source segment of each function found. The list is empty when the
        file cannot be read or parsed; such failures are logged, not raised.
    """
    functions: List[str] = []
    try:
        # errors='ignore' drops undecodable bytes so that a stray bad byte
        # does not prevent extraction from the rest of the file.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            source = f.read()
        tree = ast.parse(source, filename=str(file_path))
    except (FileNotFoundError, SyntaxError, UnicodeDecodeError) as e:
        log.warning(f"Skipping file {file_path} due to parsing error: {e}")
        return functions
    except Exception as e:
        log.error(f"Unexpected error parsing Python file {file_path}: {e}", exc_info=True)
        return functions

    for node in ast.walk(tree):
        # Coroutines are a separate node type; without AsyncFunctionDef every
        # ``async def`` in the file would be skipped.
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        try:
            segment = ast.get_source_segment(source, node)
        except Exception as e:
            # Best effort: one unextractable function must not lose the rest.
            log.debug(f"Could not extract source segment for {node.name} in {file_path}: {e}")
            continue
        if segment:
            functions.append(segment)
    return functions

def extract_functions_by_regex(file_path: Path, pattern: str) -> List[str]:
    """Return every full-text match of ``pattern`` in the given file.

    The pattern is applied with ``re.DOTALL | re.MULTILINE``.

    Args:
        file_path: Path of the source file to read.
        pattern: Regular expression describing one whole function definition.

    Returns:
        The complete matched text of each occurrence, in file order. The list
        is empty when the file cannot be read or the search fails; such
        failures are logged, not raised.
    """
    try:
        # errors='ignore' drops undecodable bytes instead of failing the read.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            code = f.read()
        # finditer + group(0) always yields the whole match. re.findall would
        # return only the captured groups (or tuples of them) as soon as the
        # pattern contained a capturing group.
        return [
            match.group(0)
            for match in re.finditer(pattern, code, re.DOTALL | re.MULTILINE)
        ]
    except (FileNotFoundError, UnicodeDecodeError) as e:
        log.warning(f"Skipping file {file_path} due to read error: {e}")
        return []
    except Exception as e:
        log.error(f"Failed regex extraction on {file_path}: {e}", exc_info=True)
        return []

def extract_code_snippets(file_path: Path) -> Tuple[Optional[str], List[str]]:
    """Detect a file's language from its extension and extract its functions.

    Args:
        file_path: Path of the source file to process.

    Returns:
        A ``(language, snippets)`` pair. ``language`` is ``None`` and the list
        is empty when the extension is not supported. Python files go through
        the AST-based extractor; any other language uses its regex from
        ``patterns``, and yields an empty list when no regex is registered.
    """
    language = get_language_by_extension(file_path)

    # Guard clauses: unsupported extension first, then the Python special case.
    if language is None:
        return None, []
    if language == "python":
        return language, extract_python_functions(file_path)

    regex = patterns.get(language)
    if not regex:
        log.debug(f"No regex pattern defined for language: {language} in file {file_path}")
        return language, []
    return language, extract_functions_by_regex(file_path, regex)