# code-chunker / CodeParser.py
# Last change (commit 19c34f6, by Jordi577): resolved an issue in CodeParser.py where
# Git returned exit status 129 because repository paths contained whitespace.
# File size: 13.5 kB
import os
import subprocess
from typing import List, Dict, Union, Tuple
from tree_sitter import Language, Parser, Node
from typing import Union, List
import logging
def return_simple_line_numbers_with_code(code: str) -> str:
    """Prefix every line of *code* with its 1-based line number.

    The text is split on ``'\\n'`` only, so an empty string yields a single
    numbered empty line and a trailing newline yields a final numbered empty
    line.

    Args:
        code: Source text to annotate.

    Returns:
        The same lines joined with ``'\\n'``, each formatted as
        ``"Line <n>: <original line>"``.
    """
    return "\n".join(
        f"Line {number}: {line}"
        for number, line in enumerate(code.split('\n'), start=1)
    )
class CodeParser:
    """Parse source code with Tree-sitter and locate structural nodes.

    On construction the grammars for the requested file extensions are cloned
    (or updated) with Git into ``CACHE_DIR``, compiled, and loaded. The
    remaining methods parse code and report where "points of interest"
    (imports, classes, functions, ...) and comments start.

    Line-number conventions differ between methods:
    ``get_lines_for_points_of_interest`` and ``get_lines_for_comments`` return
    0-based rows as reported by Tree-sitter, while ``map_line_to_node_type``
    and ``print_all_line_types`` use 1-based line numbers.

    NOTE(review): this relies on ``Language.build_library`` and
    ``Parser.set_language``; confirm the pinned py-tree-sitter version still
    provides them.
    """

    # Directory holding the cloned grammar repositories and compiled libraries.
    CACHE_DIR = os.path.expanduser("~/.code_parser_cache")

    # Extensions that share the node-type tables of another extension.
    _EXTENSION_ALIASES = {"jsx": "js", "tsx": "ts"}

    # Node type -> label for structural points of interest, per extension.
    _NODE_TYPES_OF_INTEREST = {
        'py': {
            'import_statement': 'Import',
            'export_statement': 'Export',
            'class_definition': 'Class',
            'function_definition': 'Function',
        },
        'css': {
            'tag_name': 'Tag',
            '@media': 'Media Query',
        },
        'js': {
            'import_statement': 'Import',
            'export_statement': 'Export',
            'class_declaration': 'Class',
            'function_declaration': 'Function',
            'arrow_function': 'Arrow Function',
            'statement_block': 'Block',
        },
        'ts': {
            'import_statement': 'Import',
            'export_statement': 'Export',
            'class_declaration': 'Class',
            'function_declaration': 'Function',
            'arrow_function': 'Arrow Function',
            'statement_block': 'Block',
            'interface_declaration': 'Interface',
            'type_alias_declaration': 'Type Alias',
        },
        'php': {
            'namespace_definition': 'Namespace',
            'class_declaration': 'Class',
            'method_declaration': 'Method',
            'function_definition': 'Function',
            'interface_declaration': 'Interface',
            'trait_declaration': 'Trait',
        },
        'rb': {
            'class': 'Class',
            'method': 'Method',
            'module': 'Module',
            'singleton_class': 'Singleton Class',
            'begin': 'Begin Block',
        },
    }

    # Node type -> label for comment-like nodes, per extension.
    _COMMENT_NODE_TYPES = {
        'py': {
            'comment': 'Comment',
            'decorator': 'Decorator',
        },
        'css': {
            'comment': 'Comment',
        },
        'js': {
            'comment': 'Comment',
            'decorator': 'Decorator',
        },
        'ts': {
            'comment': 'Comment',
            'decorator': 'Decorator',
        },
        'php': {
            'comment': 'Comment',
            'attribute': 'Attribute',
        },
        'rb': {
            'comment': 'Comment',
        },
    }

    def __init__(self, file_extensions: Union[None, List[str], str] = None):
        """Record the requested languages and install their parsers.

        Args:
            file_extensions: A single extension, a list of extensions, or
                ``None``. Extensions missing from ``language_extension_map``
                are ignored; ``None`` installs no parser at all.
        """
        if isinstance(file_extensions, str):
            file_extensions = [file_extensions]
        self.language_extension_map = {
            "py": "python",
            "js": "javascript",
            "jsx": "javascript",
            "css": "css",
            "ts": "typescript",
            "tsx": "typescript",
            "php": "php",
            "rb": "ruby"
        }
        if file_extensions is None:
            self.language_names = []
        else:
            self.language_names = [
                self.language_extension_map.get(ext)
                for ext in file_extensions
                if ext in self.language_extension_map
            ]
        self.languages = {}
        self._install_parsers()

    def _install_parsers(self):
        """Clone/update, build and load the grammar of every requested language.

        Failures are logged and never raised: a language whose repository
        cannot be fetched or whose grammar cannot be built is simply absent
        from ``self.languages``.
        """
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        try:
            # Ensure cache directory exists
            os.makedirs(self.CACHE_DIR, exist_ok=True)
            for language in self.language_names:
                # Several extensions map to one language (e.g. js/jsx);
                # build each grammar only once.
                if language in self.languages:
                    continue
                repo_path = os.path.join(self.CACHE_DIR, f"tree-sitter-{language}")
                # Check if the repository exists and contains necessary files
                if not os.path.exists(repo_path) or not self._is_repo_valid(repo_path, language):
                    try:
                        # Argument lists (no shell) keep paths containing
                        # whitespace or shell metacharacters intact.
                        if os.path.exists(repo_path):
                            logging.info(f"Updating existing repository for {language}")
                            subprocess.run(["git", "pull"], cwd=repo_path, check=True)
                        else:
                            logging.info(f"Cloning repository for {language}")
                            repo_url = f"https://github.com/tree-sitter/tree-sitter-{language}"
                            subprocess.run(["git", "clone", repo_url, repo_path], check=True)
                    except (subprocess.CalledProcessError, OSError) as e:
                        # OSError covers a missing git executable, which the
                        # shell used to report as a non-zero exit status.
                        logging.error(f"Failed to clone/update repository for {language}. Error: {e}")
                        continue
                try:
                    build_path = os.path.join(self.CACHE_DIR, f"build/{language}.so")
                    source_dirs = self._grammar_source_dirs(repo_path, language)
                    if language == 'typescript' and not all(os.path.exists(d) for d in source_dirs):
                        raise FileNotFoundError(f"TypeScript or TSX directory not found in {repo_path}")
                    # Exactly one build per language: the repository root is
                    # only used when the grammar is not in a sub-directory.
                    Language.build_library(build_path, source_dirs)
                    self.languages[language] = Language(build_path, language)
                    logging.info(f"Successfully built and loaded {language} parser")
                except Exception as e:
                    logging.error(f"Failed to build or load language {language}. Error: {str(e)}")
        except Exception as e:
            logging.error(f"An unexpected error occurred during parser installation: {str(e)}")

    @staticmethod
    def _grammar_source_dirs(repo_path: str, language: str) -> List[str]:
        """Return the directories inside *repo_path* that hold grammar sources."""
        if language == 'typescript':
            return [os.path.join(repo_path, 'typescript'), os.path.join(repo_path, 'tsx')]
        if language == 'php':
            return [os.path.join(repo_path, 'php')]
        return [repo_path]

    def _is_repo_valid(self, repo_path: str, language: str) -> bool:
        """Check if the repository contains necessary files."""
        return all(
            os.path.exists(os.path.join(source_dir, 'src', 'parser.c'))
            for source_dir in self._grammar_source_dirs(repo_path, language)
        )

    def _load_language(self, file_extension: str):
        """Return the loaded grammar for *file_extension*.

        Raises:
            ValueError: If the extension is unknown or its parser is not loaded.
        """
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            raise ValueError("Unsupported file type")
        language = self.languages.get(language_name)
        if language is None:
            raise ValueError("Language parser not found")
        return language

    def _parse_root(self, code: str, file_extension: str) -> "Node":
        """Parse *code* and return the root node; raises ``ValueError`` like ``_load_language``."""
        language = self._load_language(file_extension)
        parser = Parser()
        parser.set_language(language)
        return parser.parse(bytes(code, "utf8")).root_node

    @staticmethod
    def _iter_nodes(root):
        """Yield *root* and all its descendants in pre-order without recursion."""
        pending = [root]
        while pending:
            current = pending.pop()
            yield current
            # Reversed so that the first child is popped (visited) first.
            pending.extend(reversed(current.children))

    @staticmethod
    def _start_lines_by_label(labelled_nodes) -> List[int]:
        """Flatten ``(node, label)`` pairs into start rows grouped by label.

        Labels keep their first-seen order, rows keep their first-seen order
        within a label, and a row is listed at most once per label (it may
        still appear under several labels).
        """
        rows_by_label = {}
        for node, label in labelled_nodes:
            # Inner dict acts as an insertion-ordered set of rows.
            rows_by_label.setdefault(label, {})[node.start_point[0]] = None
        return [row for rows in rows_by_label.values() for row in rows]

    @classmethod
    def _node_types_for(cls, table: Dict[str, Dict[str, str]], file_extension: str) -> Dict[str, str]:
        """Look up *file_extension* (after alias resolution) in *table*."""
        key = cls._EXTENSION_ALIASES.get(file_extension, file_extension)
        if key not in table:
            raise ValueError("Unsupported file type")
        # Copy so callers cannot mutate the shared class-level table.
        return dict(table[key])

    def parse_code(self, code: str, file_extension: str) -> Union[None, "Node"]:
        """Parse *code* and return the root node of its syntax tree.

        Problems are reported with ``print`` and signalled by returning
        ``None`` rather than by raising.
        """
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
            return None
        language = self.languages.get(language_name)
        if language is None:
            print("Language parser not found")
            return None
        parser = Parser()
        parser.set_language(language)
        tree = parser.parse(bytes(code, "utf8"))
        if tree is None:
            print("Failed to parse the code")
            return None
        return tree.root_node

    def extract_points_of_interest(self, node: "Node", file_extension: str) -> List[Tuple["Node", str]]:
        """Collect ``(node, label)`` pairs for structural nodes under *node*.

        Nodes are returned in pre-order (a parent before its children).

        Raises:
            ValueError: If *file_extension* has no node-type table.
        """
        node_types_of_interest = self._get_node_types_of_interest(file_extension)
        return [
            (current, node_types_of_interest[current.type])
            for current in self._iter_nodes(node)
            if current.type in node_types_of_interest
        ]

    def _get_node_types_of_interest(self, file_extension: str) -> Dict[str, str]:
        """Return the node-type -> label mapping of structural nodes.

        ``jsx`` shares the ``js`` table and ``tsx`` shares the ``ts`` table.

        Raises:
            ValueError: If *file_extension* is not supported.
        """
        return self._node_types_for(self._NODE_TYPES_OF_INTEREST, file_extension)

    def _get_nodes_for_comments(self, file_extension: str) -> Dict[str, str]:
        """Return the node-type -> label mapping of comment-like nodes.

        ``jsx`` shares the ``js`` table and ``tsx`` shares the ``ts`` table.

        Raises:
            ValueError: If *file_extension* is not supported.
        """
        return self._node_types_for(self._COMMENT_NODE_TYPES, file_extension)

    def extract_comments(self, node: "Node", file_extension: str) -> List[Tuple["Node", str]]:
        """Collect ``(node, label)`` pairs for comment-like nodes under *node*.

        Nodes are returned in pre-order (a parent before its children).

        Raises:
            ValueError: If *file_extension* has no comment table.
        """
        node_types_of_interest = self._get_nodes_for_comments(file_extension)
        return [
            (current, node_types_of_interest[current.type])
            for current in self._iter_nodes(node)
            if current.type in node_types_of_interest
        ]

    def get_lines_for_points_of_interest(self, code: str, file_extension: str) -> List[int]:
        """Return the 0-based start rows of structural nodes in *code*.

        Rows are grouped by label (see ``_start_lines_by_label``), so the
        result is not necessarily sorted.

        Raises:
            ValueError: If the extension is unsupported or its parser is not loaded.
        """
        root_node = self._parse_root(code, file_extension)
        points_of_interest = self.extract_points_of_interest(root_node, file_extension)
        return self._start_lines_by_label(points_of_interest)

    def get_lines_for_comments(self, code: str, file_extension: str) -> List[int]:
        """Return the 0-based start rows of comment-like nodes in *code*.

        Rows are grouped by label (see ``_start_lines_by_label``), so the
        result is not necessarily sorted.

        Raises:
            ValueError: If the extension is unsupported or its parser is not loaded.
        """
        root_node = self._parse_root(code, file_extension)
        comments = self.extract_comments(root_node, file_extension)
        return self._start_lines_by_label(comments)

    def print_all_line_types(self, code: str, file_extension: str):
        """Print, for each line where a node starts, the node types and the line text.

        Problems are reported with ``print``; nothing is raised for an
        unsupported extension or a missing parser.
        """
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
            return
        language = self.languages.get(language_name)
        if language is None:
            print("Language parser not found")
            return
        parser = Parser()
        parser.set_language(language)
        tree = parser.parse(bytes(code, "utf8"))
        root_node = tree.root_node
        line_to_node_type = self.map_line_to_node_type(root_node)
        code_lines = code.split('\n')
        for line_num, node_types in line_to_node_type.items():
            line_content = code_lines[line_num - 1]  # Adjusting index for zero-based indexing
            print(f"line {line_num}: {', '.join(node_types)} | Code: {line_content}")

    def map_line_to_node_type(self, node, line_to_node_type=None, depth=0):
        """Map each 1-based start line to the types of the nodes starting there.

        Args:
            node: Root of the (sub)tree to walk.
            line_to_node_type: Optional dict to extend in place; a new dict is
                created when omitted.
            depth: Unused; kept for backward compatibility.

        Returns:
            The mapping, with node types listed in pre-order per line.
        """
        if line_to_node_type is None:
            line_to_node_type = {}
        for current in self._iter_nodes(node):
            # Tree-sitter lines are 0-indexed; converting to 1-indexed
            start_line = current.start_point[0] + 1
            line_to_node_type.setdefault(start_line, []).append(current.type)
        return line_to_node_type