Spaces:
Sleeping
Sleeping
File size: 9,465 Bytes
6f6019f 0c52573 6f6019f c66cf01 f26d824 c66cf01 f26d824 0429b48 f26d824 9039b90 f26d824 d37f774 f26d824 bd50c11 0429b48 6f6019f c5b5a5b 6f6019f dc07873 2fd774b d0a3932 0429b48 2fd774b a446f18 d37f774 bd50c11 dc07873 0429b48 d37f774 0c52573 91a25ef dc07873 6f6019f 0c52573 c66cf01 6f6019f c66cf01 6f6019f 1aeb49e 2fd774b c66cf01 1aeb49e 0de61ba c66cf01 6f6019f 2fd774b 6f6019f 2fd774b 9039b90 6f6019f 2fd774b 9039b90 1aeb49e 2fd774b 9039b90 c5b5a5b 9039b90 2fd774b dc07873 9039b90 6f6019f 0a9a29e 6f6019f 2fd774b 9039b90 0a9a29e 2fd774b 66a05ca 0a9a29e dc07873 6f6019f 0a9a29e 6f6019f dc07873 2fd774b 6f6019f dc07873 9039b90 0a9a29e dc07873 6f6019f dc07873 0a9a29e 2fd774b dc07873 6f6019f 2fd774b 0a9a29e 64756f0 9039b90 64756f0 9039b90 64756f0 9039b90 6f6019f 0a9a29e 6f6019f 2fd774b 6f6019f 2fd774b 6f6019f 2fd774b 9039b90 0a9a29e 2fd774b 0a9a29e dc07873 0a9a29e 2fd774b 0a9a29e dc07873 0a9a29e 66a05ca 0a9a29e dc07873 2fd774b dc07873 2fd774b 0a9a29e 2fd774b 0114c32 64756f0 9039b90 64756f0 9039b90 64756f0 c66cf01 64756f0 c66cf01 64756f0 9039b90 64756f0 c66cf01 64756f0 c66cf01 64756f0 9039b90 64756f0 0114c32 9039b90 0114c32 9039b90 0114c32 2fd774b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 |
import ast

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
def chunk_pythoncode_and_add_metadata(code_files_content, code_files_path):
    """
    Chunk Python source files and attach metadata to each chunk.

    Custom Python code splitter: iterates through the child nodes of the
    ast-tree (max child depth = 2), aims to keep the full body of a
    method together with its signature (decorators included) in one
    chunk, and adds method-specific metadata, e.g.
        visibility: "public", "internal"
        type: "class", "method", "command" (CLI commands)
        source: the originating file path
    with the intent that a metadata filter can be applied when
    retrieving potentially useful snippets.

    :param code_files_content: iterable of file contents (str)
    :param code_files_path: iterable of file paths (str), parallel to contents
    :return: list of Document chunks for all files
    """
    # NOTE: the docstring used to sit inside the loop body as a bare
    # string statement; it is a function docstring now.
    chunks = []
    for code_file_content, code_file_path in zip(code_files_content, code_files_path):
        document_chunks = generate_code_chunks_with_metadata(code_file_content, code_file_path)
        chunks.extend(document_chunks)
    return chunks
# Split text into chunks
def chunk_text_and_add_metadata(texts, references, chunk_size, chunk_overlap):
    """
    Split each text into overlapping chunks and tag every chunk with its
    source reference and the "doc/" directory marker.

    :param texts: iterable of raw text documents (str)
    :param references: iterable of source identifiers, parallel to texts
    :param chunk_size: maximum chunk length in characters
    :param chunk_overlap: overlap between consecutive chunks in characters
    :return: list of Document chunks
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    for text, reference in zip(texts, references):
        for piece in splitter.split_text(text):
            documents.append(
                Document(
                    page_content=piece,
                    metadata={
                        "source": reference,
                        "directory": "doc/"
                    }
                )
            )
    return documents
def generate_code_chunks_with_metadata(code_file_content, code_file_path):
    """
    Split one Python file into Document chunks and add path-derived metadata.

    Custom Python code splitter: each chunk aims to hold one full
    function/method (decorators included), or the body of a class cut off
    where its first method declaration starts.  Entry point that delegates
    the actual splitting to _iterate_ast.

    :param code_file_content: source code of the file (str)
    :param code_file_path: path of the file, used to derive metadata
    :return: list of Documents with "source", "directory" and "usage" metadata
    """
    documents = []
    _iterate_ast(code_file_content, documents, code_file_path)
    # Derive "directory"/"usage" metadata from the file path prefix.
    if code_file_path.startswith("kadi_apy"):
        directory = "kadi_apy/"
        if code_file_path.startswith("kadi_apy/lib/"):
            usage = "kadi_apy/lib/"
        elif code_file_path.startswith("kadi_apy/cli/"):
            usage = "kadi_apy/cli/"
        else:
            usage = "kadi_apy/top_level_file.py"
    else:
        directory = "undefined"
        usage = "undefined"
    # Stamp every chunk with the file-level metadata.
    for doc in documents:
        doc.metadata["source"] = code_file_path
        doc.metadata["directory"] = directory
        doc.metadata["usage"] = usage
    return documents
def _iterate_ast(code_file_content, documents, code_file_path):
    """
    Parse the AST of the given Python source and delegate handling of
    first-level nodes to the specialized chunkers, appending to documents.

    :param code_file_content: source code of the file (str)
    :param documents: list the produced Documents are appended to (mutated)
    :param code_file_path: path of the file, passed to ast.parse for errors
    """
    tree = ast.parse(code_file_content, filename=code_file_path)
    first_level_nodes = list(ast.iter_child_nodes(tree))

    # Empty module (e.g. a blank __init__.py): keep the raw content whole.
    if not first_level_nodes:
        documents.extend(
            _chunk_nodeless_code_file_content(code_file_content, code_file_path))
        return

    # Import-only module: keep the whole file as a single chunk.  Return
    # early -- the dispatch loop below would find nothing to do anyway.
    all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
    if all_imports:
        documents.extend(
            _chunk_import_only_code_file_content(code_file_content, code_file_path))
        return

    # Dispatch each first-level node to its handler; node kinds without a
    # handler (e.g. bare expressions) are intentionally skipped.
    for first_level_node in first_level_nodes:
        if isinstance(first_level_node, ast.ClassDef):
            documents.extend(
                _handle_first_level_class(first_level_node, code_file_content))
        elif isinstance(first_level_node, ast.FunctionDef):
            documents.extend(
                _chunk_first_level_func_node(first_level_node, code_file_content))
        elif isinstance(first_level_node, ast.Assign):
            documents.extend(
                _chunk_first_level_assign_node(first_level_node, code_file_content))
def _handle_first_level_class(ast_node, code_file_content):
    """
    Handle a class at the first level of the AST.

    Emits one Document for the class "header" (everything from the class
    line up to, but excluding, the first method declaration — decorators
    included) and one Document per method.

    :param ast_node: the ast.ClassDef node
    :param code_file_content: full source of the file (str)
    :return: list of Documents
    """
    documents = []
    source_lines = code_file_content.splitlines()

    def _start_line(func_node):
        # A decorated def starts at its first decorator line.
        return (func_node.decorator_list[0].lineno
                if func_node.decorator_list else func_node.lineno)

    methods = [child for child in ast.iter_child_nodes(ast_node)
               if isinstance(child, ast.FunctionDef)]

    # End of the class-header chunk: the line before the first method,
    # counting a method's decorators as part of it (otherwise decorator
    # lines get duplicated in both the class chunk and the method chunk).
    # For a method-less class take the whole body; end_lineno + 1 keeps
    # the class's final line inside the slice (was an off-by-one before).
    class_end_line = min(
        (_start_line(method) for method in methods),
        default=ast_node.end_lineno + 1,
    ) - 1
    class_source = '\n'.join(source_lines[ast_node.lineno - 1:class_end_line])
    documents.append(Document(
        page_content=class_source,
        metadata={
            "type": "class",
            "class": ast_node.name,
            "visibility": "public"
        }
    ))

    # One chunk per method, decorators included.
    for method in methods:
        method_source = '\n'.join(
            source_lines[_start_line(method) - 1:method.end_lineno])
        visibility = "internal" if method.name.startswith("_") else "public"
        documents.append(Document(
            page_content=method_source,
            metadata={
                "type": "method",
                "method": method.name,
                "visibility": visibility,
                "class": ast_node.name
            }
        ))
    return documents
def _handle_not_defined_case(code_file_content):
    """Fallback: chunk raw file content purely by character length."""
    return list(_chunk_code_file_content_by_character(code_file_content))
def _chunk_first_level_func_node(ast_node, code_file_content):
    """
    Handle a function at the first level of the AST.

    Emits one Document containing the full function source, decorators
    included.  Functions decorated with @apy_command are tagged as CLI
    commands ("type": "command") instead of plain functions.

    :param ast_node: the ast.FunctionDef node
    :param code_file_content: full source of the file (str)
    :return: list containing a single Document
    """
    function_start_line = (
        ast_node.decorator_list[0].lineno
        if ast_node.decorator_list else ast_node.lineno
    )
    function_end_line = ast_node.end_lineno
    function_source = '\n'.join(
        code_file_content.splitlines()[function_start_line - 1:function_end_line])
    visibility = "internal" if ast_node.name.startswith("_") else "public"

    def _decorator_name(decorator):
        # Supports both the bare form (@apy_command -> ast.Name) and the
        # call form (@apy_command(...) -> ast.Call); the original check
        # only matched the bare form.
        target = decorator.func if isinstance(decorator, ast.Call) else decorator
        return getattr(target, "id", None)

    is_command = any(
        _decorator_name(decorator) == "apy_command"
        for decorator in ast_node.decorator_list
    )

    metadata = {
        "type": "command" if is_command else "function",
        "visibility": visibility
    }
    # NOTE: plain functions are stored under the "method" key, matching
    # the metadata filter used at retrieval time.
    if is_command:
        metadata["command"] = ast_node.name
    else:
        metadata["method"] = ast_node.name
    doc = Document(
        page_content=function_source,
        metadata=metadata
    )
    return [doc]
def _chunk_first_level_assign_node(ast_node, code_file_content):
    """
    Handle an assignment statement at the first level of the AST.

    The assignment's source lines become a single Document tagged with
    "type": "Assign".

    :param ast_node: the ast.Assign node
    :param code_file_content: full source of the file (str)
    :return: list containing a single Document
    """
    source_lines = code_file_content.splitlines()
    assign_source = '\n'.join(
        source_lines[ast_node.lineno - 1:ast_node.end_lineno])
    return [Document(page_content=assign_source, metadata={"type": "Assign"})]
def _chunk_import_only_code_file_content(code_file_content, code_file_path):
    """
    Handle a file whose first-level nodes are only imports.

    The whole file becomes a single Document; __init__.py files get a
    distinct "type" value so they can be filtered during retrieval.

    :param code_file_content: full source of the file (str)
    :param code_file_path: path of the file (str)
    :return: list containing a single Document
    """
    # Renamed from `type` to avoid shadowing the builtin.
    chunk_type = (
        "__init__-file" if code_file_path.endswith("__init__.py")
        else "undefined"
    )
    doc = Document(
        page_content=code_file_content,
        metadata={"type": chunk_type}
    )
    return [doc]
def _chunk_nodeless_code_file_content(code_file_content, code_file_path):
    """
    Handle a file where no top-level nodes are found in the AST.

    The whole (possibly empty) content becomes a single Document;
    __init__.py files get a distinct "type" value so they can be filtered
    during retrieval.

    :param code_file_content: full source of the file (str)
    :param code_file_path: path of the file (str)
    :return: list containing a single Document
    """
    # Renamed from `type` to avoid shadowing the builtin.
    chunk_type = (
        "__init__-file" if code_file_path.endswith("__init__.py")
        else "undefined"
    )
    doc = Document(
        page_content=code_file_content,
        metadata={"type": chunk_type}
    )
    return [doc]
def _chunk_code_file_content_by_character(code_file_content):
    """
    Fallback splitter: chunk raw file content by character length.

    :param code_file_content: full source of the file (str)
    :return: list of Documents without extra metadata
    """
    # The import formerly sitting here mid-file was moved to the top of
    # the module.  `separators=[]` was dropped: an explicit empty list
    # makes RecursiveCharacterTextSplitter raise IndexError when it tries
    # to pick a separator; the default separator hierarchy is used instead.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128,
    )
    return [
        Document(page_content=chunk)
        for chunk in text_splitter.split_text(code_file_content)
    ]