Spaces:
Sleeping
Sleeping
Update chunking.py
Browse files- chunking.py +13 -13
chunking.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
import ast
|
2 |
from langchain.schema import Document
|
3 |
|
4 |
-
def chunk_pythoncode_and_add_metadata(code_files_content,
|
5 |
chunks = []
|
6 |
-
for code_file_content,
|
7 |
"""
|
8 |
Custom made python code splitter, algorithm iterates through child nodes of ast-tree(max child depth = 2)
|
9 |
aims to have full body of methods along signature (+ can handle decorators) in a chunk and adds method specific metadata
|
@@ -17,7 +17,7 @@ def chunk_pythoncode_and_add_metadata(code_files_content, code_files_source):
|
|
17 |
|
18 |
|
19 |
"""
|
20 |
-
document_chunks = generate_code_chunks_with_metadata(code_files_content,
|
21 |
chunks.extend(document_chunks)
|
22 |
return chunks
|
23 |
|
@@ -41,7 +41,7 @@ def chunk_text_and_add_metadata(texts, references, chunk_size, chunk_overlap):
|
|
41 |
return chunks
|
42 |
|
43 |
|
44 |
-
def generate_code_chunks_with_metadata(code_file_content,
|
45 |
"""
|
46 |
Custom Python Code Splitter
|
47 |
chunks python file by length of func/method body
|
@@ -55,7 +55,7 @@ def generate_code_chunks_with_metadata(code_file_content, code_file_source):
|
|
55 |
documents = []
|
56 |
#print(f"Processing file: {file_path}")
|
57 |
|
58 |
-
_iterate_ast(code_file_content, documents,
|
59 |
# Determine usage based on the file_path
|
60 |
if file_path.startswith("kadi_apy/lib/"):
|
61 |
usage = "kadi-apy python library"
|
@@ -72,25 +72,25 @@ def generate_code_chunks_with_metadata(code_file_content, code_file_source):
|
|
72 |
return documents
|
73 |
|
74 |
|
75 |
-
def _iterate_ast(code_file_content, documents,
|
76 |
"""
|
77 |
Parses the AST of the given Python file and delegates
|
78 |
handling to specific methods based on node types.
|
79 |
"""
|
80 |
-
tree = ast.parse(code_file_content, filename=
|
81 |
|
82 |
first_level_nodes = list(ast.iter_child_nodes(tree))
|
83 |
|
84 |
# Check if there are no first-level nodes
|
85 |
if not first_level_nodes:
|
86 |
documents.extend(
|
87 |
-
_chunk_nodeless_code_file_content(code_file_content,
|
88 |
return
|
89 |
|
90 |
all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
|
91 |
if all_imports:
|
92 |
documents.extend(
|
93 |
-
_chunk_import_only_code_file_content(code_file_content,
|
94 |
|
95 |
# Iterate over first-level nodes
|
96 |
for first_level_node in ast.iter_child_nodes(tree):
|
@@ -229,12 +229,12 @@ def _chunk_first_level_assign_node(ast_node, code_file_content):
|
|
229 |
|
230 |
|
231 |
|
232 |
-
def _chunk_import_only_code_file_content(code_file_content,
|
233 |
"""
|
234 |
Handles cases where the first-level nodes are only imports.
|
235 |
"""
|
236 |
documents = []
|
237 |
-
if
|
238 |
type = "__init__-file"
|
239 |
else:
|
240 |
type = "undefined"
|
@@ -250,12 +250,12 @@ def _chunk_import_only_code_file_content(code_file_content, file_path):
|
|
250 |
documents.append(doc)
|
251 |
return documents
|
252 |
|
253 |
-
def _chunk_nodeless_code_file_content(code_file_content,
|
254 |
"""
|
255 |
Handles cases where no top-level nodes are found in the AST.
|
256 |
"""
|
257 |
documents = []
|
258 |
-
if
|
259 |
type = "__init__-file"
|
260 |
else:
|
261 |
type = "undefined"
|
|
|
1 |
import ast
|
2 |
from langchain.schema import Document
|
3 |
|
4 |
+
def chunk_pythoncode_and_add_metadata(code_files_content, code_files_path):
|
5 |
chunks = []
|
6 |
+
for code_file_content, code_file_path in zip(code_files_content, code_files_path):
|
7 |
"""
|
8 |
Custom made python code splitter, algorithm iterates through child nodes of ast-tree(max child depth = 2)
|
9 |
aims to have full body of methods along signature (+ can handle decorators) in a chunk and adds method specific metadata
|
|
|
17 |
|
18 |
|
19 |
"""
|
20 |
+
document_chunks = generate_code_chunks_with_metadata(code_files_content, code_files_path)
|
21 |
chunks.extend(document_chunks)
|
22 |
return chunks
|
23 |
|
|
|
41 |
return chunks
|
42 |
|
43 |
|
44 |
+
def generate_code_chunks_with_metadata(code_file_content, code_files_path):
|
45 |
"""
|
46 |
Custom Python Code Splitter
|
47 |
chunks python file by length of func/method body
|
|
|
55 |
documents = []
|
56 |
#print(f"Processing file: {file_path}")
|
57 |
|
58 |
+
_iterate_ast(code_file_content, documents, s)
|
59 |
# Determine usage based on the file_path
|
60 |
if file_path.startswith("kadi_apy/lib/"):
|
61 |
usage = "kadi-apy python library"
|
|
|
72 |
return documents
|
73 |
|
74 |
|
75 |
+
def _iterate_ast(code_file_content, documents, code_file_path):
|
76 |
"""
|
77 |
Parses the AST of the given Python file and delegates
|
78 |
handling to specific methods based on node types.
|
79 |
"""
|
80 |
+
tree = ast.parse(code_file_content, filename=code_file_path)
|
81 |
|
82 |
first_level_nodes = list(ast.iter_child_nodes(tree))
|
83 |
|
84 |
# Check if there are no first-level nodes
|
85 |
if not first_level_nodes:
|
86 |
documents.extend(
|
87 |
+
_chunk_nodeless_code_file_content(code_file_content, code_file_path))
|
88 |
return
|
89 |
|
90 |
all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
|
91 |
if all_imports:
|
92 |
documents.extend(
|
93 |
+
_chunk_import_only_code_file_content(code_file_content, code_file_path))
|
94 |
|
95 |
# Iterate over first-level nodes
|
96 |
for first_level_node in ast.iter_child_nodes(tree):
|
|
|
229 |
|
230 |
|
231 |
|
232 |
+
def _chunk_import_only_code_file_content(code_file_content, code_file_path):
|
233 |
"""
|
234 |
Handles cases where the first-level nodes are only imports.
|
235 |
"""
|
236 |
documents = []
|
237 |
+
if code_file_path.endswith("__init__.py"):
|
238 |
type = "__init__-file"
|
239 |
else:
|
240 |
type = "undefined"
|
|
|
250 |
documents.append(doc)
|
251 |
return documents
|
252 |
|
253 |
+
def _chunk_nodeless_code_file_content(code_file_content, code_file_path):
|
254 |
"""
|
255 |
Handles cases where no top-level nodes are found in the AST.
|
256 |
"""
|
257 |
documents = []
|
258 |
+
if code_file_path.endswith("__init__.py"):
|
259 |
type = "__init__-file"
|
260 |
else:
|
261 |
type = "undefined"
|