bupa1018 committed on
Commit
c66cf01
·
1 Parent(s): 2d813f3

Update chunking.py

Browse files
Files changed (1) hide show
  1. chunking.py +13 -13
chunking.py CHANGED
@@ -1,9 +1,9 @@
1
  import ast
2
  from langchain.schema import Document
3
 
4
- def chunk_pythoncode_and_add_metadata(code_files_content, code_files_source):
5
  chunks = []
6
- for code_file_content, code_file_source in zip(code_files_content, code_files_source):
7
  """
8
  Custom made python code splitter, algorithm iterates through child nodes of ast-tree(max child depth = 2)
9
  aims to have full body of methods along signature (+ can handle decorators) in a chunk and adds method specific metadata
@@ -17,7 +17,7 @@ def chunk_pythoncode_and_add_metadata(code_files_content, code_files_source):
17
 
18
 
19
  """
20
- document_chunks = generate_code_chunks_with_metadata(code_files_content, code_files_source)
21
  chunks.extend(document_chunks)
22
  return chunks
23
 
@@ -41,7 +41,7 @@ def chunk_text_and_add_metadata(texts, references, chunk_size, chunk_overlap):
41
  return chunks
42
 
43
 
44
- def generate_code_chunks_with_metadata(code_file_content, code_file_source):
45
  """
46
  Custom Python Code Splitter
47
  chunks python file by length of func/method body
@@ -55,7 +55,7 @@ def generate_code_chunks_with_metadata(code_file_content, code_file_source):
55
  documents = []
56
  #print(f"Processing file: {file_path}")
57
 
58
- _iterate_ast(code_file_content, documents, file_path)
59
  # Determine usage based on the file_path
60
  if file_path.startswith("kadi_apy/lib/"):
61
  usage = "kadi-apy python library"
@@ -72,25 +72,25 @@ def generate_code_chunks_with_metadata(code_file_content, code_file_source):
72
  return documents
73
 
74
 
75
- def _iterate_ast(code_file_content, documents, file_path):
76
  """
77
  Parses the AST of the given Python file and delegates
78
  handling to specific methods based on node types.
79
  """
80
- tree = ast.parse(code_file_content, filename=file_path)
81
 
82
  first_level_nodes = list(ast.iter_child_nodes(tree))
83
 
84
  # Check if there are no first-level nodes
85
  if not first_level_nodes:
86
  documents.extend(
87
- _chunk_nodeless_code_file_content(code_file_content, file_path))
88
  return
89
 
90
  all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
91
  if all_imports:
92
  documents.extend(
93
- _chunk_import_only_code_file_content(code_file_content, file_path))
94
 
95
  # Iterate over first-level nodes
96
  for first_level_node in ast.iter_child_nodes(tree):
@@ -229,12 +229,12 @@ def _chunk_first_level_assign_node(ast_node, code_file_content):
229
 
230
 
231
 
232
- def _chunk_import_only_code_file_content(code_file_content, file_path):
233
  """
234
  Handles cases where the first-level nodes are only imports.
235
  """
236
  documents = []
237
- if file_path.endswith("__init__.py"):
238
  type = "__init__-file"
239
  else:
240
  type = "undefined"
@@ -250,12 +250,12 @@ def _chunk_import_only_code_file_content(code_file_content, file_path):
250
  documents.append(doc)
251
  return documents
252
 
253
- def _chunk_nodeless_code_file_content(code_file_content, file_path):
254
  """
255
  Handles cases where no top-level nodes are found in the AST.
256
  """
257
  documents = []
258
- if file_path.endswith("__init__.py"):
259
  type = "__init__-file"
260
  else:
261
  type = "undefined"
 
1
  import ast
2
  from langchain.schema import Document
3
 
4
+ def chunk_pythoncode_and_add_metadata(code_files_content, code_files_path):
5
  chunks = []
6
+ for code_file_content, code_file_path in zip(code_files_content, code_files_path):
7
  """
8
  Custom made python code splitter, algorithm iterates through child nodes of ast-tree(max child depth = 2)
9
  aims to have full body of methods along signature (+ can handle decorators) in a chunk and adds method specific metadata
 
17
 
18
 
19
  """
20
+ document_chunks = generate_code_chunks_with_metadata(code_files_content, code_files_path)
21
  chunks.extend(document_chunks)
22
  return chunks
23
 
 
41
  return chunks
42
 
43
 
44
+ def generate_code_chunks_with_metadata(code_file_content, code_files_path):
45
  """
46
  Custom Python Code Splitter
47
  chunks python file by length of func/method body
 
55
  documents = []
56
  #print(f"Processing file: {file_path}")
57
 
58
+ _iterate_ast(code_file_content, documents, s)
59
  # Determine usage based on the file_path
60
  if file_path.startswith("kadi_apy/lib/"):
61
  usage = "kadi-apy python library"
 
72
  return documents
73
 
74
 
75
+ def _iterate_ast(code_file_content, documents, code_file_path):
76
  """
77
  Parses the AST of the given Python file and delegates
78
  handling to specific methods based on node types.
79
  """
80
+ tree = ast.parse(code_file_content, filename=code_file_path)
81
 
82
  first_level_nodes = list(ast.iter_child_nodes(tree))
83
 
84
  # Check if there are no first-level nodes
85
  if not first_level_nodes:
86
  documents.extend(
87
+ _chunk_nodeless_code_file_content(code_file_content, code_file_path))
88
  return
89
 
90
  all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
91
  if all_imports:
92
  documents.extend(
93
+ _chunk_import_only_code_file_content(code_file_content, code_file_path))
94
 
95
  # Iterate over first-level nodes
96
  for first_level_node in ast.iter_child_nodes(tree):
 
229
 
230
 
231
 
232
+ def _chunk_import_only_code_file_content(code_file_content, code_file_path):
233
  """
234
  Handles cases where the first-level nodes are only imports.
235
  """
236
  documents = []
237
+ if code_file_path.endswith("__init__.py"):
238
  type = "__init__-file"
239
  else:
240
  type = "undefined"
 
250
  documents.append(doc)
251
  return documents
252
 
253
+ def _chunk_nodeless_code_file_content(code_file_content, code_file_path):
254
  """
255
  Handles cases where no top-level nodes are found in the AST.
256
  """
257
  documents = []
258
+ if code_file_path.endswith("__init__.py"):
259
  type = "__init__-file"
260
  else:
261
  type = "undefined"