bupa1018 committed
Commit 9039b90 · 1 Parent(s): 0de2459

Update chunk_python_code.py

Files changed (1):
  1. chunk_python_code.py +29 -29
chunk_python_code.py CHANGED
@@ -1,9 +1,9 @@
 import ast
 from langchain.schema import Document
 
-def split_python_code_into_chunks(texts, file_paths):
+def chunk_pythoncode_and_add_metadata(code_files_content, code_files_source):
     chunks = []
-    for text, file_path in zip(texts, file_paths):
+    for code_file_content, code_file_source in zip(code_files_content, code_files_source):
         """
         Custom made python code splitter, algorithm iterates through child nodes of ast-tree(max child depth = 2)
         aims to have full body of methods along signature (+ can handle decorators) in a chunk and adds method specific metadata
@@ -17,13 +17,13 @@ def split_python_code_into_chunks(texts, file_paths):
 
 
         """
-        document_chunks = chunk_python_code_with_metadata(text, file_path)
+        document_chunks = generate_code_chunks_with_metadata(code_files_content, code_files_source)
         chunks.extend(document_chunks)
     return chunks
 
 
 # Split text into chunks
-def split_into_chunks(texts, references, chunk_size, chunk_overlap):
+def chunk_text_and_add_metadata(texts, references, chunk_size, chunk_overlap):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     chunks = []
 
@@ -41,7 +41,7 @@ def split_into_chunks(texts, references, chunk_size, chunk_overlap):
     return chunks
 
 
-def chunk_python_code_with_metadata(python_code, file_path):
+def generate_code_chunks_with_metadata(code_file_content, code_file_source):
     """
     Custom Python Code Splitter
     chunks python file by length of func/method body
@@ -55,7 +55,7 @@ def chunk_python_code_with_metadata(python_code, file_path):
     documents = []
     #print(f"Processing file: {file_path}")
 
-    _iterate_ast(python_code, documents, file_path)
+    _iterate_ast(code_file_content, documents, file_path)
     # Determine usage based on the file_path
     if file_path.startswith("kadi_apy/lib/"):
         usage = "kadi-apy python library"
@@ -72,45 +72,45 @@ def chunk_python_code_with_metadata(python_code, file_path):
     return documents
 
 
-def _iterate_ast(python_code, documents, file_path):
+def _iterate_ast(code_file_content, documents, file_path):
     """
     Parses the AST of the given Python file and delegates
     handling to specific methods based on node types.
     """
-    tree = ast.parse(python_code, filename=file_path)
+    tree = ast.parse(code_file_content, filename=file_path)
 
     first_level_nodes = list(ast.iter_child_nodes(tree))
 
     # Check if there are no first-level nodes
     if not first_level_nodes:
         documents.extend(
-            _chunk_nodeless_python_code(python_code, file_path))
+            _chunk_nodeless_code_file_content(code_file_content, file_path))
         return
 
     all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
     if all_imports:
         documents.extend(
-            _chunk_import_only_python_code(python_code, file_path))
+            _chunk_import_only_code_file_content(code_file_content, file_path))
 
     # Iterate over first-level nodes
     for first_level_node in ast.iter_child_nodes(tree):
 
         if isinstance(first_level_node, ast.ClassDef):
             documents.extend(
-                _handle_first_level_class(first_level_node, python_code))
+                _handle_first_level_class(first_level_node, code_file_content))
         elif isinstance(first_level_node, ast.FunctionDef):
             documents.extend(
-                _chunk_first_level_func_node(first_level_node, python_code))
+                _chunk_first_level_func_node(first_level_node, code_file_content))
         elif isinstance(first_level_node, ast.Assign):
             documents.extend(
-                _chunk_first_level_assign_node(first_level_node, python_code))
+                _chunk_first_level_assign_node(first_level_node, code_file_content))
         # else:
         #     documents.extend(
-        #         _handle_not_defined_case(python_code))
+        #         _handle_not_defined_case(code_file_content))
 
 
 
-def _handle_first_level_class(ast_node , python_code):
+def _handle_first_level_class(ast_node , code_file_content):
     """
     Handles classes at the first level of the AST.
     """
@@ -118,7 +118,7 @@ def _handle_first_level_class(ast_node , python_code):
     class_start_line = ast_node.lineno
     class_body_lines = [child.lineno for child in ast_node.body if isinstance(child, ast.FunctionDef)]
     class_end_line = min(class_body_lines, default=ast_node.end_lineno) - 1
-    class_source = '\n'.join(python_code.splitlines()[class_start_line-1:class_end_line])
+    class_source = '\n'.join(code_file_content.splitlines()[class_start_line-1:class_end_line])
 
     metadata = {
         "type": "class",
@@ -141,7 +141,7 @@ def _handle_first_level_class(ast_node , python_code):
             if second_level_node.decorator_list else second_level_node.lineno
         )
         method_end_line = second_level_node.end_lineno
-        method_source = '\n'.join(python_code.splitlines()[method_start_line-1:method_end_line])
+        method_source = '\n'.join(code_file_content.splitlines()[method_start_line-1:method_end_line])
 
         visibility = "internal" if second_level_node.name.startswith("_") else "public"
 
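
The slicing in this hunk works because lineno/end_lineno are 1-based (hence the -1 on the start index only) and because starting from the first decorator's lineno keeps decorators in the same chunk as the signature and body. A small standalone illustration; the sample class is invented for this sketch and is not taken from the repository.

import ast

# Invented sample: one class with a decorated method.
sample = '''\
class Client:
    @staticmethod
    def ping():
        return "pong"
'''

tree = ast.parse(sample)
method = tree.body[0].body[0]  # the FunctionDef for ping()

# Start at the first decorator if present, otherwise at the def line;
# only the start index needs the -1 because line numbers are 1-based.
start = method.decorator_list[0].lineno if method.decorator_list else method.lineno
end = method.end_lineno
print("\n".join(sample.splitlines()[start - 1:end]))
# Prints the decorator, the signature, and the body as one contiguous chunk.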
@@ -159,14 +159,14 @@ def _handle_first_level_class(ast_node , python_code):
     return documents
 
 
-def _handle_not_defined_case(python_code):
+def _handle_not_defined_case(code_file_content):
     documents = []
     documents.extend(
-        _chunk_python_code_by_character(python_code))
+        _chunk_code_file_content_by_character(code_file_content))
     return documents
 
 
-def _chunk_first_level_func_node(ast_node, python_code):
+def _chunk_first_level_func_node(ast_node, code_file_content):
     """
     Handles functions at the first level of the AST.
     """
@@ -176,7 +176,7 @@ def _chunk_first_level_func_node(ast_node, python_code):
         if ast_node.decorator_list else ast_node.lineno
     )
     function_end_line = ast_node.end_lineno
-    function_source = '\n'.join(python_code.splitlines()[function_start_line-1:function_end_line])
+    function_source = '\n'.join(code_file_content.splitlines()[function_start_line-1:function_end_line])
 
     visibility = "internal" if ast_node.name.startswith("_") else "public"
 
@@ -205,7 +205,7 @@ def _chunk_first_level_func_node(ast_node, python_code):
 
 
 
-def _chunk_first_level_assign_node(ast_node, python_code):
+def _chunk_first_level_assign_node(ast_node, code_file_content):
 
     """
     Handles assignment statements at the first level of the AST.
@@ -213,7 +213,7 @@ def _chunk_first_level_assign_node(ast_node, python_code):
     documents = []
     assign_start_line = ast_node.lineno
     assign_end_line = ast_node.end_lineno
-    assign_source = '\n'.join(python_code.splitlines()[assign_start_line-1:assign_end_line])
+    assign_source = '\n'.join(code_file_content.splitlines()[assign_start_line-1:assign_end_line])
 
     # Create metadata without imports
     metadata = {"type": "Assign"}
@@ -229,7 +229,7 @@ def _chunk_first_level_assign_node(ast_node, python_code):
 
 
 
-def _chunk_import_only_python_code(python_code, file_path):
+def _chunk_import_only_code_file_content(code_file_content, file_path):
     """
     Handles cases where the first-level nodes are only imports.
     """
@@ -244,13 +244,13 @@ def _chunk_import_only_python_code(python_code, file_path):
 
     # Create and store a Document with the full source code
     doc = Document(
-        page_content=python_code,
+        page_content=code_file_content,
         metadata=metadata
     )
     documents.append(doc)
     return documents
 
-def _chunk_nodeless_python_code(python_code, file_path):
+def _chunk_nodeless_code_file_content(code_file_content, file_path):
     """
     Handles cases where no top-level nodes are found in the AST.
     """
@@ -265,7 +265,7 @@ def _chunk_nodeless_python_code(python_code, file_path):
 
     # Create and store a Document with the full source code
     doc = Document(
-        page_content=python_code,
+        page_content=code_file_content,
         metadata=metadata
     )
     documents.append(doc)
@@ -277,7 +277,7 @@ def _chunk_nodeless_python_code(python_code, file_path):
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
-def _chunk_python_code_by_character(python_code):
+def _chunk_code_file_content_by_character(code_file_content):
     documents = []
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=512,
@@ -285,7 +285,7 @@ def _chunk_python_code_by_character(python_code):
         separators=[]
     )
 
-    chunks = text_splitter.split_text(python_code)
+    chunks = text_splitter.split_text(code_file_content)
 
     for chunk in chunks:
         doc = Document(
 
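Assuming the renamed entry point keeps the call shape shown in the first hunk (two parallel lists: file contents and their file paths), a hypothetical caller could look like the sketch below. The paths and the surrounding glue are illustrative and are not part of this commit.

# Hypothetical usage sketch; the module name matches the file in this commit,
# the paths are made-up examples.
from pathlib import Path

from chunk_python_code import chunk_pythoncode_and_add_metadata

paths = ["kadi_apy/lib/resources.py", "kadi_apy/cli/commands.py"]
contents = [Path(p).read_text(encoding="utf-8") for p in paths]

docs = chunk_pythoncode_and_add_metadata(contents, paths)
for doc in docs[:3]:
    # Each chunk is a langchain Document; its metadata carries fields such as "type".
    print(doc.metadata)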