bupa1018 committed on
Commit
0a9a29e
·
1 Parent(s): 81e8d86

Update chunk_python_code.py

Browse files
Files changed (1) hide show
  1. chunk_python_code.py +66 -100
chunk_python_code.py CHANGED
@@ -1,6 +1,9 @@
1
  import ast
2
  from langchain.schema import Document
3
 
 
 
 
4
  def chunk_python_code_with_metadata(source_code, reference):
5
  """
6
  Entry point method to process the Python file.
@@ -19,13 +22,13 @@ def chunk_python_code_with_metadata(source_code, reference):
19
  elif reference.startswith("doc/"):
20
  usage = "doc"
21
  else:
22
- usage = "undefined"
23
 
24
  # Add metadata for usage to all documents
25
  for doc in documents:
26
  doc.metadata["reference"] = reference
27
  doc.metadata["usage"] = usage # Add the determined usage metadata
28
-
29
  return documents
30
 
31
 
@@ -47,12 +50,10 @@ def iterate_ast(source_code, documents, reference):
47
  handle_no_first_level_node_found(documents, source_code, imports_dict, reference)
48
  return
49
 
50
-
51
  all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
52
  if all_imports:
53
  handle_first_level_imports_only(documents, source_code, imports_dict, reference)
54
 
55
-
56
  # Iterate over first-level nodes
57
  for first_level_node in ast.iter_child_nodes(tree):
58
  if isinstance(first_level_node, ast.ClassDef):
@@ -62,22 +63,23 @@ def iterate_ast(source_code, documents, reference):
62
  elif isinstance(first_level_node, ast.Assign):
63
  handle_first_level_assign(first_level_node, documents, source_code, imports_dict)
64
 
65
-
66
 
67
- def handle_first_level_imports_only(documents, source_code, imports_dict, reference):
68
- # Check if the file path before ".py" is "__init__"
 
 
69
  if reference.endswith("__init__.py"):
70
  type = "__init__-file"
71
  else:
72
  type = "undefined"
73
 
 
 
 
74
  # Create and store a Document with the full source code
75
  doc = Document(
76
  page_content=source_code,
77
- metadata={
78
- "type": type,
79
- "imports": imports_dict
80
- }
81
  )
82
  documents.append(doc)
83
 
@@ -99,6 +101,7 @@ def extract_imports(tree):
99
  imports_dict[alias.name] = full_name
100
  return imports_dict
101
 
 
102
  def analyze_imports(node, imports_dict):
103
  """
104
  Analyzes the given node's body and signature to find relevant imports.
@@ -109,173 +112,136 @@ def analyze_imports(node, imports_dict):
109
  relevant_imports.add(imports_dict[sub_node.id])
110
  return list(relevant_imports)
111
 
112
- def handle_not_yet_defined_first_level_cases(documents, source_code, imports_dict):
113
- if source_code:
114
- doc = Document(
115
- page_content=source_code,
116
- metadata={
117
- "type": "undefined",
118
- "imports": imports_dict
119
- }
120
- )
121
- documents.append(doc)
122
-
123
-
124
 
125
  def handle_no_first_level_node_found(documents, source_code, imports_dict, reference):
126
  """
127
  Handles cases where no top-level nodes are found in the AST.
128
- Stores the full content (likely comments) in a Document object
129
- with metadata indicating type 'no code' or 'init' based on the reference.
130
  """
131
- # Check if the file path before ".py" is "__init__"
132
  if reference.endswith("__init__.py"):
133
  type = "__init__-file"
134
  else:
135
  type = "undefined"
136
 
 
 
 
137
  # Create and store a Document with the full source code
138
  doc = Document(
139
  page_content=source_code,
140
- metadata={
141
- "type": type,
142
- "imports": imports_dict
143
- }
144
  )
145
  documents.append(doc)
146
 
147
 
148
  def handle_first_level_assign(assign_node, documents, source_code, imports_dict):
149
  """
150
- Handles assignment statements at the first level of the AST by storing them
151
- in a Document object with metadata, including relevant imports.
152
  """
153
- # Extract assignment source code
154
  assign_start_line = assign_node.lineno
155
  assign_end_line = assign_node.end_lineno
156
  assign_source = '\n'.join(source_code.splitlines()[assign_start_line-1:assign_end_line])
157
 
158
- # Extract relevant imports for this assignment
159
  assign_imports = analyze_imports(assign_node, imports_dict)
160
-
 
 
 
161
  # Create and store Document for the assignment
162
  doc = Document(
163
  page_content=assign_source,
164
- metadata={
165
- "type": "Assign",
166
- "imports": assign_imports
167
- }
168
  )
169
  documents.append(doc)
170
 
171
- def handle_first_level_class(class_node, documents, source_code, imports_dict):
172
 
 
173
  """
174
- Handles classes at the first level of the AST by storing them
175
- in a Document object with metadata, including relevant imports.
176
  """
177
- # Extract class source code
178
  class_start_line = class_node.lineno
179
-
180
- # Find the line where the first function (def) starts or the next top-level node
181
  class_body_lines = [child.lineno for child in class_node.body if isinstance(child, ast.FunctionDef)]
182
- class_end_line = min(class_body_lines, default=class_node.end_lineno) - 1 # Use `-1` to exclude the next node
183
-
184
- # Generate the class source code
185
  class_source = '\n'.join(source_code.splitlines()[class_start_line-1:class_end_line])
186
-
187
- # Extract relevant imports for this class
188
  class_imports = analyze_imports(class_node, imports_dict)
189
-
 
 
 
 
 
 
 
 
190
  # Create and store Document for the class
191
  doc = Document(
192
  page_content=class_source,
193
- metadata={
194
- "type": "class",
195
- "class": class_node.name,
196
- "visibility": "public",
197
- "imports": class_imports,
198
- }
199
  )
200
  documents.append(doc)
201
 
202
-
203
  # Handle methods within the class
204
  for second_level_node in ast.iter_child_nodes(class_node):
205
  if isinstance(second_level_node, ast.FunctionDef):
206
- # Extract method source code
207
  method_start_line = (
208
  second_level_node.decorator_list[0].lineno
209
  if second_level_node.decorator_list else second_level_node.lineno
210
  )
211
  method_end_line = second_level_node.end_lineno
212
  method_source = '\n'.join(source_code.splitlines()[method_start_line-1:method_end_line])
213
-
214
- # Determine visibility metadata
215
  visibility = "internal" if second_level_node.name.startswith("_") else "public"
216
- # Extract relevant imports for this method
217
  method_imports = analyze_imports(second_level_node, imports_dict)
218
-
219
- # Create and store Document
220
 
221
  doc = Document(
222
  page_content=method_source,
223
  metadata={
224
  "type": "method",
225
  "method": second_level_node.name,
226
- "visibility": "visibility",
227
- "imports": method_imports,
228
  "class": class_node.name
229
  }
230
  )
231
  documents.append(doc)
232
 
 
233
  def handle_first_level_func(function_node, documents, source_code, imports_dict):
234
  """
235
- Handles functions at the first level of the AST by storing them
236
- in a Document object with metadata, including relevant imports.
237
  """
238
- # Extract function source code
239
  function_start_line = (
240
  function_node.decorator_list[0].lineno
241
  if function_node.decorator_list else function_node.lineno
242
  )
243
  function_end_line = function_node.end_lineno
244
  function_source = '\n'.join(source_code.splitlines()[function_start_line-1:function_end_line])
245
-
246
- # Determine visibility metadata
 
 
247
  visibility = "internal" if function_node.name.startswith("_") else "public"
248
-
249
- # Check if the function is a CLI command (e.g., decorated with `@apy_command`)
250
  is_command = any(
251
- decorator.id == "apy_command" # Check decorator name
252
  for decorator in function_node.decorator_list
253
- if hasattr(decorator, "id") # Ensure the decorator has an identifier
254
  )
255
-
256
- # Extract relevant imports for this function
257
- function_imports = analyze_imports(function_node, imports_dict)
258
-
259
- # Create and store Document
260
-
261
  if is_command:
262
- doc = Document(
263
- page_content=function_source,
264
- metadata={
265
- "type": "command",
266
- "command": function_node.name,
267
- "visibility": "public",
268
- "imports": function_imports
269
- }
270
- )
271
  else:
272
- doc = Document(
273
- page_content=function_source,
274
- metadata={
275
- "type": "function",
276
- "method": function_node.name,
277
- "visibility": visibility,
278
- "imports": function_imports
279
- }
280
- )
281
- documents.append(doc)
 
1
  import ast
2
  from langchain.schema import Document
3
 
4
+ import ast
5
+ from langchain.schema import Document
6
+
7
  def chunk_python_code_with_metadata(source_code, reference):
8
  """
9
  Entry point method to process the Python file.
 
22
  elif reference.startswith("doc/"):
23
  usage = "doc"
24
  else:
25
+ usage = "undefined"
26
 
27
  # Add metadata for usage to all documents
28
  for doc in documents:
29
  doc.metadata["reference"] = reference
30
  doc.metadata["usage"] = usage # Add the determined usage metadata
31
+ print(doc)
32
  return documents
33
 
34
 
 
50
  handle_no_first_level_node_found(documents, source_code, imports_dict, reference)
51
  return
52
 
 
53
  all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
54
  if all_imports:
55
  handle_first_level_imports_only(documents, source_code, imports_dict, reference)
56
 
 
57
  # Iterate over first-level nodes
58
  for first_level_node in ast.iter_child_nodes(tree):
59
  if isinstance(first_level_node, ast.ClassDef):
 
63
  elif isinstance(first_level_node, ast.Assign):
64
  handle_first_level_assign(first_level_node, documents, source_code, imports_dict)
65
 
 
66
 
67
def handle_first_level_imports_only(documents, source_code, imports_dict, reference):
    """
    Store a file whose first-level nodes are all imports as a single Document.

    The complete ``source_code`` becomes the page content. The metadata
    "type" is "__init__-file" when ``reference`` ends with "__init__.py"
    and "undefined" otherwise; "imports" holds ``imports_dict``, or the
    string "none" when that mapping is empty. The Document is appended to
    ``documents`` in place; nothing is returned.
    """
    is_package_init = reference.endswith("__init__.py")
    file_kind = "__init__-file" if is_package_init else "undefined"

    # An empty imports mapping is recorded as the literal string "none".
    recorded_imports = imports_dict if imports_dict else "none"

    documents.append(
        Document(
            page_content=source_code,
            metadata={"type": file_kind, "imports": recorded_imports},
        )
    )
85
 
 
101
  imports_dict[alias.name] = full_name
102
  return imports_dict
103
 
104
+
105
  def analyze_imports(node, imports_dict):
106
  """
107
  Analyzes the given node's body and signature to find relevant imports.
 
112
  relevant_imports.add(imports_dict[sub_node.id])
113
  return list(relevant_imports)
114
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
def handle_no_first_level_node_found(documents, source_code, imports_dict, reference):
    """
    Store a file whose AST has no top-level nodes as a single Document.

    The full ``source_code`` is kept as the page content. Metadata:
      - "type": "__init__-file" if ``reference`` ends with "__init__.py",
        else "undefined".
      - "imports": ``imports_dict``, or the string "none" if it is empty.

    The new Document is appended to ``documents``; there is no return value.
    """
    metadata = {}

    # Keys are inserted in the order "type", then "imports".
    if reference.endswith("__init__.py"):
        metadata["type"] = "__init__-file"
    else:
        metadata["type"] = "undefined"

    if imports_dict:
        metadata["imports"] = imports_dict
    else:
        # Empty import information is stored as the string "none".
        metadata["imports"] = "none"

    whole_file_doc = Document(page_content=source_code, metadata=metadata)
    documents.append(whole_file_doc)
134
 
135
 
136
def handle_first_level_assign(assign_node, documents, source_code, imports_dict):
    """
    Store a first-level assignment statement as its own Document.

    The page content is the text of the source lines spanned by
    ``assign_node``. The metadata "type" is "Assign" and "imports" is the
    list returned by ``analyze_imports`` for the node, or the string "none"
    when that list is empty. The Document is appended to ``documents``.
    """
    # lineno / end_lineno are 1-based and inclusive, so only the start
    # needs shifting to form a 0-based, end-exclusive slice.
    slice_start = assign_node.lineno - 1
    slice_stop = assign_node.end_lineno
    all_lines = source_code.splitlines()
    snippet = '\n'.join(all_lines[slice_start:slice_stop])

    used_imports = analyze_imports(assign_node, imports_dict)

    documents.append(
        Document(
            page_content=snippet,
            metadata={
                "type": "Assign",
                "imports": used_imports if used_imports else "none",
            },
        )
    )
156
 
 
157
 
158
def handle_first_level_class(class_node, documents, source_code, imports_dict):
    """
    Split a first-level class into a header Document plus one Document per method.

    The header chunk starts at the ``class`` line and stops just before the
    first method, where a decorated method begins at its first decorator.
    A class without any methods is stored in full, up to and including its
    last line. Each ``ast.FunctionDef`` in the class body is then stored as a
    separate "method" Document, decorators included.

    Metadata "imports" is the list returned by ``analyze_imports`` for the
    respective node, or the string "none" when that list is empty. Method
    "visibility" is "internal" for names starting with "_" and "public"
    otherwise. All Documents are appended to ``documents`` in place.
    """

    def first_line(node):
        # A decorated definition begins at its first decorator, not at `def`;
        # node.lineno alone points at the `def` line.
        return node.decorator_list[0].lineno if node.decorator_list else node.lineno

    source_lines = source_code.splitlines()
    class_start_line = class_node.lineno

    # Body statements are in source order, so the first match is the earliest method.
    methods = [child for child in class_node.body if isinstance(child, ast.FunctionDef)]
    if methods:
        # Stop one line before the first method (or before its decorators),
        # so decorator lines are not duplicated in the header chunk.
        class_end_line = first_line(methods[0]) - 1
    else:
        # No methods: keep the whole class. end_lineno is inclusive, so it
        # must not be reduced by one here or the last line would be lost.
        class_end_line = class_node.end_lineno

    class_source = '\n'.join(source_lines[class_start_line - 1:class_end_line])

    # Extract relevant imports
    class_imports = analyze_imports(class_node, imports_dict)

    # Create metadata with "none" if imports are empty
    metadata = {
        "type": "class",
        "class": class_node.name,
        "visibility": "public",
        "imports": "none" if not class_imports else class_imports
    }

    # Create and store Document for the class
    documents.append(Document(page_content=class_source, metadata=metadata))

    # Handle methods within the class
    for method_node in methods:
        method_start_line = first_line(method_node)
        method_end_line = method_node.end_lineno
        method_source = '\n'.join(source_lines[method_start_line - 1:method_end_line])

        visibility = "internal" if method_node.name.startswith("_") else "public"
        method_imports = analyze_imports(method_node, imports_dict)

        documents.append(
            Document(
                page_content=method_source,
                metadata={
                    "type": "method",
                    "method": method_node.name,
                    "visibility": visibility,
                    "imports": "none" if not method_imports else method_imports,
                    "class": class_node.name
                }
            )
        )
209
 
210
+
211
def handle_first_level_func(function_node, documents, source_code, imports_dict):
    """
    Store a first-level function as its own Document.

    The page content spans from the function's first decorator (or its own
    line when undecorated) through its last line. A function decorated with
    the bare name ``apy_command`` gets type "command" and its name under the
    "command" key; any other function gets type "function" and its name
    under the "method" key. "visibility" is "internal" for names starting
    with "_" and "public" otherwise; "imports" is the list returned by
    ``analyze_imports``, or the string "none" when that list is empty.
    The Document is appended to ``documents``.
    """
    decorators = function_node.decorator_list

    # Include decorator lines in the chunk when there are any.
    if decorators:
        first_line = decorators[0].lineno
    else:
        first_line = function_node.lineno
    last_line = function_node.end_lineno
    snippet = '\n'.join(source_code.splitlines()[first_line-1:last_line])

    used_imports = analyze_imports(function_node, imports_dict)

    if function_node.name.startswith("_"):
        visibility = "internal"
    else:
        visibility = "public"

    # Only decorator nodes that expose an `id` attribute are compared, so
    # decorators without one never match.
    is_command = False
    for decorator in decorators:
        if hasattr(decorator, "id") and decorator.id == "apy_command":
            is_command = True
            break

    metadata = {
        "type": "command" if is_command else "function",
        "visibility": visibility,
        "imports": used_imports if used_imports else "none"
    }
    # The function name is stored under a key that depends on the kind.
    name_key = "command" if is_command else "method"
    metadata[name_key] = function_node.name

    documents.append(Document(page_content=snippet, metadata=metadata))