bupa1018 committed
Commit 1aeb49e · 1 Parent(s): 0eb49ff

Update process_python_code

Files changed (1)
  1. process_python_code +142 -11
process_python_code CHANGED
@@ -1,38 +1,72 @@
  import ast
- from langchain.schema import Document
+ from langchain.schema import Document # Assuming "Document" is imported from LangChain
 
-
- def chunk_python_code_with_metadata(source_code, references):
+ def chunkPythonFiles(source_code, reference):
      """
      Entry point method to process the Python file.
      It invokes the iterate_ast function.
      """
      documents = []
-     print(f"Processing file: {file_path}")
-     iterate_ast(source_code, documents)
+     print(f"Processing file: {reference}")
+     iterate_ast(source_code, documents, reference)
      for doc in documents:
-         print(f"Stored Document:\n")
-         print(doc)
-     print(len(documents))
+         doc.metadata["reference"] = reference
+         #print("HERE IS A DOC\n", doc)
+     #print(len(documents))
      return documents
 
- def iterate_ast(source_code, documents):
+ def iterate_ast(source_code, documents, reference):
      """
      Parses the AST of the given Python file and delegates
      handling to specific methods based on node types.
      """
      # Parse the source code into an abstract syntax tree (AST)
-     tree = ast.parse(source_code, filename=file_path)
+     tree = ast.parse(source_code, filename=reference)
 
      # Gather all top-level imports for later use
      imports_dict = extract_imports(tree)
+
+     first_level_nodes = list(ast.iter_child_nodes(tree))
+
+     # Check if there are no first-level nodes
+     if not first_level_nodes:
+         handle_no_first_level_node_found(documents, source_code, imports_dict, reference)
+         return
+
+
+     all_imports = all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes)
+     if all_imports:
+         handle_first_level_imports_only(documents, source_code, imports_dict, reference)
 
+
      # Iterate over first-level nodes
      for first_level_node in ast.iter_child_nodes(tree):
          if isinstance(first_level_node, ast.ClassDef):
              handle_first_level_class(first_level_node, documents, source_code, imports_dict)
          elif isinstance(first_level_node, ast.FunctionDef):
              handle_first_level_func(first_level_node, documents, source_code, imports_dict)
+         elif isinstance(first_level_node, ast.Assign):
+             handle_first_level_assign(first_level_node, documents, source_code, imports_dict)
+
+
+
+ def handle_first_level_imports_only(documents, source_code, imports_dict, reference):
+     # Check if the file path before ".py" is "__init__"
+     if reference.endswith("__init__.py"):
+         type = "__init__-file"
+     else:
+         type = "undefined"
+
+     # Create and store a Document with the full source code
+     doc = Document(
+         page_content=source_code,
+         metadata={
+             "type": type,
+             "imports": imports_dict
+         }
+     )
+     documents.append(doc)
+
 
  def extract_imports(tree):
      """
@@ -61,6 +95,65 @@ def analyze_imports(node, imports_dict):
              relevant_imports.add(imports_dict[sub_node.id])
      return list(relevant_imports)
 
+ def handle_not_yet_defined_first_level_cases(documents, source_code, imports_dict):
+     if source_code:
+         doc = Document(
+             page_content=source_code,
+             metadata={
+                 "type": "undefined",
+                 "imports": imports_dict
+             }
+         )
+         documents.append(doc)
+
+
+
+ def handle_no_first_level_node_found(documents, source_code, imports_dict, reference):
+     """
+     Handles cases where no top-level nodes are found in the AST.
+     Stores the full content (likely comments) in a Document object
+     with metadata indicating type 'no code' or 'init' based on the reference.
+     """
+     # Check if the file path before ".py" is "__init__"
+     if reference.endswith("__init__.py"):
+         type = "__init__-file"
+     else:
+         type = "undefined"
+
+     # Create and store a Document with the full source code
+     doc = Document(
+         page_content=source_code,
+         metadata={
+             "type": type,
+             "imports": imports_dict
+         }
+     )
+     documents.append(doc)
+
+
+ def handle_first_level_assign(assign_node, documents, source_code, imports_dict):
+     """
+     Handles assignment statements at the first level of the AST by storing them
+     in a Document object with metadata, including relevant imports.
+     """
+     # Extract assignment source code
+     assign_start_line = assign_node.lineno
+     assign_end_line = assign_node.end_lineno
+     assign_source = '\n'.join(source_code.splitlines()[assign_start_line-1:assign_end_line])
+
+     # Extract relevant imports for this assignment
+     assign_imports = analyze_imports(assign_node, imports_dict)
+
+     # Create and store Document for the assignment
+     doc = Document(
+         page_content=assign_source,
+         metadata={
+             "type": "Assign",
+             "imports": assign_imports
+         }
+     )
+     documents.append(doc)
+
  def handle_first_level_class(class_node, documents, source_code, imports_dict):
 
      """
@@ -87,7 +180,7 @@ def handle_first_level_class(class_node, documents, source_code, imports_dict):
              "type": "class",
              "class": class_node.name,
              "visibility": "public",
-             "imports": class_imports # Add class-specific imports
+             "imports": class_imports,
          }
      )
      documents.append(doc)
@@ -172,3 +265,41 @@ def handle_first_level_func(function_node, documents, source_code, imports_dict)
          }
      )
      documents.append(doc)
+
+
+ # Example usage
+ #file_path = r"C:\Users\Anwender\Downloads\exampleScript.py"
+
+ #with open(file_path, "r", encoding="utf-8") as file:
+ #    source_code = file.read()
+ #chunkPythonFiles(source_code, file_path)
+
+
+ import os
+
+ def process_folder(folder_path):
+     # Initialize a counter for the number of Python files
+     python_file_count = 0
+     docsT = []
+     # Walk through all subdirectories and files in the folder
+     for root, _, files in os.walk(folder_path):
+         for file_name in files:
+             # Create the full file path
+             file_path = os.path.join(root, file_name)
+             #print(file_path)
+
+             # Ensure it's a Python file
+             if file_name.endswith(".py"):
+                 python_file_count += 1 # Increment the counter
+                 with open(file_path, "r", encoding="utf-8") as file:
+                     source_code = file.read()
+                 print(file_name)
+
+                 # Call your function
+                 docs = chunkPythonFiles(source_code, file_path)
+
+                 print("HWHWHWWHWHWHWH!:" ,len(docs))
+                 docsT.extend(docs)
+     # Print the total number of Python files processed
+     print(f"Total Python files processed: {python_file_count}")
+     print(f"Total docs files processed: {len(docsT)}")