Spaces:
Sleeping
Sleeping
Update chunk_python_code.py
Browse files- chunk_python_code.py +78 -73
chunk_python_code.py
CHANGED
@@ -66,79 +66,6 @@ def _iterate_ast(python_code, documents, file_path):
|
|
66 |
_handle_not_defined_case(python_code))
|
67 |
|
68 |
|
69 |
-
def _chunk_import_only_python_code(python_code, file_path):
|
70 |
-
"""
|
71 |
-
Handles cases where the first-level nodes are only imports.
|
72 |
-
"""
|
73 |
-
documents = []
|
74 |
-
if file_path.endswith("__init__.py"):
|
75 |
-
type = "__init__-file"
|
76 |
-
else:
|
77 |
-
type = "undefined"
|
78 |
-
|
79 |
-
# Create metadata without imports
|
80 |
-
metadata = {"type": type}
|
81 |
-
|
82 |
-
# Create and store a Document with the full source code
|
83 |
-
doc = Document(
|
84 |
-
page_content=python_code,
|
85 |
-
metadata=metadata
|
86 |
-
)
|
87 |
-
documents.append(doc)
|
88 |
-
return documents
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
def _handle_not_defined_case(python_code):
|
93 |
-
documents = []
|
94 |
-
documents.extend(
|
95 |
-
_chunk_python_code_by_character(python_code)
|
96 |
-
return documents
|
97 |
-
|
98 |
-
|
99 |
-
def _chunk_nodeless_python_code(python_code, file_path):
|
100 |
-
"""
|
101 |
-
Handles cases where no top-level nodes are found in the AST.
|
102 |
-
"""
|
103 |
-
documents = []
|
104 |
-
if file_path.endswith("__init__.py"):
|
105 |
-
type = "__init__-file"
|
106 |
-
else:
|
107 |
-
type = "undefined"
|
108 |
-
|
109 |
-
# Create metadata without imports
|
110 |
-
metadata = {"type": type}
|
111 |
-
|
112 |
-
# Create and store a Document with the full source code
|
113 |
-
doc = Document(
|
114 |
-
page_content=python_code,
|
115 |
-
metadata=metadata
|
116 |
-
)
|
117 |
-
documents.append(doc)
|
118 |
-
|
119 |
-
return documents
|
120 |
-
|
121 |
-
def _chunk_first_level_assign_node(ast_node, python_code):
|
122 |
-
|
123 |
-
"""
|
124 |
-
Handles assignment statements at the first level of the AST.
|
125 |
-
"""
|
126 |
-
documents = []
|
127 |
-
assign_start_line = ast_node.lineno
|
128 |
-
assign_end_line = ast_node.end_lineno
|
129 |
-
assign_source = '\n'.join(python_code.splitlines()[assign_start_line-1:assign_end_line])
|
130 |
-
|
131 |
-
# Create metadata without imports
|
132 |
-
metadata = {"type": "Assign"}
|
133 |
-
|
134 |
-
# Create and store Document for the assignment
|
135 |
-
doc = Document(
|
136 |
-
page_content=assign_source,
|
137 |
-
metadata=metadata
|
138 |
-
)
|
139 |
-
documents.append(doc)
|
140 |
-
|
141 |
-
return documents
|
142 |
|
143 |
def _handle_first_level_class(ast_node , python_code):
|
144 |
"""
|
@@ -188,6 +115,14 @@ def _handle_first_level_class(ast_node , python_code):
|
|
188 |
|
189 |
return documents
|
190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
def _chunk_first_level_func_node(ast_node, python_code):
|
192 |
"""
|
193 |
Handles functions at the first level of the AST.
|
@@ -226,6 +161,76 @@ def _chunk_first_level_func_node(ast_node, python_code):
|
|
226 |
return documents
|
227 |
|
228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
230 |
|
231 |
|
|
|
66 |
_handle_not_defined_case(python_code))
|
67 |
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
def _handle_first_level_class(ast_node , python_code):
|
71 |
"""
|
|
|
115 |
|
116 |
return documents
|
117 |
|
118 |
+
|
119 |
+
def _handle_not_defined_case(python_code):
|
120 |
+
documents = []
|
121 |
+
documents.extend(
|
122 |
+
_chunk_python_code_by_character(python_code))
|
123 |
+
return documents
|
124 |
+
|
125 |
+
|
126 |
def _chunk_first_level_func_node(ast_node, python_code):
|
127 |
"""
|
128 |
Handles functions at the first level of the AST.
|
|
|
161 |
return documents
|
162 |
|
163 |
|
164 |
+
|
165 |
+
def _chunk_first_level_assign_node(ast_node, python_code):
|
166 |
+
|
167 |
+
"""
|
168 |
+
Handles assignment statements at the first level of the AST.
|
169 |
+
"""
|
170 |
+
documents = []
|
171 |
+
assign_start_line = ast_node.lineno
|
172 |
+
assign_end_line = ast_node.end_lineno
|
173 |
+
assign_source = '\n'.join(python_code.splitlines()[assign_start_line-1:assign_end_line])
|
174 |
+
|
175 |
+
# Create metadata without imports
|
176 |
+
metadata = {"type": "Assign"}
|
177 |
+
|
178 |
+
# Create and store Document for the assignment
|
179 |
+
doc = Document(
|
180 |
+
page_content=assign_source,
|
181 |
+
metadata=metadata
|
182 |
+
)
|
183 |
+
documents.append(doc)
|
184 |
+
|
185 |
+
return documents
|
186 |
+
|
187 |
+
|
188 |
+
|
189 |
+
def _chunk_import_only_python_code(python_code, file_path):
|
190 |
+
"""
|
191 |
+
Handles cases where the first-level nodes are only imports.
|
192 |
+
"""
|
193 |
+
documents = []
|
194 |
+
if file_path.endswith("__init__.py"):
|
195 |
+
type = "__init__-file"
|
196 |
+
else:
|
197 |
+
type = "undefined"
|
198 |
+
|
199 |
+
# Create metadata without imports
|
200 |
+
metadata = {"type": type}
|
201 |
+
|
202 |
+
# Create and store a Document with the full source code
|
203 |
+
doc = Document(
|
204 |
+
page_content=python_code,
|
205 |
+
metadata=metadata
|
206 |
+
)
|
207 |
+
documents.append(doc)
|
208 |
+
return documents
|
209 |
+
|
210 |
+
def _chunk_nodeless_python_code(python_code, file_path):
|
211 |
+
"""
|
212 |
+
Handles cases where no top-level nodes are found in the AST.
|
213 |
+
"""
|
214 |
+
documents = []
|
215 |
+
if file_path.endswith("__init__.py"):
|
216 |
+
type = "__init__-file"
|
217 |
+
else:
|
218 |
+
type = "undefined"
|
219 |
+
|
220 |
+
# Create metadata without imports
|
221 |
+
metadata = {"type": type}
|
222 |
+
|
223 |
+
# Create and store a Document with the full source code
|
224 |
+
doc = Document(
|
225 |
+
page_content=python_code,
|
226 |
+
metadata=metadata
|
227 |
+
)
|
228 |
+
documents.append(doc)
|
229 |
+
|
230 |
+
return documents
|
231 |
+
|
232 |
+
|
233 |
+
|
234 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
235 |
|
236 |
|