Spaces:
Sleeping
Sleeping
Update chunk_python_code.py
Browse files- chunk_python_code.py +25 -25
chunk_python_code.py
CHANGED
@@ -63,7 +63,7 @@ def _iterate_ast(python_code, documents, file_path):
|
|
63 |
_chunk_first_level_assign_node(first_level_node, documents, python_code))
|
64 |
else:
|
65 |
documents.extend(
|
66 |
-
|
67 |
|
68 |
|
69 |
def _chunk_import_only_python_code(python_code, file_path):
|
@@ -89,34 +89,12 @@ def _chunk_import_only_python_code(python_code, file_path):
|
|
89 |
|
90 |
|
91 |
|
92 |
-
def
|
93 |
documents = []
|
94 |
documents.extend(
|
95 |
-
_chunk_python_code_by_character)
|
96 |
return documents
|
97 |
|
98 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
99 |
-
|
100 |
-
|
101 |
-
def _chunk_python_code_by_character(python_code):
|
102 |
-
documents = []
|
103 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
104 |
-
chunk_size=512,
|
105 |
-
chunk_overlap=128,
|
106 |
-
separators=[]
|
107 |
-
)
|
108 |
-
|
109 |
-
chunks = text_splitter.split_text(python_code)
|
110 |
-
|
111 |
-
for chunk in chunks:
|
112 |
-
doc = Document(
|
113 |
-
page_content=chunk
|
114 |
-
)
|
115 |
-
documents.append(doc)
|
116 |
-
|
117 |
-
return documents
|
118 |
-
|
119 |
-
|
120 |
|
121 |
def _chunk_nodeless_python_code(python_code, file_path):
|
122 |
"""
|
@@ -245,4 +223,26 @@ def _chunk_first_level_func_node(ast_node, python_code):
|
|
245 |
)
|
246 |
documents.append(doc)
|
247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
return documents
|
|
|
63 |
_chunk_first_level_assign_node(first_level_node, documents, python_code))
|
64 |
else:
|
65 |
documents.extend(
|
66 |
+
_handle_not_defined_case(python_code))
|
67 |
|
68 |
|
69 |
def _chunk_import_only_python_code(python_code, file_path):
|
|
|
89 |
|
90 |
|
91 |
|
92 |
+
def _handle_not_defined_case(python_code):
|
93 |
documents = []
|
94 |
documents.extend(
|
95 |
+
_chunk_python_code_by_character(python_code)
|
96 |
return documents
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
|
99 |
def _chunk_nodeless_python_code(python_code, file_path):
|
100 |
"""
|
|
|
223 |
)
|
224 |
documents.append(doc)
|
225 |
|
226 |
+
return documents
|
227 |
+
|
228 |
+
|
229 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
230 |
+
|
231 |
+
|
232 |
+
def _chunk_python_code_by_character(python_code):
|
233 |
+
documents = []
|
234 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
235 |
+
chunk_size=512,
|
236 |
+
chunk_overlap=128,
|
237 |
+
separators=[]
|
238 |
+
)
|
239 |
+
|
240 |
+
chunks = text_splitter.split_text(python_code)
|
241 |
+
|
242 |
+
for chunk in chunks:
|
243 |
+
doc = Document(
|
244 |
+
page_content=chunk
|
245 |
+
)
|
246 |
+
documents.append(doc)
|
247 |
+
|
248 |
return documents
|