bupa1018 committed on
Commit
34426fc
·
1 Parent(s): 24c46ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py CHANGED
@@ -183,6 +183,116 @@ def process_directory(directory, partial_paths=None, file_paths=None):
183
  print(f"Print the text for testing broooo {all_texts}")
184
  return all_texts, file_references
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  # Split text into chunks
187
  def split_into_chunks(texts, references, chunk_size, chunk_overlap):
188
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
183
  print(f"Print the text for testing broooo {all_texts}")
184
  return all_texts, file_references
185
 
186
+ import ast
187
+
188
def get_source_segment(source_lines, node):
    """Return the exact source text spanned by an AST node.

    Args:
        source_lines: The lines of the parsed source with their line endings
            kept (e.g. ``content.splitlines(keepends=True)``). The list is
            not modified.
        node: An ``ast`` node carrying ``lineno`` and ``col_offset``; the
            optional ``end_lineno`` / ``end_col_offset`` bound the end.

    Returns:
        The text from the node's start position to its end position. When the
        node has no end position, the text runs to the end of the node's
        first line.
    """
    start_line, start_col = node.lineno - 1, node.col_offset
    end_lineno = getattr(node, 'end_lineno', None)
    end_col = getattr(node, 'end_col_offset', None)
    end_line = start_line if end_lineno is None else end_lineno - 1

    # Slicing creates a copy, so the caller's list is left untouched.
    lines = source_lines[start_line:end_line + 1]

    # AST column offsets count UTF-8 bytes, not characters, so the cuts are
    # made on the encoded line. The end is trimmed before the start: for a
    # single-line node both offsets refer to the same line, and cutting the
    # start first would shift the position the end offset points at.
    if end_col is not None:
        lines[-1] = lines[-1].encode('utf-8')[:end_col].decode('utf-8')
    lines[0] = lines[0].encode('utf-8')[start_col:].decode('utf-8')

    return ''.join(lines)
198
+
199
+ from langchain.schema import Document
200
+
201
def _append_piece(chunks, current_chunk, piece, char_limit, new_chunk_prefix=""):
    """Add one source piece to the chunk being built, flushing it if full.

    Returns the updated text of the in-progress chunk.
    """
    piece = piece.strip()
    # The in-progress chunk always ends with "\n", which is stripped when the
    # chunk is emitted, so this sum is the exact length of the emitted chunk.
    if len(current_chunk) + len(piece) <= char_limit:
        return f"{current_chunk}{piece}\n"
    if current_chunk:
        chunks.append(current_chunk.strip())
    # A piece that is larger than char_limit on its own still becomes a
    # single (oversized) chunk; pieces are never split.
    return f"{new_chunk_prefix}{piece}\n"


def chunk_python_file_content(content, char_limit=1572):
    """Split Python source into chunks along class/method/function borders.

    Class headers (everything from the ``class`` line up to the first method)
    and their methods are emitted first, followed by the module's top-level
    functions. Consecutive pieces are packed into one chunk while the chunk
    stays within ``char_limit``. A method that opens a new chunk is prefixed
    with a comment naming its class. Module-level statements outside classes
    and functions (imports, constants, ...) are not part of any chunk.

    Args:
        content: Python source code as a string.
        char_limit: Target maximum number of characters per chunk.

    Returns:
        A list of chunk strings, each stripped of surrounding whitespace.

    Raises:
        SyntaxError: If ``content`` cannot be parsed by ``ast.parse``.
    """
    function_types = (ast.FunctionDef, ast.AsyncFunctionDef)
    source_lines = content.splitlines(keepends=True)

    # Parse the content into an abstract syntax tree (AST)
    tree = ast.parse(content)

    chunks = []
    current_chunk = ""

    # All class definitions, plus only the functions defined directly at
    # module level: methods are emitted together with their class below, so
    # walking the whole tree for functions would emit each of them twice.
    class_nodes = [node for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]
    function_nodes = [node for node in tree.body if isinstance(node, function_types)]

    for class_node in class_nodes:
        method_nodes = [node for node in class_node.body if isinstance(node, function_types)]

        # The class header ends where the first method starts, or at the end
        # of the class when it has no methods.
        if method_nodes:
            header_end = method_nodes[0].lineno - 1
        else:
            header_end = class_node.end_lineno
        class_def = ''.join(source_lines[class_node.lineno - 1:header_end])
        current_chunk = _append_piece(chunks, current_chunk, class_def, char_limit)

        method_prefix = f"# This is a class method of class: {class_node.name}\n"
        for method_node in method_nodes:
            method_def = ast.get_source_segment(content, method_node)
            current_chunk = _append_piece(
                chunks, current_chunk, method_def, char_limit,
                new_chunk_prefix=method_prefix,
            )

    for function_node in function_nodes:
        function_def = ast.get_source_segment(content, function_node)
        current_chunk = _append_piece(chunks, current_chunk, function_def, char_limit)

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
278
+
279
+
280
+
281
+ # Split python code into chunks
282
def split_pythoncode_into_chunks(texts, references, chunk_size, chunk_overlap):
    """Turn Python source texts into a flat list of ``Document`` chunks.

    Each text is paired with the reference at the same position and split by
    ``chunk_python_file_content`` using ``chunk_size`` as the character limit.
    Every resulting chunk is wrapped in a ``Document`` whose metadata stores
    the reference under the ``"source"`` key.

    Args:
        texts: Iterable of Python source strings.
        references: Iterable of references, paired with ``texts`` via ``zip``.
        chunk_size: Character limit passed on as ``char_limit``.
        chunk_overlap: Accepted for signature parity; not used by this
            function.

    Returns:
        The list of ``Document`` objects, in input order.
    """
    documents = []

    for source_text, source_ref in zip(texts, references):
        documents.extend(
            Document(page_content=piece, metadata={"source": source_ref})
            for piece in chunk_python_file_content(source_text, char_limit=chunk_size)
        )

    print(f"Total number of chunks: {len(documents)}")
    return documents
294
+
295
+
296
  # Split text into chunks
297
  def split_into_chunks(texts, references, chunk_size, chunk_overlap):
298
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)