bupa1018 committed on
Commit
f26d824
·
1 Parent(s): fb23588

Update chunk_python_code.py

Browse files
Files changed (1) hide show
  1. chunk_python_code.py +38 -0
chunk_python_code.py CHANGED
@@ -1,6 +1,44 @@
1
  import ast
2
  from langchain.schema import Document
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
 
6
  def chunk_python_code_with_metadata(python_code, file_path):
 
1
  import ast
2
  from langchain.schema import Document
3
 
4
def split_python_code_into_chunks(texts, file_paths):
    """Split Python sources into metadata-tagged chunks, one file at a time.

    Each text is handed to ``chunk_python_code_with_metadata`` together with
    its file path, and the chunks from all files are concatenated in input
    order.

    Author's notes on the underlying splitter (implemented in
    ``chunk_python_code_with_metadata``): it is a custom Python code
    splitter that iterates through the child nodes of the AST (max child
    depth = 2). It aims to keep the full body of a method together with its
    signature (decorators included) in one chunk, and adds method-specific
    metadata, e.g.

    * visibility: public, _internal
    * type: "class", "methods", "command" (CLI commands)
    * source

    with the intent of filtering on that metadata when retrieving
    potentially useful snippets.

    Args:
        texts: Iterable of Python source texts; each one is passed to the
            chunker as its ``python_code`` argument.
        file_paths: Iterable of file paths, paired positionally with
            ``texts``.

    Returns:
        A flat list holding every chunk produced for every
        ``(text, file_path)`` pair.

    Note:
        ``zip`` stops at the shorter input, so surplus entries in the longer
        one are silently ignored.
    """
    chunks = []
    for text, file_path in zip(texts, file_paths):
        chunks.extend(chunk_python_code_with_metadata(text, file_path))
    return chunks
23
+
24
+
25
+ # Split text into chunks
26
def split_into_chunks(texts, references, chunk_size, chunk_overlap):
    """Split plain documentation texts into ``Document`` chunks.

    Every text is split with a ``RecursiveCharacterTextSplitter`` and each
    resulting piece is wrapped in a ``Document`` whose metadata records the
    reference it came from and marks it with ``"usage": "doc"``.

    Args:
        texts: Iterable of texts to split.
        references: Iterable of source references, paired positionally with
            ``texts``; stored under the ``"source"`` metadata key.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        A flat list of ``Document`` objects, in input order.

    Note:
        ``zip`` stops at the shorter input, so surplus entries in the longer
        one are silently ignored.
    """
    # Imported here because the module's visible top-level imports do not
    # provide this name; without it the call below raises NameError.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = []
    for text, reference in zip(texts, references):
        chunks.extend(
            Document(
                page_content=chunk,
                metadata={
                    "source": reference,
                    "usage": "doc",
                },
            )
            for chunk in text_splitter.split_text(text)
        )
    return chunks
42
 
43
 
44
  def chunk_python_code_with_metadata(python_code, file_path):