tolgadev committed on
Commit
c528bc9
·
verified ·
1 Parent(s): 30689e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +416 -416
app.py CHANGED
@@ -1,416 +1,416 @@
1
- # --------------------------------------------- Libraries ----------------------------------------------------------#
2
- import gradio as gr
3
- from PyPDF2 import PdfReader
4
- import nbformat
5
-
6
- from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter, PythonCodeTextSplitter, Language
7
- from langchain.docstore.document import Document
8
- from langchain_community.document_loaders import Docx2txtLoader, CSVLoader
9
-
10
- # --------------------------------------------- Functions ----------------------------------------------------------#
11
-
12
- def process_uploaded_file(uploaded_file):
13
- text = ""
14
- display_content = ""
15
- file_extension = uploaded_file.name.split(".")[-1]
16
-
17
- if file_extension == "pdf":
18
- try:
19
- # Gradio's uploaded_file.name provides the path to the temporary file
20
- pdf = PdfReader(uploaded_file.name)
21
- for page in pdf.pages:
22
- page_text = page.extract_text()
23
- text += page_text + "\n"
24
- display_content += page_text + "\n"
25
- except Exception as e:
26
- display_content = f"Error reading PDF file: {e}"
27
- text = ""
28
-
29
- elif file_extension == "docx":
30
- try:
31
- docx_loader = Docx2txtLoader(uploaded_file.name)
32
- documents = docx_loader.load()
33
- text = "\n".join([doc.page_content for doc in documents])
34
- display_content = text
35
- except Exception as e:
36
- display_content = f"Error reading DOCX file: {e}"
37
- text = ""
38
-
39
- elif file_extension in ["html", "css", "py", "txt"]:
40
- try:
41
- with open(uploaded_file.name, "r", encoding="utf-8") as f:
42
- file_content = f.read()
43
- display_content = file_content # Display as plain text in Textbox
44
- text = file_content
45
- except Exception as e:
46
- display_content = f"Error reading {file_extension.upper()} file: {e}"
47
- text = ""
48
-
49
- elif file_extension == "ipynb":
50
- try:
51
- # nbformat.read can take a file path
52
- nb_content = nbformat.read(uploaded_file.name, as_version=4)
53
- nb_filtered = [cell for cell in nb_content["cells"] if cell["cell_type"] in ["code", "markdown"]]
54
-
55
- for cell in nb_filtered:
56
- if cell["cell_type"] == "code":
57
- display_content += f"```python\n{cell['source']}\n```\n"
58
- text += cell["source"] + "\n"
59
- elif cell["cell_type"] == "markdown":
60
- display_content += f"{cell['source']}\n"
61
- text += cell["source"] + "\n"
62
- except Exception as e:
63
- display_content = f"Error reading IPYNB file: {e}"
64
- text = ""
65
-
66
- elif file_extension == "csv":
67
- try:
68
- loader = CSVLoader(file_path=uploaded_file.name, encoding="utf-8", csv_args={'delimiter': ','})
69
- documents = loader.load()
70
- text = "\n".join([doc.page_content for doc in documents])
71
- display_content = text # For CSV, display the concatenated text
72
- except Exception as e:
73
- display_content = f"Error reading CSV file: {e}"
74
- text = ""
75
- else:
76
- display_content = "Unsupported file type."
77
- text = ""
78
-
79
- return text, display_content
80
-
81
-
82
- def chunk_recursive(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
83
- if not text:
84
- return [], ""
85
- text_splitter = RecursiveCharacterTextSplitter(
86
- chunk_size=chunk_size,
87
- chunk_overlap=chunk_overlap,
88
- length_function=len,
89
- keep_separator=keep_separator,
90
- add_start_index=add_start_index,
91
- strip_whitespace=strip_whitespace,
92
- )
93
- chunks = text_splitter.create_documents([text])
94
- formatted_chunks = []
95
- for chunk in chunks:
96
- if isinstance(chunk, Document):
97
- formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
98
- else:
99
- formatted_chunks.append({"content": str(chunk), "metadata": {}})
100
-
101
- code_example = f"""
102
- from langchain.text_splitter import RecursiveCharacterTextSplitter
103
-
104
- text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
105
-
106
- text_splitter = RecursiveCharacterTextSplitter(
107
- chunk_size={chunk_size},
108
- chunk_overlap={chunk_overlap},
109
- length_function=len,
110
- keep_separator={keep_separator},
111
- add_start_index={add_start_index},
112
- strip_whitespace={strip_whitespace},
113
- )
114
- chunks = text_splitter.create_documents([text_content])
115
- # Access chunks: chunks[0].page_content, chunks[0].metadata
116
- """
117
- return formatted_chunks, code_example
118
-
119
- def chunk_character(text, chunk_size, chunk_overlap, separator, keep_separator, add_start_index, strip_whitespace):
120
- if not text:
121
- return [], ""
122
-
123
- if isinstance(separator, list):
124
- separator_str = "".join(separator)
125
- else:
126
- separator_str = separator
127
-
128
- text_splitter = CharacterTextSplitter(
129
- separator=separator_str,
130
- chunk_size=chunk_size,
131
- chunk_overlap=chunk_overlap,
132
- length_function=len,
133
- keep_separator=keep_separator,
134
- add_start_index=add_start_index,
135
- strip_whitespace=strip_whitespace,
136
- )
137
- chunks = text_splitter.create_documents([text])
138
- formatted_chunks = []
139
- for chunk in chunks:
140
- if isinstance(chunk, Document):
141
- formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
142
- else:
143
- formatted_chunks.append({"content": str(chunk), "metadata": {}})
144
-
145
- code_example = f"""
146
- from langchain.text_splitter import CharacterTextSplitter
147
-
148
- text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
149
-
150
- text_splitter = CharacterTextSplitter(
151
- separator=\"\"\"{separator_str}\"\"\",
152
- chunk_size={chunk_size},
153
- chunk_overlap={chunk_overlap},
154
- length_function=len,
155
- keep_separator={keep_separator},
156
- add_start_index={add_start_index},
157
- strip_whitespace={strip_whitespace},
158
- )
159
- chunks = text_splitter.create_documents([text_content])
160
- # Access chunks: chunks[0].page_content, chunks[0].metadata
161
- """
162
- return formatted_chunks, code_example
163
-
164
- def chunk_python_code(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
165
- if not text:
166
- return [], ""
167
- text_splitter = PythonCodeTextSplitter(
168
- chunk_size=chunk_size,
169
- chunk_overlap=chunk_overlap,
170
- keep_separator=keep_separator,
171
- add_start_index=add_start_index,
172
- strip_whitespace=strip_whitespace,
173
- )
174
- chunks = text_splitter.create_documents([text])
175
- formatted_chunks = []
176
- for chunk in chunks:
177
- if isinstance(chunk, Document):
178
- formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
179
- else:
180
- formatted_chunks.append({"content": str(chunk), "metadata": {}})
181
-
182
- code_example = f"""
183
- from langchain.text_splitter import PythonCodeTextSplitter
184
-
185
- text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
186
-
187
- text_splitter = PythonCodeTextSplitter(
188
- chunk_size={chunk_size},
189
- chunk_overlap={chunk_overlap},
190
- keep_separator={keep_separator},
191
- add_start_index={add_start_index},
192
- strip_whitespace={strip_whitespace},
193
- )
194
- chunks = text_splitter.create_documents([text_content])
195
- # Access chunks: chunks[0].page_content, chunks[0].metadata
196
- """
197
- return formatted_chunks, code_example
198
-
199
- def chunk_javascript_code(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
200
- if not text:
201
- return [], ""
202
- text_splitter = RecursiveCharacterTextSplitter.from_language(
203
- language=Language.JS,
204
- chunk_size=chunk_size,
205
- chunk_overlap=chunk_overlap,
206
- keep_separator=keep_separator,
207
- add_start_index=add_start_index,
208
- strip_whitespace=strip_whitespace,
209
- )
210
- chunks = text_splitter.create_documents([text])
211
- formatted_chunks = []
212
- for chunk in chunks:
213
- if isinstance(chunk, Document):
214
- formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
215
- else:
216
- formatted_chunks.append({"content": str(chunk), "metadata": {}})
217
-
218
- code_example = f"""
219
- from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
220
-
221
- text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
222
-
223
- text_splitter = RecursiveCharacterTextSplitter.from_language(
224
- language=Language.JS,
225
- chunk_size={chunk_size},
226
- chunk_overlap={chunk_overlap},
227
- keep_separator={keep_separator},
228
- add_start_index={add_start_index},
229
- strip_whitespace={strip_whitespace},
230
- )
231
- chunks = text_splitter.create_documents([text_content])
232
- # Access chunks: chunks[0].page_content, chunks[0].metadata
233
- """
234
- return formatted_chunks, code_example
235
-
236
- def chunk_markdown(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
237
- if not text:
238
- return [], ""
239
- text_splitter = MarkdownTextSplitter(
240
- chunk_size=chunk_size,
241
- chunk_overlap=chunk_overlap,
242
- length_function=len,
243
- keep_separator=keep_separator,
244
- add_start_index=add_start_index,
245
- strip_whitespace=strip_whitespace,
246
- )
247
- chunks = text_splitter.create_documents([text])
248
- formatted_chunks = []
249
- for chunk in chunks:
250
- if isinstance(chunk, Document):
251
- formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
252
- else:
253
- formatted_chunks.append({"content": str(chunk), "metadata": {}})
254
-
255
- code_example = f"""
256
- from langchain.text_splitter import MarkdownTextSplitter
257
-
258
- text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
259
-
260
- text_splitter = MarkdownTextSplitter(
261
- chunk_size={chunk_size},
262
- chunk_overlap={chunk_overlap},
263
- length_function=len,
264
- keep_separator={keep_separator},
265
- add_start_index={add_start_index},
266
- strip_whitespace={strip_whitespace},
267
- )
268
- chunks = text_splitter.create_documents([text_content])
269
- # Access chunks: chunks[0].page_content, chunks[0].metadata
270
- """
271
- return formatted_chunks, code_example
272
-
273
- def main_interface(uploaded_file, chunk_size, chunk_overlap, separator, keep_separator, add_start_index, strip_whitespace):
274
- if uploaded_file is None:
275
- return "", "", [], [], [], [], [], "", "", "", "", "", "", "", "", "", "", ""
276
-
277
- # Ensure chunk_size and chunk_overlap are integers
278
- chunk_size = int(chunk_size)
279
- chunk_overlap = int(chunk_overlap)
280
-
281
- raw_text, display_content = process_uploaded_file(uploaded_file)
282
-
283
- recursive_chunks, recursive_code = chunk_recursive(raw_text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace)
284
- character_chunks, character_code = chunk_character(raw_text, chunk_size, chunk_overlap, separator, keep_separator, add_start_index, strip_whitespace)
285
- markdown_chunks, markdown_code = chunk_markdown(raw_text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace)
286
- python_chunks, python_code = chunk_python_code(raw_text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace)
287
- javascript_chunks, javascript_code = chunk_javascript_code(raw_text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace)
288
-
289
- return (
290
- display_content,
291
- raw_text,
292
- recursive_chunks,
293
- character_chunks,
294
- markdown_chunks,
295
- python_chunks,
296
- javascript_chunks,
297
- f"Number of chunks: {len(recursive_chunks)}",
298
- f"Number of chunks: {len(character_chunks)}",
299
- f"Number of chunks: {len(markdown_chunks)}",
300
- f"Number of chunks: {len(python_chunks)}",
301
- f"Number of chunks: {len(javascript_chunks)}",
302
- recursive_code,
303
- character_code,
304
- markdown_code,
305
- python_code,
306
- javascript_code
307
- )
308
-
309
- # --------------------------------------------- Gradio Interface ----------------------------------------------------------#
310
-
311
- with gr.Blocks(theme=gr.themes.Soft(), title="🦜️🔗 LangChain Text Chunker") as demo:
312
- gr.Markdown(
313
- """
314
- # 🦜️🔗 LangChain Text Chunker
315
- Welcome to the LangChain Text Chunker application! This tool allows you to upload various document types,
316
- extract their text content, and then apply different LangChain text splitting (chunking) methods.
317
- You can observe how each method breaks down the text into smaller, manageable chunks, along with their metadata.
318
-
319
- ### How to Use:
320
- 1. **Upload your document**: Select a file (PDF, DOCX, TXT, HTML, CSS, PY, IPYNB, CSV) using the file input.
321
- 2. **Adjust Chunking Parameters**: Use the sliders and dropdowns to customize `Chunk Size`, `Chunk Overlap`,
322
- `Character Splitter Separator`, `Keep Separator` behavior, `Add Start Index` to metadata, and `Strip Whitespace`.
323
- 3. **Process Document**: Click the "Process Document" button to see the extracted raw text and the results
324
- of various chunking methods in their respective tabs.
325
- 4. **Explore Chunks**: Each tab will display the chunks as JSON, along with the total number of chunks created.
326
- 5. **Python Example Code**: You can view dynamically generated Python 🐍 example code.
327
- 6. **Inference**: This Gradio app is inferred from [Mervin Praison's work](https://mer.vin/2024/03/chunking-strategy/) about "Advanced Chunking Strategies".
328
- """
329
- )
330
-
331
- with gr.Row():
332
- with gr.Column(scale=1):
333
- file_input = gr.File(label="Upload your document", file_types=[".pdf", ".docx", ".txt", ".html", ".css", ".py", ".ipynb", ".csv"])
334
- process_button = gr.Button("Process Document", variant="primary")
335
-
336
- with gr.Accordion("Chunking Parameters", open=False):
337
- chunk_size_input = gr.Slider(minimum=100, maximum=2000, value=250, step=50, label="Chunk Size", info="Maximum size of chunks to return.")
338
- chunk_overlap_input = gr.Slider(minimum=0, maximum=500, value=0, step=10, label="Chunk Overlap", info="Overlap in characters between chunks.")
339
- separator_input = gr.Dropdown(
340
- label="Character Splitter Separator",
341
- choices=["\\n\\n", "\\n", " ", "", "\n", "." ,",", ";", ":", "!", "?", "-",
342
- "—", "(", ")", "[", "]", "{", "}", '"', "'",
343
- "“", "”", "‘", "’", "..."], # Representing common separators
344
- value="\\n\\n",
345
- allow_custom_value=True,
346
- multiselect=True,
347
- info="Characters to split on for Character Chunking. Multiple selections will be joined."
348
- )
349
- keep_separator_input = gr.Dropdown(
350
- label="Keep Separator",
351
- choices=[True, False, "start", "end"],
352
- value=False,
353
- info="Whether to keep the separator and where to place it in each corresponding chunk (True='start')."
354
- )
355
- add_start_index_input = gr.Checkbox(label="Add Start Index to Metadata", value=True, info="If checked, includes chunk’s start index in metadata.")
356
- strip_whitespace_input = gr.Checkbox(label="Strip Whitespace", value=True, info="If checked, strips whitespace from the start and end of every document.")
357
-
358
- with gr.Column(scale=2):
359
- raw_text_display = gr.Textbox(label="Extracted Raw Text", lines=10, interactive=False, show_copy_button=True)
360
- hidden_raw_text = gr.State("") # To store the actual raw text for chunking
361
-
362
- with gr.Tabs():
363
- with gr.TabItem("Recursive Chunking"):
364
- recursive_count_output = gr.Markdown()
365
- recursive_output = gr.JSON(label="Recursive Chunks")
366
- recursive_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
367
- with gr.TabItem("Character Chunking"):
368
- character_count_output = gr.Markdown()
369
- character_output = gr.JSON(label="Character Chunks")
370
- character_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
371
- with gr.TabItem("Markdown Chunking"):
372
- markdown_count_output = gr.Markdown()
373
- markdown_output = gr.JSON(label="Markdown Chunks")
374
- markdown_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
375
- with gr.TabItem("Python Code Chunking"):
376
- python_count_output = gr.Markdown()
377
- python_output = gr.JSON(label="Python Code Chunks")
378
- python_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
379
- with gr.TabItem("JavaScript Code Chunking"):
380
- javascript_count_output = gr.Markdown()
381
- javascript_output = gr.JSON(label="JavaScript Code Chunks")
382
- javascript_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
383
-
384
- process_button.click(
385
- fn=main_interface,
386
- inputs=[
387
- file_input,
388
- chunk_size_input,
389
- chunk_overlap_input,
390
- separator_input,
391
- keep_separator_input,
392
- add_start_index_input,
393
- strip_whitespace_input
394
- ],
395
- outputs=[
396
- raw_text_display,
397
- hidden_raw_text,
398
- recursive_output,
399
- character_output,
400
- markdown_output,
401
- python_output,
402
- javascript_output,
403
- recursive_count_output,
404
- character_count_output,
405
- markdown_count_output,
406
- python_count_output,
407
- javascript_count_output,
408
- recursive_code_output,
409
- character_code_output,
410
- markdown_code_output,
411
- python_code_output,
412
- javascript_code_output
413
- ]
414
- )
415
-
416
- demo.launch()
 
1
+ # --------------------------------------------- Libraries ----------------------------------------------------------#
2
+ import gradio as gr
3
+ from PyPDF2 import PdfReader
4
+ import nbformat
5
+
6
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter, PythonCodeTextSplitter, Language
7
+ from langchain.docstore.document import Document
8
+ from langchain_community.document_loaders import Docx2txtLoader, CSVLoader
9
+
10
+ # --------------------------------------------- Functions ----------------------------------------------------------#
11
+
12
+ def process_uploaded_file(uploaded_file):
13
+ text = ""
14
+ display_content = ""
15
+ file_extension = uploaded_file.name.split(".")[-1]
16
+
17
+ if file_extension == "pdf":
18
+ try:
19
+ # Gradio's uploaded_file.name provides the path to the temporary file
20
+ pdf = PdfReader(uploaded_file.name)
21
+ for page in pdf.pages:
22
+ page_text = page.extract_text()
23
+ text += page_text + "\n"
24
+ display_content += page_text + "\n"
25
+ except Exception as e:
26
+ display_content = f"Error reading PDF file: {e}"
27
+ text = ""
28
+
29
+ elif file_extension == "docx":
30
+ try:
31
+ docx_loader = Docx2txtLoader(uploaded_file.name)
32
+ documents = docx_loader.load()
33
+ text = "\n".join([doc.page_content for doc in documents])
34
+ display_content = text
35
+ except Exception as e:
36
+ display_content = f"Error reading DOCX file: {e}"
37
+ text = ""
38
+
39
+ elif file_extension in ["html", "css", "py", "txt"]:
40
+ try:
41
+ with open(uploaded_file.name, "r", encoding="utf-8") as f:
42
+ file_content = f.read()
43
+ display_content = file_content # Display as plain text in Textbox
44
+ text = file_content
45
+ except Exception as e:
46
+ display_content = f"Error reading {file_extension.upper()} file: {e}"
47
+ text = ""
48
+
49
+ elif file_extension == "ipynb":
50
+ try:
51
+ # nbformat.read can take a file path
52
+ nb_content = nbformat.read(uploaded_file.name, as_version=4)
53
+ nb_filtered = [cell for cell in nb_content["cells"] if cell["cell_type"] in ["code", "markdown"]]
54
+
55
+ for cell in nb_filtered:
56
+ if cell["cell_type"] == "code":
57
+ display_content += f"```python\n{cell['source']}\n```\n"
58
+ text += cell["source"] + "\n"
59
+ elif cell["cell_type"] == "markdown":
60
+ display_content += f"{cell['source']}\n"
61
+ text += cell["source"] + "\n"
62
+ except Exception as e:
63
+ display_content = f"Error reading IPYNB file: {e}"
64
+ text = ""
65
+
66
+ elif file_extension == "csv":
67
+ try:
68
+ loader = CSVLoader(file_path=uploaded_file.name, encoding="utf-8", csv_args={'delimiter': ','})
69
+ documents = loader.load()
70
+ text = "\n".join([doc.page_content for doc in documents])
71
+ display_content = text # For CSV, display the concatenated text
72
+ except Exception as e:
73
+ display_content = f"Error reading CSV file: {e}"
74
+ text = ""
75
+ else:
76
+ display_content = "Unsupported file type."
77
+ text = ""
78
+
79
+ return text, display_content
80
+
81
+
82
+ def chunk_recursive(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
83
+ if not text:
84
+ return [], ""
85
+ text_splitter = RecursiveCharacterTextSplitter(
86
+ chunk_size=chunk_size,
87
+ chunk_overlap=chunk_overlap,
88
+ length_function=len,
89
+ keep_separator=keep_separator,
90
+ add_start_index=add_start_index,
91
+ strip_whitespace=strip_whitespace,
92
+ )
93
+ chunks = text_splitter.create_documents([text])
94
+ formatted_chunks = []
95
+ for chunk in chunks:
96
+ if isinstance(chunk, Document):
97
+ formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
98
+ else:
99
+ formatted_chunks.append({"content": str(chunk), "metadata": {}})
100
+
101
+ code_example = f"""
102
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
103
+
104
+ text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
105
+
106
+ text_splitter = RecursiveCharacterTextSplitter(
107
+ chunk_size={chunk_size},
108
+ chunk_overlap={chunk_overlap},
109
+ length_function=len,
110
+ keep_separator={keep_separator},
111
+ add_start_index={add_start_index},
112
+ strip_whitespace={strip_whitespace},
113
+ )
114
+ chunks = text_splitter.create_documents([text_content])
115
+ # Access chunks: chunks[0].page_content, chunks[0].metadata
116
+ """
117
+ return formatted_chunks, code_example
118
+
119
+ def chunk_character(text, chunk_size, chunk_overlap, separator, keep_separator, add_start_index, strip_whitespace):
120
+ if not text:
121
+ return [], ""
122
+
123
+ if isinstance(separator, list):
124
+ separator_str = "".join(separator)
125
+ else:
126
+ separator_str = separator
127
+
128
+ text_splitter = CharacterTextSplitter(
129
+ separator=separator_str,
130
+ chunk_size=chunk_size,
131
+ chunk_overlap=chunk_overlap,
132
+ length_function=len,
133
+ keep_separator=keep_separator,
134
+ add_start_index=add_start_index,
135
+ strip_whitespace=strip_whitespace,
136
+ )
137
+ chunks = text_splitter.create_documents([text])
138
+ formatted_chunks = []
139
+ for chunk in chunks:
140
+ if isinstance(chunk, Document):
141
+ formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
142
+ else:
143
+ formatted_chunks.append({"content": str(chunk), "metadata": {}})
144
+
145
+ code_example = f"""
146
+ from langchain.text_splitter import CharacterTextSplitter
147
+
148
+ text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
149
+
150
+ text_splitter = CharacterTextSplitter(
151
+ separator=\"\"\"{separator_str}\"\"\",
152
+ chunk_size={chunk_size},
153
+ chunk_overlap={chunk_overlap},
154
+ length_function=len,
155
+ keep_separator={keep_separator},
156
+ add_start_index={add_start_index},
157
+ strip_whitespace={strip_whitespace},
158
+ )
159
+ chunks = text_splitter.create_documents([text_content])
160
+ # Access chunks: chunks[0].page_content, chunks[0].metadata
161
+ """
162
+ return formatted_chunks, code_example
163
+
164
+ def chunk_python_code(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
165
+ if not text:
166
+ return [], ""
167
+ text_splitter = PythonCodeTextSplitter(
168
+ chunk_size=chunk_size,
169
+ chunk_overlap=chunk_overlap,
170
+ keep_separator=keep_separator,
171
+ add_start_index=add_start_index,
172
+ strip_whitespace=strip_whitespace,
173
+ )
174
+ chunks = text_splitter.create_documents([text])
175
+ formatted_chunks = []
176
+ for chunk in chunks:
177
+ if isinstance(chunk, Document):
178
+ formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
179
+ else:
180
+ formatted_chunks.append({"content": str(chunk), "metadata": {}})
181
+
182
+ code_example = f"""
183
+ from langchain.text_splitter import PythonCodeTextSplitter
184
+
185
+ text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
186
+
187
+ text_splitter = PythonCodeTextSplitter(
188
+ chunk_size={chunk_size},
189
+ chunk_overlap={chunk_overlap},
190
+ keep_separator={keep_separator},
191
+ add_start_index={add_start_index},
192
+ strip_whitespace={strip_whitespace},
193
+ )
194
+ chunks = text_splitter.create_documents([text_content])
195
+ # Access chunks: chunks[0].page_content, chunks[0].metadata
196
+ """
197
+ return formatted_chunks, code_example
198
+
199
+ def chunk_javascript_code(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
200
+ if not text:
201
+ return [], ""
202
+ text_splitter = RecursiveCharacterTextSplitter.from_language(
203
+ language=Language.JS,
204
+ chunk_size=chunk_size,
205
+ chunk_overlap=chunk_overlap,
206
+ keep_separator=keep_separator,
207
+ add_start_index=add_start_index,
208
+ strip_whitespace=strip_whitespace,
209
+ )
210
+ chunks = text_splitter.create_documents([text])
211
+ formatted_chunks = []
212
+ for chunk in chunks:
213
+ if isinstance(chunk, Document):
214
+ formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
215
+ else:
216
+ formatted_chunks.append({"content": str(chunk), "metadata": {}})
217
+
218
+ code_example = f"""
219
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
220
+
221
+ text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
222
+
223
+ text_splitter = RecursiveCharacterTextSplitter.from_language(
224
+ language=Language.JS,
225
+ chunk_size={chunk_size},
226
+ chunk_overlap={chunk_overlap},
227
+ keep_separator={keep_separator},
228
+ add_start_index={add_start_index},
229
+ strip_whitespace={strip_whitespace},
230
+ )
231
+ chunks = text_splitter.create_documents([text_content])
232
+ # Access chunks: chunks[0].page_content, chunks[0].metadata
233
+ """
234
+ return formatted_chunks, code_example
235
+
236
+ def chunk_markdown(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
237
+ if not text:
238
+ return [], ""
239
+ text_splitter = MarkdownTextSplitter(
240
+ chunk_size=chunk_size,
241
+ chunk_overlap=chunk_overlap,
242
+ length_function=len,
243
+ keep_separator=keep_separator,
244
+ add_start_index=add_start_index,
245
+ strip_whitespace=strip_whitespace,
246
+ )
247
+ chunks = text_splitter.create_documents([text])
248
+ formatted_chunks = []
249
+ for chunk in chunks:
250
+ if isinstance(chunk, Document):
251
+ formatted_chunks.append({"content": chunk.page_content, "metadata": chunk.metadata})
252
+ else:
253
+ formatted_chunks.append({"content": str(chunk), "metadata": {}})
254
+
255
+ code_example = f"""
256
+ from langchain.text_splitter import MarkdownTextSplitter
257
+
258
+ text_content = \"\"\"{text[:50]}...\"\"\" # Truncated for example
259
+
260
+ text_splitter = MarkdownTextSplitter(
261
+ chunk_size={chunk_size},
262
+ chunk_overlap={chunk_overlap},
263
+ length_function=len,
264
+ keep_separator={keep_separator},
265
+ add_start_index={add_start_index},
266
+ strip_whitespace={strip_whitespace},
267
+ )
268
+ chunks = text_splitter.create_documents([text_content])
269
+ # Access chunks: chunks[0].page_content, chunks[0].metadata
270
+ """
271
+ return formatted_chunks, code_example
272
+
273
+ def main_interface(uploaded_file, chunk_size, chunk_overlap, separator, keep_separator, add_start_index, strip_whitespace):
274
+ if uploaded_file is None:
275
+ return "", "", [], [], [], [], [], "", "", "", "", "", "", "", "", "", "", ""
276
+
277
+ # Ensure chunk_size and chunk_overlap are integers
278
+ chunk_size = int(chunk_size)
279
+ chunk_overlap = int(chunk_overlap)
280
+
281
+ raw_text, display_content = process_uploaded_file(uploaded_file)
282
+
283
+ recursive_chunks, recursive_code = chunk_recursive(raw_text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace)
284
+ character_chunks, character_code = chunk_character(raw_text, chunk_size, chunk_overlap, separator, keep_separator, add_start_index, strip_whitespace)
285
+ markdown_chunks, markdown_code = chunk_markdown(raw_text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace)
286
+ python_chunks, python_code = chunk_python_code(raw_text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace)
287
+ javascript_chunks, javascript_code = chunk_javascript_code(raw_text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace)
288
+
289
+ return (
290
+ display_content,
291
+ raw_text,
292
+ recursive_chunks,
293
+ character_chunks,
294
+ markdown_chunks,
295
+ python_chunks,
296
+ javascript_chunks,
297
+ f"Number of chunks: {len(recursive_chunks)}",
298
+ f"Number of chunks: {len(character_chunks)}",
299
+ f"Number of chunks: {len(markdown_chunks)}",
300
+ f"Number of chunks: {len(python_chunks)}",
301
+ f"Number of chunks: {len(javascript_chunks)}",
302
+ recursive_code,
303
+ character_code,
304
+ markdown_code,
305
+ python_code,
306
+ javascript_code
307
+ )
308
+
309
+ # --------------------------------------------- Gradio Interface ----------------------------------------------------------#
310
+
311
+ with gr.Blocks(theme=gr.themes.Soft(), title="🦜️🔗 LangChain Text Chunker") as demo:
312
+ gr.Markdown(
313
+ """
314
+ # 🦜️🔗 LangChain Text Chunker
315
+ Welcome to the LangChain Text Chunker application! This tool allows you to upload various document types,
316
+ extract their text content, and then apply different LangChain text splitting (chunking) methods.
317
+ You can observe how each method breaks down the text into smaller, manageable chunks, along with their metadata.
318
+
319
+ ### How to Use:
320
+ 1. **Upload your document**: Select a file (PDF, DOCX, TXT, HTML, CSS, PY, IPYNB, CSV) using the file input.
321
+ 2. **Adjust Chunking Parameters**: Use the sliders and dropdowns to customize `Chunk Size`, `Chunk Overlap`,
322
+ `Character Splitter Separator`, `Keep Separator` behavior, `Add Start Index` to metadata, and `Strip Whitespace`.
323
+ 3. **Process Document**: Click the "Process Document" button to see the extracted raw text and the results
324
+ of various chunking methods in their respective tabs.
325
+ 4. **Explore Chunks**: Each tab will display the chunks as JSON, along with the total number of chunks created.
326
+ 5. **Python Example Code**: You can view dynamically generated Python 🐍 example code.
327
+ 6. **Inference**: This Gradio app is inferred from [Mervin Praison's work](https://mer.vin/2024/03/chunking-strategy/) about "Advanced Chunking Strategies".
328
+ """
329
+ )
330
+
331
+ with gr.Row():
332
+ with gr.Column(scale=1):
333
+ file_input = gr.File(label="Upload your document", file_types=[".pdf", ".docx", ".txt", ".html", ".css", ".py", ".ipynb", ".csv"])
334
+ process_button = gr.Button("Process Document", variant="primary")
335
+
336
+ with gr.Accordion("Chunking Parameters", open=False):
337
+ chunk_size_input = gr.Slider(minimum=100, maximum=2000, value=250, step=50, label="Chunk Size", info="Maximum size of chunks to return.")
338
+ chunk_overlap_input = gr.Slider(minimum=0, maximum=500, value=0, step=10, label="Chunk Overlap", info="Overlap in characters between chunks.")
339
+ separator_input = gr.Dropdown(
340
+ label="Character Splitter Separator",
341
+ choices=["\\n\\n", "\\n", " ", "", "\n", "." ,",", ";", ":", "!", "?", "-",
342
+ "—", "(", ")", "[", "]", "{", "}", '"', "'",
343
+ "“", "”", "‘", "’", "..."], # Representing common separators
344
+ value="\\n\\n",
345
+ allow_custom_value=True,
346
+ multiselect=True,
347
+ info="Characters to split on for Character Chunking. Multiple selections will be joined."
348
+ )
349
+ keep_separator_input = gr.Dropdown(
350
+ label="Keep Separator",
351
+ choices=[True, False, "start", "end"],
352
+ value=False,
353
+ info="Whether to keep the separator and where to place it in each corresponding chunk (True='start')."
354
+ )
355
+ add_start_index_input = gr.Checkbox(label="Add Start Index to Metadata", value=True, info="If checked, includes chunk’s start index in metadata.")
356
+ strip_whitespace_input = gr.Checkbox(label="Strip Whitespace", value=True, info="If checked, strips whitespace from the start and end of every document.")
357
+
358
+ with gr.Column(scale=2):
359
+ raw_text_display = gr.Textbox(label="Extracted Raw Text", lines=10, interactive=False, show_copy_button=True)
360
+ hidden_raw_text = gr.State("") # To store the actual raw text for chunking
361
+
362
+ with gr.Tabs():
363
+ with gr.TabItem("Recursive Chunking"):
364
+ recursive_count_output = gr.Markdown()
365
+ recursive_output = gr.JSON(label="Recursive Chunks")
366
+ recursive_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
367
+ with gr.TabItem("Character Chunking"):
368
+ character_count_output = gr.Markdown()
369
+ character_output = gr.JSON(label="Character Chunks")
370
+ character_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
371
+ with gr.TabItem("Markdown Chunking"):
372
+ markdown_count_output = gr.Markdown()
373
+ markdown_output = gr.JSON(label="Markdown Chunks")
374
+ markdown_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
375
+ with gr.TabItem("Python Code Chunking"):
376
+ python_count_output = gr.Markdown()
377
+ python_output = gr.JSON(label="Python Code Chunks")
378
+ python_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
379
+ with gr.TabItem("JavaScript Code Chunking"):
380
+ javascript_count_output = gr.Markdown()
381
+ javascript_output = gr.JSON(label="JavaScript Code Chunks")
382
+ javascript_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
383
+
384
+ process_button.click(
385
+ fn=main_interface,
386
+ inputs=[
387
+ file_input,
388
+ chunk_size_input,
389
+ chunk_overlap_input,
390
+ separator_input,
391
+ keep_separator_input,
392
+ add_start_index_input,
393
+ strip_whitespace_input
394
+ ],
395
+ outputs=[
396
+ raw_text_display,
397
+ hidden_raw_text,
398
+ recursive_output,
399
+ character_output,
400
+ markdown_output,
401
+ python_output,
402
+ javascript_output,
403
+ recursive_count_output,
404
+ character_count_output,
405
+ markdown_count_output,
406
+ python_count_output,
407
+ javascript_count_output,
408
+ recursive_code_output,
409
+ character_code_output,
410
+ markdown_code_output,
411
+ python_code_output,
412
+ javascript_code_output
413
+ ]
414
+ )
415
+
416
+ demo.queue().launch(share=False, inbrowser=True)