Spaces:
Sleeping
Sleeping
# --------------------------------------------- Libraries ----------------------------------------------------------# | |
import gradio as gr | |
from PyPDF2 import PdfReader | |
import nbformat | |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter, PythonCodeTextSplitter, Language | |
from langchain.docstore.document import Document | |
from langchain_community.document_loaders import Docx2txtLoader, CSVLoader | |
# --------------------------------------------- Functions ----------------------------------------------------------# | |
def process_uploaded_file(uploaded_file):
    """Extract the text content of an uploaded document.

    Args:
        uploaded_file: Object whose ``name`` attribute is the path of the
            uploaded file on disk.

    Returns:
        A ``(text, display_content)`` tuple. ``text`` is the extracted text
        and ``display_content`` is the text shown to the user (for notebooks,
        code cells are wrapped in fenced code blocks). If the file type is
        unsupported or reading fails, ``text`` is ``""`` and
        ``display_content`` carries the message.
    """
    path = uploaded_file.name
    # Match extensions case-insensitively so "REPORT.PDF" is treated like
    # "report.pdf" instead of being rejected as unsupported.
    file_extension = path.rsplit(".", 1)[-1].lower()

    plain_text_types = ("html", "css", "py", "txt")
    supported_types = ("pdf", "docx", "ipynb", "csv") + plain_text_types
    if file_extension not in supported_types:
        return "", "Unsupported file type."

    try:
        if file_extension == "pdf":
            # A page without extractable text contributes an empty line
            # rather than aborting the whole document.
            pages = [page.extract_text() or "" for page in PdfReader(path).pages]
            text = "".join(f"{page_text}\n" for page_text in pages)
            display_content = text
        elif file_extension == "docx":
            documents = Docx2txtLoader(path).load()
            text = "\n".join(doc.page_content for doc in documents)
            display_content = text
        elif file_extension == "csv":
            loader = CSVLoader(file_path=path, encoding="utf-8", csv_args={"delimiter": ","})
            text = "\n".join(doc.page_content for doc in loader.load())
            display_content = text
        elif file_extension == "ipynb":
            notebook = nbformat.read(path, as_version=4)
            text_parts = []
            display_parts = []
            for cell in notebook["cells"]:
                source = cell["source"]
                if cell["cell_type"] == "code":
                    display_parts.append(f"```python\n{source}\n```\n")
                elif cell["cell_type"] == "markdown":
                    display_parts.append(f"{source}\n")
                else:
                    # Only code and markdown cells are extracted.
                    continue
                text_parts.append(f"{source}\n")
            text = "".join(text_parts)
            display_content = "".join(display_parts)
        else:
            # Plain-text formats are read verbatim.
            with open(path, "r", encoding="utf-8") as f:
                text = f.read()
            display_content = text
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return "", f"Error reading {file_extension.upper()} file: {e}"

    return text, display_content
def chunk_recursive(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
    """Split ``text`` with ``RecursiveCharacterTextSplitter``.

    Args:
        text: Text to split. An empty value short-circuits to ``([], "")``.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.
        keep_separator: Passed to the splitter as ``keep_separator``.
        add_start_index: Passed to the splitter as ``add_start_index``.
        strip_whitespace: Passed to the splitter as ``strip_whitespace``.

    Returns:
        A ``(formatted_chunks, code_example)`` tuple: a list of
        ``{"content": ..., "metadata": ...}`` dicts, and a Python snippet
        (as a string) that reproduces the call with the same parameters.
    """
    if not text:
        return [], ""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        keep_separator=keep_separator,
        add_start_index=add_start_index,
        strip_whitespace=strip_whitespace,
    )
    formatted_chunks = [
        {"content": chunk.page_content, "metadata": chunk.metadata}
        if isinstance(chunk, Document)
        else {"content": str(chunk), "metadata": {}}
        for chunk in text_splitter.create_documents([text])
    ]

    # Only mark the preview as truncated when something was actually cut off.
    preview = text[:50] + ("..." if len(text) > 50 else "")
    # repr() keeps the snippet valid Python: string options such as "start"
    # stay quoted, and quotes/backslashes/newlines in the preview are escaped.
    code_example = (
        "\n"
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n"
        f"text_content = {preview!r}  # Truncated for example\n"
        "text_splitter = RecursiveCharacterTextSplitter(\n"
        f"    chunk_size={chunk_size!r},\n"
        f"    chunk_overlap={chunk_overlap!r},\n"
        "    length_function=len,\n"
        f"    keep_separator={keep_separator!r},\n"
        f"    add_start_index={add_start_index!r},\n"
        f"    strip_whitespace={strip_whitespace!r},\n"
        ")\n"
        "chunks = text_splitter.create_documents([text_content])\n"
        "# Access chunks: chunks[0].page_content, chunks[0].metadata\n"
    )
    return formatted_chunks, code_example
def chunk_character(text, chunk_size, chunk_overlap, separator, keep_separator, add_start_index, strip_whitespace):
    """Split ``text`` with ``CharacterTextSplitter`` on a single separator.

    Args:
        text: Text to split. An empty value short-circuits to ``([], "")``.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.
        separator: Separator string, or a list of strings that are
            concatenated into one separator. Escaped spellings of newline,
            tab and carriage return (a backslash followed by ``n``, ``t`` or
            ``r``) are converted to the real control characters.
        keep_separator: Passed to the splitter as ``keep_separator``.
        add_start_index: Passed to the splitter as ``add_start_index``.
        strip_whitespace: Passed to the splitter as ``strip_whitespace``.

    Returns:
        A ``(formatted_chunks, code_example)`` tuple: a list of
        ``{"content": ..., "metadata": ...}`` dicts, and a Python snippet
        (as a string) that reproduces the call with the same parameters.
    """
    if not text:
        return [], ""

    separator_str = "".join(separator) if isinstance(separator, list) else separator
    # The separator dropdown offers escaped spellings such as "\\n\\n"
    # (backslash + "n"). Passed through verbatim they never match a real
    # line break, so decode them before handing the separator to the splitter.
    for escaped, actual in (("\\n", "\n"), ("\\t", "\t"), ("\\r", "\r")):
        separator_str = separator_str.replace(escaped, actual)

    text_splitter = CharacterTextSplitter(
        separator=separator_str,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        keep_separator=keep_separator,
        add_start_index=add_start_index,
        strip_whitespace=strip_whitespace,
    )
    formatted_chunks = [
        {"content": chunk.page_content, "metadata": chunk.metadata}
        if isinstance(chunk, Document)
        else {"content": str(chunk), "metadata": {}}
        for chunk in text_splitter.create_documents([text])
    ]

    # Only mark the preview as truncated when something was actually cut off.
    preview = text[:50] + ("..." if len(text) > 50 else "")
    # repr() keeps the snippet valid Python: string options such as "start"
    # stay quoted, and control characters in the separator are shown escaped.
    code_example = (
        "\n"
        "from langchain.text_splitter import CharacterTextSplitter\n"
        f"text_content = {preview!r}  # Truncated for example\n"
        "text_splitter = CharacterTextSplitter(\n"
        f"    separator={separator_str!r},\n"
        f"    chunk_size={chunk_size!r},\n"
        f"    chunk_overlap={chunk_overlap!r},\n"
        "    length_function=len,\n"
        f"    keep_separator={keep_separator!r},\n"
        f"    add_start_index={add_start_index!r},\n"
        f"    strip_whitespace={strip_whitespace!r},\n"
        ")\n"
        "chunks = text_splitter.create_documents([text_content])\n"
        "# Access chunks: chunks[0].page_content, chunks[0].metadata\n"
    )
    return formatted_chunks, code_example
def chunk_python_code(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
    """Split ``text`` with ``PythonCodeTextSplitter``.

    Args:
        text: Text to split. An empty value short-circuits to ``([], "")``.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.
        keep_separator: Passed to the splitter as ``keep_separator``.
        add_start_index: Passed to the splitter as ``add_start_index``.
        strip_whitespace: Passed to the splitter as ``strip_whitespace``.

    Returns:
        A ``(formatted_chunks, code_example)`` tuple: a list of
        ``{"content": ..., "metadata": ...}`` dicts, and a Python snippet
        (as a string) that reproduces the call with the same parameters.
    """
    if not text:
        return [], ""

    text_splitter = PythonCodeTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        keep_separator=keep_separator,
        add_start_index=add_start_index,
        strip_whitespace=strip_whitespace,
    )
    formatted_chunks = [
        {"content": chunk.page_content, "metadata": chunk.metadata}
        if isinstance(chunk, Document)
        else {"content": str(chunk), "metadata": {}}
        for chunk in text_splitter.create_documents([text])
    ]

    # Only mark the preview as truncated when something was actually cut off.
    preview = text[:50] + ("..." if len(text) > 50 else "")
    # repr() keeps the snippet valid Python: string options such as "start"
    # stay quoted, and quotes/backslashes/newlines in the preview are escaped.
    code_example = (
        "\n"
        "from langchain.text_splitter import PythonCodeTextSplitter\n"
        f"text_content = {preview!r}  # Truncated for example\n"
        "text_splitter = PythonCodeTextSplitter(\n"
        f"    chunk_size={chunk_size!r},\n"
        f"    chunk_overlap={chunk_overlap!r},\n"
        f"    keep_separator={keep_separator!r},\n"
        f"    add_start_index={add_start_index!r},\n"
        f"    strip_whitespace={strip_whitespace!r},\n"
        ")\n"
        "chunks = text_splitter.create_documents([text_content])\n"
        "# Access chunks: chunks[0].page_content, chunks[0].metadata\n"
    )
    return formatted_chunks, code_example
def chunk_javascript_code(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
    """Split ``text`` with ``RecursiveCharacterTextSplitter`` configured for ``Language.JS``.

    Args:
        text: Text to split. An empty value short-circuits to ``([], "")``.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.
        keep_separator: Passed to the splitter as ``keep_separator``.
        add_start_index: Passed to the splitter as ``add_start_index``.
        strip_whitespace: Passed to the splitter as ``strip_whitespace``.

    Returns:
        A ``(formatted_chunks, code_example)`` tuple: a list of
        ``{"content": ..., "metadata": ...}`` dicts, and a Python snippet
        (as a string) that reproduces the call with the same parameters.
    """
    if not text:
        return [], ""

    text_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.JS,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        keep_separator=keep_separator,
        add_start_index=add_start_index,
        strip_whitespace=strip_whitespace,
    )
    formatted_chunks = [
        {"content": chunk.page_content, "metadata": chunk.metadata}
        if isinstance(chunk, Document)
        else {"content": str(chunk), "metadata": {}}
        for chunk in text_splitter.create_documents([text])
    ]

    # Only mark the preview as truncated when something was actually cut off.
    preview = text[:50] + ("..." if len(text) > 50 else "")
    # repr() keeps the snippet valid Python: string options such as "start"
    # stay quoted, and quotes/backslashes/newlines in the preview are escaped.
    code_example = (
        "\n"
        "from langchain.text_splitter import RecursiveCharacterTextSplitter, Language\n"
        f"text_content = {preview!r}  # Truncated for example\n"
        "text_splitter = RecursiveCharacterTextSplitter.from_language(\n"
        "    language=Language.JS,\n"
        f"    chunk_size={chunk_size!r},\n"
        f"    chunk_overlap={chunk_overlap!r},\n"
        f"    keep_separator={keep_separator!r},\n"
        f"    add_start_index={add_start_index!r},\n"
        f"    strip_whitespace={strip_whitespace!r},\n"
        ")\n"
        "chunks = text_splitter.create_documents([text_content])\n"
        "# Access chunks: chunks[0].page_content, chunks[0].metadata\n"
    )
    return formatted_chunks, code_example
def chunk_markdown(text, chunk_size, chunk_overlap, keep_separator, add_start_index, strip_whitespace):
    """Split ``text`` with ``MarkdownTextSplitter``.

    Args:
        text: Text to split. An empty value short-circuits to ``([], "")``.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.
        keep_separator: Passed to the splitter as ``keep_separator``.
        add_start_index: Passed to the splitter as ``add_start_index``.
        strip_whitespace: Passed to the splitter as ``strip_whitespace``.

    Returns:
        A ``(formatted_chunks, code_example)`` tuple: a list of
        ``{"content": ..., "metadata": ...}`` dicts, and a Python snippet
        (as a string) that reproduces the call with the same parameters.
    """
    if not text:
        return [], ""

    text_splitter = MarkdownTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        keep_separator=keep_separator,
        add_start_index=add_start_index,
        strip_whitespace=strip_whitespace,
    )
    formatted_chunks = [
        {"content": chunk.page_content, "metadata": chunk.metadata}
        if isinstance(chunk, Document)
        else {"content": str(chunk), "metadata": {}}
        for chunk in text_splitter.create_documents([text])
    ]

    # Only mark the preview as truncated when something was actually cut off.
    preview = text[:50] + ("..." if len(text) > 50 else "")
    # repr() keeps the snippet valid Python: string options such as "start"
    # stay quoted, and quotes/backslashes/newlines in the preview are escaped.
    code_example = (
        "\n"
        "from langchain.text_splitter import MarkdownTextSplitter\n"
        f"text_content = {preview!r}  # Truncated for example\n"
        "text_splitter = MarkdownTextSplitter(\n"
        f"    chunk_size={chunk_size!r},\n"
        f"    chunk_overlap={chunk_overlap!r},\n"
        "    length_function=len,\n"
        f"    keep_separator={keep_separator!r},\n"
        f"    add_start_index={add_start_index!r},\n"
        f"    strip_whitespace={strip_whitespace!r},\n"
        ")\n"
        "chunks = text_splitter.create_documents([text_content])\n"
        "# Access chunks: chunks[0].page_content, chunks[0].metadata\n"
    )
    return formatted_chunks, code_example
def main_interface(uploaded_file, chunk_size, chunk_overlap, separator, keep_separator, add_start_index, strip_whitespace):
    """Extract text from the uploaded file and run every chunking strategy on it.

    Args:
        uploaded_file: Uploaded file object, or ``None`` when nothing was uploaded.
        chunk_size: Chunk size; converted to ``int`` before use.
        chunk_overlap: Chunk overlap; converted to ``int`` before use.
        separator: Separator(s) forwarded to ``chunk_character``.
        keep_separator: Forwarded to every chunking function.
        add_start_index: Forwarded to every chunking function.
        strip_whitespace: Forwarded to every chunking function.

    Returns:
        A 17-item tuple, in this order: display text, raw text, five chunk
        lists (recursive, character, markdown, python, javascript), five
        "Number of chunks" labels, and five code-example strings.

    Raises:
        gr.Error: If text was extracted and ``chunk_overlap`` exceeds
            ``chunk_size``.
    """
    if uploaded_file is None:
        # Must yield exactly as many values as the normal path below
        # (2 texts + 5 chunk lists + 5 counts + 5 code examples = 17).
        return (
            "", "",
            [], [], [], [], [],
            "", "", "", "", "",
            "", "", "", "", "",
        )

    # Ensure chunk_size and chunk_overlap are integers
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)

    raw_text, display_content = process_uploaded_file(uploaded_file)

    # The splitters reject an overlap larger than the chunk size; surface that
    # as a readable UI error. Skipped when there is no text, because then no
    # splitter is constructed and the extraction message should still be shown.
    if raw_text and chunk_overlap > chunk_size:
        raise gr.Error(
            f"Chunk Overlap ({chunk_overlap}) must not be larger than Chunk Size ({chunk_size})."
        )

    common_args = (keep_separator, add_start_index, strip_whitespace)
    recursive_chunks, recursive_code = chunk_recursive(raw_text, chunk_size, chunk_overlap, *common_args)
    character_chunks, character_code = chunk_character(raw_text, chunk_size, chunk_overlap, separator, *common_args)
    markdown_chunks, markdown_code = chunk_markdown(raw_text, chunk_size, chunk_overlap, *common_args)
    python_chunks, python_code = chunk_python_code(raw_text, chunk_size, chunk_overlap, *common_args)
    javascript_chunks, javascript_code = chunk_javascript_code(raw_text, chunk_size, chunk_overlap, *common_args)

    return (
        display_content,
        raw_text,
        recursive_chunks,
        character_chunks,
        markdown_chunks,
        python_chunks,
        javascript_chunks,
        f"Number of chunks: {len(recursive_chunks)}",
        f"Number of chunks: {len(character_chunks)}",
        f"Number of chunks: {len(markdown_chunks)}",
        f"Number of chunks: {len(python_chunks)}",
        f"Number of chunks: {len(javascript_chunks)}",
        recursive_code,
        character_code,
        markdown_code,
        python_code,
        javascript_code,
    )
# --------------------------------------------- Gradio Interface ----------------------------------------------------------# | |
# Page layout: upload + parameters on the left, extracted text and one result
# tab per chunking strategy on the right.
with gr.Blocks(theme=gr.themes.Soft(), title="🦜️🔗 LangChain Text Chunker") as demo:
    gr.Markdown(
        """
# 🦜️🔗 LangChain Text Chunker
Welcome to the LangChain Text Chunker application! This tool allows you to upload various document types,
extract their text content, and then apply different LangChain text splitting (chunking) methods.
You can observe how each method breaks down the text into smaller, manageable chunks, along with their metadata.
### How to Use:
1. **Upload your document**: Select a file (PDF, DOCX, TXT, HTML, CSS, PY, IPYNB, CSV) using the file input.
2. **Adjust Chunking Parameters**: Use the sliders and dropdowns to customize `Chunk Size`, `Chunk Overlap`,
`Character Splitter Separator`, `Keep Separator` behavior, `Add Start Index` to metadata, and `Strip Whitespace`.
3. **Process Document**: Click the "Process Document" button to see the extracted raw text and the results
of various chunking methods in their respective tabs.
4. **Explore Chunks**: Each tab will display the chunks as JSON, along with the total number of chunks created.
5. **Python Example Code**: You can view dynamically generated Python 🐍 example code.
6. **Inference**: This Gradio app is inferred from [Mervin Praison's work](https://mer.vin/2024/03/chunking-strategy/) about "Advanced Chunking Strategies".
"""
    )
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload your document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".css", ".py", ".ipynb", ".csv"],
            )
            process_button = gr.Button("Process Document", variant="primary")
            with gr.Accordion("Chunking Parameters", open=False):
                chunk_size_input = gr.Slider(minimum=100, maximum=2000, value=250, step=50, label="Chunk Size", info="Maximum size of chunks to return.")
                chunk_overlap_input = gr.Slider(minimum=0, maximum=500, value=0, step=10, label="Chunk Overlap", info="Overlap in characters between chunks.")
                separator_input = gr.Dropdown(
                    label="Character Splitter Separator",
                    choices=[
                        "\\n\\n", "\\n", " ", "", "\n", ".", ",", ";", ":", "!", "?", "-",
                        "—", "(", ")", "[", "]", "{", "}", '"', "'",
                        "“", "”", "‘", "’", "...",
                    ],  # Representing common separators
                    value="\\n\\n",
                    allow_custom_value=True,
                    multiselect=True,
                    info="Characters to split on for Character Chunking. Multiple selections will be joined.",
                )
                keep_separator_input = gr.Dropdown(
                    label="Keep Separator",
                    choices=[True, False, "start", "end"],
                    value=False,
                    info="Whether to keep the separator and where to place it in each corresponding chunk (True='start').",
                )
                add_start_index_input = gr.Checkbox(label="Add Start Index to Metadata", value=True, info="If checked, includes chunk's start index in metadata.")
                strip_whitespace_input = gr.Checkbox(label="Strip Whitespace", value=True, info="If checked, strips whitespace from the start and end of every document.")
        with gr.Column(scale=2):
            raw_text_display = gr.Textbox(label="Extracted Raw Text", lines=10, interactive=False, show_copy_button=True)
            hidden_raw_text = gr.State("")  # To store the actual raw text for chunking
            with gr.Tabs():
                with gr.TabItem("Recursive Chunking"):
                    recursive_count_output = gr.Markdown()
                    recursive_output = gr.JSON(label="Recursive Chunks")
                    recursive_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
                with gr.TabItem("Character Chunking"):
                    character_count_output = gr.Markdown()
                    character_output = gr.JSON(label="Character Chunks")
                    character_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
                with gr.TabItem("Markdown Chunking"):
                    markdown_count_output = gr.Markdown()
                    markdown_output = gr.JSON(label="Markdown Chunks")
                    markdown_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
                with gr.TabItem("Python Code Chunking"):
                    python_count_output = gr.Markdown()
                    python_output = gr.JSON(label="Python Code Chunks")
                    python_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)
                with gr.TabItem("JavaScript Code Chunking"):
                    javascript_count_output = gr.Markdown()
                    javascript_output = gr.JSON(label="JavaScript Code Chunks")
                    javascript_code_output = gr.Code(label="Python Code Example", language="python", interactive=False)

    # The order of `outputs` must match the order of the tuple returned by
    # main_interface (17 values).
    process_button.click(
        fn=main_interface,
        inputs=[
            file_input,
            chunk_size_input,
            chunk_overlap_input,
            separator_input,
            keep_separator_input,
            add_start_index_input,
            strip_whitespace_input,
        ],
        outputs=[
            raw_text_display,
            hidden_raw_text,
            recursive_output,
            character_output,
            markdown_output,
            python_output,
            javascript_output,
            recursive_count_output,
            character_count_output,
            markdown_count_output,
            python_count_output,
            javascript_count_output,
            recursive_code_output,
            character_code_output,
            markdown_code_output,
            python_code_output,
            javascript_code_output,
        ],
    )

demo.queue().launch(share=False, inbrowser=True)