# Source: Hugging Face Space sitwala/test_sentence_splitter
# Space status at capture time: Sleeping
# File size: 2,174 Bytes

import streamlit as st
import pandas as pd
from dolma.core.utils import split_paragraphs, split_sentences

# Title of the Streamlit app
st.title('Text Splitter: Paragraphs and Sentences')

# File uploader for text document (restricted to .txt files)
uploaded_file = st.file_uploader("Upload a text file", type=["txt"])

if uploaded_file is not None:
    # getvalue() returns the whole buffer independent of the current stream
    # position, so the result stays correct even if the buffer was read before.
    raw_bytes = uploaded_file.getvalue()
    try:
        # "utf-8-sig" decodes plain UTF-8 and additionally strips a leading
        # byte-order mark, which would otherwise end up inside the first
        # paragraph/sentence.
        sample_text = raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Report the problem in the UI instead of crashing with a traceback;
        # an empty string makes the rest of the script fall back to its
        # "nothing to split" branch.
        st.error(
            "The uploaded file could not be decoded as UTF-8. "
            "Please upload a UTF-8 encoded text file."
        )
        sample_text = ""
else:
    # No file uploaded: take the text from a free-form text area instead
    sample_text = st.text_area("Or paste your text below", height=300)

if not sample_text:
    # Nothing to process yet: prompt the user for input
    st.write("Please upload a text file or paste your text to split it into paragraphs and sentences.")
else:
    # Run both splitters over the same input text
    paragraphs = split_paragraphs(sample_text)
    sentences = split_sentences(sample_text)

    # Summary counts
    st.write(f"Number of paragraphs: {len(paragraphs)}")
    st.write(f"Number of sentences: {len(sentences)}")

    # Side-by-side layout: paragraphs on the left, sentences on the right
    left_col, right_col = st.columns(2)
    views = (
        (left_col, "Paragraphs", "Paragraph", paragraphs),
        (right_col, "Sentences", "Sentence", sentences),
    )
    for column, heading, label, pieces in views:
        with column:
            st.header(heading)
            # Items are numbered from 1 for display
            for number, piece in enumerate(pieces, start=1):
                st.subheader(f"{label} {number}")
                st.write(piece.text)

    # Build single-column DataFrames holding the text of each piece
    paragraph_rows = [piece.text for piece in paragraphs]
    sentence_rows = [piece.text for piece in sentences]
    paragraphs_df = pd.DataFrame(paragraph_rows, columns=["Paragraph"])
    sentences_df = pd.DataFrame(sentence_rows, columns=["Sentence"])

    # Serialize each DataFrame to UTF-8 encoded CSV bytes (no index column)
    paragraphs_csv = paragraphs_df.to_csv(index=False).encode('utf-8')
    sentences_csv = sentences_df.to_csv(index=False).encode('utf-8')

    # Offer both CSV files for download
    st.download_button(
        label="Download Paragraphs as CSV",
        data=paragraphs_csv,
        file_name="paragraphs.csv",
        mime="text/csv",
    )
    st.download_button(
        label="Download Sentences as CSV",
        data=sentences_csv,
        file_name="sentences.csv",
        mime="text/csv",
    )