"""Streamlit app that splits text into paragraphs and sentences.

The text comes from an uploaded ``.txt`` file or, when nothing has been
uploaded, from a text area. Both splits are shown side by side and can be
downloaded as CSV files.
"""

from typing import Optional

import pandas as pd
import streamlit as st
from dolma.core.utils import split_paragraphs, split_sentences


def _decode_upload(raw: bytes) -> Optional[str]:
    """Decode uploaded bytes as UTF-8, or return ``None`` if they are invalid.

    ``utf-8-sig`` is used so that a leading byte-order mark is dropped rather
    than ending up inside the first paragraph and the exported CSV.
    """
    try:
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        return None


def _show_slices(title, item_label, texts):
    """Render a header followed by one numbered subheader + body per text."""
    st.header(title)
    for number, text in enumerate(texts, start=1):
        st.subheader(f"{item_label} {number}")
        st.write(text)


def _csv_download_button(texts, column, label, file_name):
    """Offer ``texts`` as a single-column, UTF-8 encoded CSV download."""
    frame = pd.DataFrame(texts, columns=[column])
    st.download_button(
        label=label,
        data=frame.to_csv(index=False).encode("utf-8"),
        file_name=file_name,
        mime="text/csv",
    )


# Title of the Streamlit app
st.title('Text Splitter: Paragraphs and Sentences')

# File uploader for text document
uploaded_file = st.file_uploader("Upload a text file", type=["txt"])

if uploaded_file is not None:
    # getvalue() returns the whole buffer regardless of the stream position,
    # whereas read() only returns what is left after it.
    # NOTE(review): this matters if Streamlit hands back an already-read file
    # object on a rerun - confirm for the deployed Streamlit version.
    sample_text = _decode_upload(uploaded_file.getvalue())
    if sample_text is None:
        # Report the problem in the UI instead of crashing with a traceback.
        st.error(
            "The uploaded file is not valid UTF-8 text. "
            "Please upload a UTF-8 encoded .txt file."
        )
        st.stop()
else:
    # Text input from user
    sample_text = st.text_area("Or paste your text below", height=300)

# Whitespace-only input is treated the same as no input at all.
if sample_text.strip():
    paragraphs = split_paragraphs(sample_text)
    sentences = split_sentences(sample_text)

    # The splitters' results expose their content as ``.text``; extract it
    # once and reuse it for both the on-screen view and the CSV export.
    paragraph_texts = [p.text for p in paragraphs]
    sentence_texts = [s.text for s in sentences]

    # Show number of paragraphs and sentences
    st.write(f"Number of paragraphs: {len(paragraph_texts)}")
    st.write(f"Number of sentences: {len(sentence_texts)}")

    # Two columns: paragraphs on the left, sentences on the right
    col1, col2 = st.columns(2)
    with col1:
        _show_slices("Paragraphs", "Paragraph", paragraph_texts)
    with col2:
        _show_slices("Sentences", "Sentence", sentence_texts)

    # Option to download the paragraphs and sentences as CSV files
    _csv_download_button(
        paragraph_texts,
        column="Paragraph",
        label="Download Paragraphs as CSV",
        file_name="paragraphs.csv",
    )
    _csv_download_button(
        sentence_texts,
        column="Sentence",
        label="Download Sentences as CSV",
        file_name="sentences.csv",
    )
else:
    st.write("Please upload a text file or paste your text to split it into paragraphs and sentences.")