# Spaces:
# Sleeping
# Sleeping
import pandas as pd
import streamlit as st
from dolma.core.utils import split_paragraphs, split_sentences
# Streamlit app: split user-supplied text into paragraphs and sentences,
# show both side by side, and offer each list as a CSV download.

# Title of the Streamlit app
st.title('Text Splitter: Paragraphs and Sentences')

# Input: an uploaded .txt file takes precedence; the paste box is only
# rendered when no file has been uploaded.
uploaded_file = st.file_uploader("Upload a text file", type=["txt"])
if uploaded_file is not None:
    # getvalue() returns the whole buffer regardless of the current read
    # position, so a script rerun (e.g. after clicking a download button)
    # cannot yield an empty string the way a second read() on an
    # already-consumed buffer would.
    raw_bytes = uploaded_file.getvalue()
    try:
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise end up inside the first paragraph/sentence.
        sample_text = raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        st.error(
            "The uploaded file is not valid UTF-8 text. "
            "Please upload a UTF-8 encoded .txt file."
        )
        # Halt this script run here; nothing below can work without text.
        st.stop()
else:
    # Text input from user
    sample_text = st.text_area("Or paste your text below", height=300)

# Whitespace-only input is treated the same as no input.
if sample_text.strip():
    # Split the full text both ways; each returned item exposes `.text`.
    paragraphs = split_paragraphs(sample_text)
    sentences = split_sentences(sample_text)

    # Extract the plain strings once; reused for display and for the CSVs.
    paragraph_texts = [paragraph.text for paragraph in paragraphs]
    sentence_texts = [sentence.text for sentence in sentences]

    # Show number of paragraphs and sentences
    st.write(f"Number of paragraphs: {len(paragraph_texts)}")
    st.write(f"Number of sentences: {len(sentence_texts)}")

    # Create two columns for separate views
    col1, col2 = st.columns(2)

    # Display paragraphs in the left column
    with col1:
        st.header("Paragraphs")
        for number, text in enumerate(paragraph_texts, start=1):
            st.subheader(f"Paragraph {number}")
            st.write(text)

    # Display sentences in the right column
    with col2:
        st.header("Sentences")
        for number, text in enumerate(sentence_texts, start=1):
            st.subheader(f"Sentence {number}")
            st.write(text)

    # Convert paragraphs and sentences to single-column pandas DataFrames
    paragraphs_df = pd.DataFrame(paragraph_texts, columns=["Paragraph"])
    sentences_df = pd.DataFrame(sentence_texts, columns=["Sentence"])

    # Option to download the paragraphs and sentences as CSV files
    st.download_button(
        label="Download Paragraphs as CSV",
        data=paragraphs_df.to_csv(index=False).encode('utf-8'),
        file_name="paragraphs.csv",
        mime="text/csv",
    )
    st.download_button(
        label="Download Sentences as CSV",
        data=sentences_df.to_csv(index=False).encode('utf-8'),
        file_name="sentences.csv",
        mime="text/csv",
    )
else:
    st.write("Please upload a text file or paste your text to split it into paragraphs and sentences.")