Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
# Page-level configuration: browser-tab title, icon, and wide layout.
# NOTE(review): the page_icon value looks like a mis-encoded emoji; it is kept
# unchanged here -- confirm the intended icon and restore it.
st.set_page_config(page_title="Datasets Preparation", page_icon="š", layout="wide")

##########################################
# Load the three sample datasets displayed on this page.

# Raw corpus: every line of the text file becomes one row of a single
# "Content" column (line endings are kept, exactly as readlines() returns them).
# The encoding is pinned so decoding does not depend on the host's locale
# default. NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
with open("datafiles/sample_nep_corpus.txt", encoding="utf-8") as file:
    items = file.readlines()
datacorpus = pd.DataFrame(items, columns=["Content"])

# Sentence table, read in full from CSV.
datasentences = pd.read_csv("datafiles/sample_nep_sentences.csv")

# Only the first 50 rows of the spelling dataset are read for display.
data100k = pd.read_csv(
    r"datafiles/sample_nep_spell_100k.csv",
    nrows=50,
)
########################################### | |
# Page heading.
st.title("Dataset Preparation")

# Each entry is (header text, caption text, DataFrame to display). Sections are
# rendered in this order, each one preceded by a "---" Markdown rule.
page_sections = (
    (
        "\nA Large Scale Nepali Text Corpus\n",
        "**Table 1.** A Large Scale Nepali Text Corpus",
        datacorpus,
    ),
    (
        "\nSentence extracted from A Large Scale Nepali Text Corpus\n",
        "**Table 2.** Extracted sentences",
        datasentences,
    ),
    (
        "\nParallel dataset using extracted sentences\n",
        "**Table 3.** 100k Dataset used for training",
        data100k,
    ),
)

for section_header, section_caption, section_frame in page_sections:
    st.write("---")
    st.header(section_header)
    st.caption(section_caption)
    st.dataframe(section_frame, use_container_width=True)