pile-v2-eda / app.py
Reshinth Adithyan
Remove single datapoint datasets
9c88e2b
raw
history blame
2.06 kB
import streamlit as st
import datasets
import os
import json
from transformers import AutoTokenizer
import ast
import re
# Directory holding the datasets that were saved to disk ahead of time.
CACHE_DIR = "cache_ds/"  # Use this to build the dataset

# JSON file mapping a split name to the list of its contributors' names.
contribution_json = "contributors.json"
# Use a context manager so the file handle is closed deterministically, and
# read as UTF-8 explicitly instead of depending on the platform locale.
with open(contribution_json, "r", encoding="utf-8") as contribution_file:
    contribution_dict = json.load(contribution_file)

# Directory entries that must not be offered in the split selector.
IGNORE_LIST = ["Bible", "Tanzil", ""]

# List the cache directory once so that `splits` and `cached_ds` are derived
# from the same snapshot of the directory contents.
cached_ds = os.listdir(CACHE_DIR)
splits = [split for split in cached_ds if split not in IGNORE_LIST]

# Tokenizer used to report the token count of the selected row.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
def load_page(split):
    """Render the dataset explorer page for one split.

    The split is read from ``CACHE_DIR`` when its name is listed in
    ``cached_ds``; otherwise it is loaded from the
    ``CarperAI/pile-v2-small-filtered`` repository. The page shows the split
    name and its contributors, then a form with a row slider. Submitting the
    form displays the row's text, its metadata, and token / word counts.

    Args:
        split: Name of the split to display (str).
    """
    with st.spinner('Downloading and buidling dataset...'):
        if split not in cached_ds:
            # The second positional parameter of `load_dataset` is the
            # configuration name, not the split. Pass `split=` by keyword so
            # a single row-indexable Dataset is returned rather than a
            # mapping of split names to datasets.
            ds = datasets.load_dataset(
                'CarperAI/pile-v2-small-filtered',
                split="train",
                data_files="data/" + split + "/data.json",
            )
        else:
            ds = datasets.load_from_disk(CACHE_DIR + split)
        print("Sucessfully loaded " + split)

    st.title("Dataset Explorer")
    st.write(f"# {split}")
    if split in contribution_dict:
        st.caption(f"Contributors: {','.join(contribution_dict[split])}")
    else:
        st.caption("Needs to be updated....")

    row_count = len(ds)
    if row_count == 0:
        # With no rows there is no valid index to offer in the slider.
        st.warning("This dataset has no rows to display.")
        return

    with st.form("dataset_form"):
        index = st.slider('Select a row', 0, row_count - 1, 0)
        if st.form_submit_button("Load"):
            st.write(f"Row {index}")
            data = ds[index]
            content = data["text"]
            meta = data["meta"]

            with st.expander("Render Content"):
                st.write(content)

            st.write("### Content:")
            st.text(content)

            st.write("### Meta:")
            # `meta` is expected to be the string form of a Python literal.
            # Fall back to showing the raw value when it cannot be parsed,
            # instead of failing the whole page.
            try:
                parsed_meta = ast.literal_eval(meta)
            except (ValueError, SyntaxError, TypeError):
                parsed_meta = meta
            st.write(parsed_meta)

            # Tokenizer related count
            tokenized = tokenizer(content, return_length=True)['length'][0]
            # NOTE(review): the label mentions 2048 but the delta is computed
            # against 4096 - confirm which limit is intended.
            st.metric(
                "Token Count(compared to 2048)",
                value=tokenized,
                delta=4096 - tokenized,
            )

            # Word related count
            split_words = re.findall(r'\w+', content)
            st.metric("Word Count", value=len(split_words))
# Sidebar selector for the split to explore.
demo_name = st.sidebar.selectbox("Choose a demo", splits)
if demo_name is None:
    # The selectbox yields None when there is nothing to select; passing
    # that on would fail on the string concatenation inside load_page.
    st.warning("No datasets are available to explore.")
else:
    load_page(demo_name)