pile-v2-eda / app.py
Reshinth Adithyan
Add local dedup version
import streamlit as st
import datasets
import os
import json
from transformers import AutoTokenizer
import ast
import re
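# Streamlit EDA app for the CarperAI pile-v2-small-filtered dataset:
# choose a dataset version and split in the sidebar, then inspect
# individual rows with rendered content, metadata, and token/word counts.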
version = st.sidebar.selectbox("Choose a version", ["init","local_dedup"])
if version == "init":
    CACHE_DIR = "cache_ds/"  # Use this to build the dataset
else:
    CACHE_DIR = "local_dedup/"
contribution_json = "contributors.json"
with open(contribution_json, "r") as f:
    contribution_dict = json.load(f)
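# contributors.json is assumed to map split names to lists of contributor
# names (structure inferred from how contribution_dict is used below), e.g.:
#   {"<split>": ["name1", "name2"], ...}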
IGNORE_LIST = ["Bible","Tanzil",""]
splits = [split for split in os.listdir(CACHE_DIR) if split not in IGNORE_LIST]
cached_ds = os.listdir(CACHE_DIR)
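# GPT-NeoX-20B tokenizer, used below to report each row's token count
# against the model's 2048-token context window.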
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
def load_page(split):
    with st.spinner("Downloading and building dataset..."):
        if split not in cached_ds:
            # Load the split from the Hub; passing split="train" (rather than
            # a positional config name) returns a Dataset that supports
            # len() and integer indexing below.
            ds = datasets.load_dataset(
                "CarperAI/pile-v2-small-filtered",
                data_files="data/" + split + "/data.json",
                split="train",
            )
        else:
            ds = datasets.load_from_disk(CACHE_DIR + split)
        print("Successfully loaded " + split)
    st.title("Dataset Explorer")
    st.write(f"# {split}")
    if split in contribution_dict:
        st.caption(f"Contributors: {', '.join(contribution_dict[split])}")
    else:
        st.caption("Needs to be updated....")
    with st.form("dataset_form"):
        index = st.slider("Select a row", 0, len(ds) - 1, 0)
        if st.form_submit_button("Load"):
            st.write(f"Row {index}")
            data = ds[index]
            content = data["text"]
            meta = data["meta"]
            with st.expander("Render Content"):
                st.write(content)
            st.write("### Content:")
            st.text(content)
            st.write("### Meta:")
            st.write(ast.literal_eval(meta))
            # Token count, compared against the model's 2048-token context
            tokenized = tokenizer(content, return_length=True)["length"][0]
            token_count_metric = st.metric(
                "Token Count (compared to 2048)",
                value=tokenized,
                delta=2048 - tokenized,
            )
            # Word count
            split_words = re.findall(r"\w+", content)
            word_count_metric = st.metric("Word Count", value=len(split_words))
demo_name = st.sidebar.selectbox("Choose a demo", splits)
load_page(demo_name)
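# To try the app locally (assuming streamlit, datasets, and transformers
# are installed):
#   streamlit run app.py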