Spaces:
Sleeping
Sleeping
import streamlit as st | |
import json | |
import pandas as pd | |
from datasets import load_dataset | |
# Page chrome: wide layout, with the same text used for the browser tab
# title and the on-page heading.
st.set_page_config(
    page_title="The Stack data Inspection",
    layout="wide",
)
st.title("The Stack data Inspection")

# Statistics table; this section reads its "language" and "extension"
# columns, and later code reads per-extension count columns from it.
df = pd.read_csv("extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Map each language to the list of its extensions, in CSV row order.
# Languages appear in the order of their first row.
tags = {}
for _, record in df.iterrows():
    tags.setdefault(record["language"], []).append(record["extension"])
all_languages = list(tags)
def load_data(language, ext):
    """Return the "train" split of the inspection data for one extension.

    The data is read from the ``data/{language}/{ext}`` directory of the
    ``loubnabnl/the-stack-inspection-data`` dataset repository.
    """
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )
# --- Language / extension pickers ---------------------------------------
# Two narrow columns hold the select boxes; the third, wide column is only
# a spacer and is never written to.
col1, col2, col3 = st.columns([1, 1, 4])
with col1:
    chosen_language = st.selectbox(
        label="Select a programming language",
        options=all_languages,
        index=0,
    )
with col2:
    # Only the extensions listed for the selected language are offered.
    chosen_ext = st.selectbox(
        label="Select an extension",
        options=tags[chosen_language],
        index=0,
    )

samples = load_data(chosen_language, chosen_ext)
max_docs = len(samples)
if max_docs == 0:
    # With no files the number input below would get max_value=-1 (< min_value)
    # and raise, so report the situation and end this script run instead.
    st.warning(f"No files found for extension {chosen_ext}.")
    st.stop()

# Tag every row with its position so the positions of non-lexable files
# survive the filter below.
samples = samples.add_column("idx", list(range(max_docs)))
not_lexed = samples.filter(lambda x: not x['lexable'])
indexes_not_lexed = not_lexed['idx']

# --- Information about the extension ------------------------------------
st.markdown("### Information about the extension:")
# Select the statistics row by language AND extension: matching on the
# extension alone would pick the first row of another language whenever the
# same extension is listed under several languages.
ext_stats = df[(df['language'] == chosen_language) & (df['extension'] == chosen_ext)]
low_alphanum_count = ext_stats['low_alphanum_count'].values[0]
long_lines_count = ext_stats['long_lines_count'].values[0]
non_lexable_count = ext_stats['non_lexable_count'].values[0]
text = (
    f"Extension {chosen_ext} has {max_docs} files, "
    f"{low_alphanum_count} with very low alphanumeric ratio, "
    f"{long_lines_count} with very long lines, and {non_lexable_count} "
    f"are not lexable. These files are at indexes: {indexes_not_lexed}."
)
st.markdown(text)

# --- Example picker -----------------------------------------------------
col_1, col_2 = st.columns([2, 4])
with col_1:
    index_example = st.number_input(
        f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )
    # NOTE(review): the original indentation was lost; this line is assumed
    # to sit next to the number input inside col_1 - confirm the layout.
    st.write(f"Example chosen:{index_example}")

# --- Information about the chosen example -------------------------------
example = samples[index_example]
st.markdown("#### Information about the chosen example:")
# Each phrase is driven by the flag it describes (the two were previously
# swapped, so the low-alphanumeric text reflected the long-lines flag and
# vice versa).
text_alpha = "**has**" if example['low_alphanum'] else "doesn't have"
text_lines = "**has**" if example['long_lines'] else "doesn't have"
text_lexer = "is" if example['lexable'] else "**isn't**"
st.markdown(
    f"Example {index_example} {text_alpha} a very low alphanumeric ratio, "
    f"{text_lines} very long lines, and {text_lexer} lexable."
)

st.markdown("#### File content:")
st.code(example["content"], language=chosen_language)