Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,7 @@ st.title("The Stack data Inspection")
|
|
| 8 |
|
| 9 |
df = pd.read_csv("extension_distribution.csv")
|
| 10 |
all_extensions = df["extension"].tolist()
|
|
|
|
| 11 |
tags = {}
|
| 12 |
for index, row in df.iterrows():
|
| 13 |
if row["language"] not in tags:
|
|
@@ -18,26 +19,32 @@ all_languages = list(tags.keys())
|
|
| 18 |
|
| 19 |
@st.cache()
|
| 20 |
def load_data(language, ext):
|
| 21 |
-
ds = load_dataset(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
return ds
|
| 23 |
|
| 24 |
-
|
|
|
|
| 25 |
with col1:
|
| 26 |
chosen_language = st.selectbox(
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
index=0)
|
| 30 |
with col2:
|
| 31 |
chosen_ext = st.selectbox(
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
|
|
|
|
| 36 |
samples = load_data(chosen_language, chosen_ext)
|
| 37 |
max_docs = len(samples)
|
| 38 |
samples = samples.add_column("idx", range(len(samples)))
|
| 39 |
-
not_lexed = samples.filter(lambda x: not x[
|
| 40 |
-
indexes_not_lexed = not_lexed[
|
|
|
|
| 41 |
|
| 42 |
# info about extension
|
| 43 |
st.markdown("### Information about the extension:")
|
|
@@ -46,23 +53,30 @@ text = f"Extension {chosen_ext} has {max_docs} files, {df[df['extension'] == cho
|
|
| 46 |
are not lexable. These files are at indexes: {indexes_not_lexed}."
|
| 47 |
st.markdown(text)
|
| 48 |
|
| 49 |
-
col_1,
|
| 50 |
with col_1:
|
| 51 |
-
index_example = st.number_input(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
# info about the chosen example
|
| 55 |
example = samples[index_example]
|
| 56 |
st.markdown("#### Information about the chosen example:")
|
| 57 |
-
text_alpha = "**has**" if example[
|
| 58 |
-
text_lines = "**has**" if example[
|
| 59 |
-
text_lexer = "is" if example[
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
st.markdown(f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
|
| 63 |
-
{text_lines} very long lines, and {text_lexer} lexable.")
|
| 64 |
|
|
|
|
| 65 |
st.markdown("#### File content:")
|
| 66 |
-
|
| 67 |
st.code(example["content"], language=chosen_language)
|
| 68 |
-
|
|
|
|
| 8 |
|
| 9 |
df = pd.read_csv("extension_distribution.csv")
|
| 10 |
all_extensions = df["extension"].tolist()
|
| 11 |
+
|
| 12 |
tags = {}
|
| 13 |
for index, row in df.iterrows():
|
| 14 |
if row["language"] not in tags:
|
|
|
|
| 19 |
|
| 20 |
@st.cache()
|
| 21 |
def load_data(language, ext):
|
| 22 |
+
ds = load_dataset(
|
| 23 |
+
"loubnabnl/the-stack-inspection-data",
|
| 24 |
+
data_dir=f"data/{language}/{ext}",
|
| 25 |
+
split="train",
|
| 26 |
+
)
|
| 27 |
return ds
|
| 28 |
|
| 29 |
+
|
| 30 |
+
col1, col2, _ = st.columns([1, 1, 4])
|
| 31 |
with col1:
|
| 32 |
chosen_language = st.selectbox(
|
| 33 |
+
label="Select a programming language", options=all_languages, index=0
|
| 34 |
+
)
|
|
|
|
| 35 |
with col2:
|
| 36 |
chosen_ext = st.selectbox(
|
| 37 |
+
label="Select an extension", options=tags[chosen_language], index=0
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
|
| 41 |
+
# load the dataset and get indexes of non lexable files
|
| 42 |
samples = load_data(chosen_language, chosen_ext)
|
| 43 |
max_docs = len(samples)
|
| 44 |
samples = samples.add_column("idx", range(len(samples)))
|
| 45 |
+
not_lexed = samples.filter(lambda x: not x["lexable"])
|
| 46 |
+
indexes_not_lexed = not_lexed["idx"]
|
| 47 |
+
|
| 48 |
|
| 49 |
# info about extension
|
| 50 |
st.markdown("### Information about the extension:")
|
|
|
|
| 53 |
are not lexable. These files are at indexes: {indexes_not_lexed}."
|
| 54 |
st.markdown(text)
|
| 55 |
|
| 56 |
+
col_1, _ = st.columns([2, 4])
|
| 57 |
with col_1:
|
| 58 |
+
index_example = st.number_input(
|
| 59 |
+
f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
|
| 60 |
+
min_value=0,
|
| 61 |
+
max_value=max_docs - 1,
|
| 62 |
+
value=0,
|
| 63 |
+
step=1,
|
| 64 |
+
)
|
| 65 |
|
| 66 |
|
| 67 |
# info about the chosen example
|
| 68 |
example = samples[index_example]
|
| 69 |
st.markdown("#### Information about the chosen example:")
|
| 70 |
+
text_alpha = "**has**" if example["long_lines"] else "doesn't have"
|
| 71 |
+
text_lines = "**has**" if example["low_alphanum"] else "doesn't have"
|
| 72 |
+
text_lexer = "is" if example["lexable"] else "**isn't**"
|
| 73 |
|
| 74 |
+
st.markdown(
|
| 75 |
+
f"Example {index_example} {text_alpha} a very low alphanumeric ratio, \
|
| 76 |
+
{text_lines} very long lines, and {text_lexer} lexable."
|
| 77 |
+
)
|
| 78 |
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
# display file content
|
| 81 |
st.markdown("#### File content:")
|
|
|
|
| 82 |
st.code(example["content"], language=chosen_language)
|
|
|