awacke1's picture
Create app.py
88471fd verified
import streamlit as st
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import pandas as pd
def load_orca_dataset():
    """Show a progress notice, then load the Orca AgentInstruct dataset.

    Returns:
        Whatever ``load_dataset`` returns for
        ``"microsoft/orca-agentinstruct-1M-v1"``.
    """
    dataset_id = "microsoft/orca-agentinstruct-1M-v1"
    st.info("Loading dataset... This may take a while.")
    orca = load_dataset(dataset_id)
    return orca
# Models and tokenizers are shared resources, not serializable data:
# st.cache_resource keeps a single live instance per model name, whereas
# st.cache_data would pickle the return value and hand back a copy per call.
@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load and cache the tokenizer and sequence-classification model.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model
def evaluate_model(ds, tokenizer, model, max_samples, text_field):
    """Classify up to ``max_samples`` rows of ``ds`` with a text-classification pipeline.

    Args:
        ds: Iterable of rows; each row is indexed with ``text_field``.
        tokenizer: Tokenizer handed to the pipeline.
        model: Model handed to the pipeline.
        max_samples: Maximum number of rows to evaluate.
        text_field: Key of the row value that is sent to the classifier.

    Returns:
        A list with one dict per evaluated row, holding the keys
        ``"input"``, ``"label"`` and ``"score"``.
    """
    st.info("Evaluating the model...")
    # Device index 0 when CUDA is available, otherwise -1.
    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
    )
    results = []
    for i, example in enumerate(ds):
        if i >= max_samples:
            break
        input_text = example[text_field]
        # truncation=True is forwarded to the tokenizer so that inputs longer
        # than the model's maximum sequence length are cut instead of raising
        # at inference time.
        result = classifier(input_text, truncation=True)[0]
        results.append(
            {"input": input_text, "label": result["label"], "score": result["score"]}
        )
    return results
def main():
    """Render the Streamlit page: dataset loader, explorer and model evaluator.

    The loaded dataset and the latest evaluation results are kept in
    ``st.session_state`` so that they survive the script reruns Streamlit
    triggers on every widget interaction.
    """
    st.title("Orca Dataset Browser and Model Evaluator")
    st.sidebar.header("Configuration")

    if st.sidebar.button("Load Dataset"):
        st.session_state["dataset"] = load_orca_dataset()

    # Nothing else can be rendered until a dataset has been loaded.
    if "dataset" not in st.session_state:
        return
    dataset = st.session_state["dataset"]

    # List available splits
    available_splits = list(dataset.keys())
    st.sidebar.subheader("Available Dataset Splits")
    selected_split = st.sidebar.selectbox("Select Split", available_splits)
    split_ds = dataset[selected_split]

    st.subheader("Dataset Explorer")
    st.write(f"Displaying information for split: `{selected_split}`")
    st.write(split_ds.info)

    # An empty split has no first row to inspect and nothing to sample.
    split_size = len(split_ds)
    if split_size == 0:
        st.warning("The selected split is empty.")
        return

    # Determine available fields
    sample_entry = split_ds[0]
    st.sidebar.subheader("Available Fields in Dataset")
    available_fields = list(sample_entry.keys())
    st.sidebar.write(available_fields)
    text_field = st.sidebar.selectbox("Select Text Field", available_fields)

    sample_size = st.slider("Number of Samples to Display", min_value=1, max_value=20, value=5)
    # The same seeded shuffle serves both the preview and the evaluation.
    shuffled = split_ds.shuffle(seed=42)
    # Clamp to the split length: selecting indices past the end raises.
    st.write(shuffled.select(range(min(sample_size, split_size))))

    st.subheader("Model Evaluator")
    model_name = st.text_input("Enter Hugging Face Model Name", value="distilbert-base-uncased-finetuned-sst-2-english")
    max_samples = st.number_input("Number of Samples to Evaluate", min_value=1, max_value=100, value=10)
    if st.button("Load Model and Evaluate"):
        eval_count = min(int(max_samples), split_size)
        tokenizer, model = load_model_and_tokenizer(model_name)
        # Persist the results: a button is only True during the run that
        # follows its click, so anything computed here would otherwise vanish
        # on the next rerun (e.g. when the download button is pressed).
        st.session_state["evaluation_results"] = evaluate_model(
            shuffled.select(range(eval_count)),
            tokenizer,
            model,
            eval_count,
            text_field,
        )

    results = st.session_state.get("evaluation_results")
    if results is not None:
        st.subheader("Evaluation Results")
        st.write(results)
        st.download_button(
            label="Download Results as CSV",
            data=pd.DataFrame(results).to_csv(index=False),
            file_name="evaluation_results.csv",
            mime="text/csv",
        )
# Run the app only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()