E.L.N / app.py
Sephfox's picture
Update app.py
d84bf23 verified
raw
history blame
8.25 kB
# Imports
import streamlit as st
import numpy as np
import torch
import random
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from huggingface_hub import HfApi
import plotly.graph_objects as go
import time
from datetime import datetime
# Cyberpunk and Loading Animation Styling
def setup_cyberpunk_style():
    """Inject the app's neon "cyberpunk" stylesheet into the Streamlit page.

    The CSS themes the app container (``.stApp``), the glowing
    ``.main-title`` heading, Streamlit buttons, and the custom
    ``.progress-bar-container`` / ``.progress-bar`` markup.
    """
    # The stylesheet is kept flush-left so the emitted markup is unchanged.
    stylesheet = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@400;500;700&display=swap');
@import url('https://fonts.googleapis.com/css2?family=Share+Tech+Mono&display=swap');
.stApp {
background: radial-gradient(circle, rgba(0, 0, 0, 0.95) 20%, rgba(0, 50, 80, 0.95) 90%);
color: #00ff9d;
font-family: 'Orbitron', sans-serif;
}
.main-title {
text-align: center;
font-size: 4em;
color: #00ff9d;
letter-spacing: 4px;
animation: glow 2s ease-in-out infinite alternate;
}
@keyframes glow {
from {text-shadow: 0 0 5px #00ff9d, 0 0 10px #00ff9d;}
to {text-shadow: 0 0 15px #00b8ff, 0 0 20px #00b8ff;}
}
.stButton > button {
font-family: 'Orbitron', sans-serif;
background: linear-gradient(45deg, #00ff9d, #00b8ff);
color: #000;
font-size: 1.1em;
padding: 10px 20px;
border: none;
border-radius: 8px;
transition: all 0.3s ease;
}
.stButton > button:hover {
transform: scale(1.1);
box-shadow: 0 0 20px rgba(0, 255, 157, 0.5);
}
.progress-bar-container {
background: rgba(0, 0, 0, 0.5);
border-radius: 15px;
overflow: hidden;
width: 100%;
height: 30px;
position: relative;
margin: 10px 0;
}
.progress-bar {
height: 100%;
width: 0%;
background: linear-gradient(45deg, #00ff9d, #00b8ff);
transition: width 0.5s ease;
}
</style>
"""
    # unsafe_allow_html lets the raw <style> element through un-escaped.
    st.markdown(stylesheet, unsafe_allow_html=True)
# Prepare Dataset Function with Padding Token Fix
def prepare_dataset(data, tokenizer, block_size=128):
    """Turn raw text into a tokenized, torch-formatted training dataset.

    Args:
        data: Value stored under the ``'text'`` column (callers in this file
            pass a list of strings).
        tokenizer: Tokenizer callable. As a side effect its ``pad_token`` is
            overwritten with its ``eos_token``.
        block_size: Length every example is truncated/padded to.

    Returns:
        A ``Dataset`` whose ``input_ids``, ``attention_mask`` and ``labels``
        columns are exposed in torch format.
    """
    # Padding token fix: reuse EOS as the pad token before padding below.
    tokenizer.pad_token = tokenizer.eos_token

    def _encode_batch(batch):
        # Truncate and pad every text in the batch to exactly block_size.
        return tokenizer(
            batch['text'],
            truncation=True,
            max_length=block_size,
            padding='max_length',
        )

    def _attach_labels(batch):
        # Labels are the input ids themselves.
        return {'labels': batch['input_ids']}

    text_only = Dataset.from_dict({'text': data})
    encoded = text_only.map(_encode_batch, batched=True, remove_columns=['text'])
    encoded = encoded.map(_attach_labels, batched=True)
    encoded.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels'],
    )
    return encoded
# Training Dashboard Class (tracks metrics and loss history; no rendering)
class TrainingDashboard:
    """Track live training metrics and a timestamped loss history.

    Attributes:
        metrics: Dict holding ``current_loss``, ``best_loss``, ``generation``,
            ``individual``, ``start_time`` (``time.time()`` at construction)
            and ``training_speed``.
        history: List of ``{'loss', 'timestamp'}`` dicts, one per
            :meth:`update` call, with ``timestamp`` formatted ``HH:MM:SS``.
    """

    def __init__(self):
        self.metrics = {
            'current_loss': 0,
            'best_loss': float('inf'),
            'generation': 0,
            'individual': 0,
            'start_time': time.time(),
            'training_speed': 0,
        }
        self.history = []

    def update(self, loss, generation, individual):
        """Record the latest loss and refresh the derived metrics.

        Args:
            loss: Loss value for the step being reported.
            generation: Current generation counter.
            individual: Current individual counter.

        ``training_speed`` is ``generation * individual`` divided by the
        seconds elapsed since construction. It is reported as ``0.0`` when no
        positive time has elapsed, instead of raising ``ZeroDivisionError``
        or yielding a negative rate.
        """
        metrics = self.metrics
        metrics['current_loss'] = loss
        metrics['generation'] = generation
        metrics['individual'] = individual
        if loss < metrics['best_loss']:
            metrics['best_loss'] = loss

        elapsed_time = time.time() - metrics['start_time']
        # The wall clock may not have advanced yet (coarse timer resolution)
        # or may have been stepped backwards; neither yields a usable rate.
        if elapsed_time > 0:
            metrics['training_speed'] = (generation * individual) / elapsed_time
        else:
            metrics['training_speed'] = 0.0

        self.history.append({
            'loss': loss,
            'timestamp': datetime.now().strftime('%H:%M:%S'),
        })
# Define Model Initialization
def initialize_model(model_name="gpt2"):
    """Load a pretrained GPT-2 language model and its tokenizer.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        ``(model, tokenizer)``; the tokenizer's ``pad_token`` is set to its
        ``eos_token``.
    """
    language_model = GPT2LMHeadModel.from_pretrained(model_name)
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token.
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    return language_model, gpt2_tokenizer
# Load Dataset Function
def load_dataset(data_source="demo", tokenizer=None, uploaded_file=None):
    """Build the tokenized training dataset for the selected data source.

    Args:
        data_source: ``"demo"`` for the built-in sample sentence; any other
            value means "use the uploaded file".
        tokenizer: Tokenizer forwarded to ``prepare_dataset``.
        uploaded_file: Optional binary file-like object with a ``name``
            attribute (such as the object returned by ``st.file_uploader``).
            A name ending in ``.csv`` is parsed with pandas and its first
            column used as text; anything else is decoded as UTF-8 text.

    Returns:
        The dataset produced by ``prepare_dataset``.

    Raises:
        ValueError: If an uploaded file yields no text at all.
    """
    if data_source == "demo":
        data = ["Sample text data for model training. This can be replaced with actual data for better performance."]
    elif uploaded_file is not None:
        # Rewind defensively in case the buffer was already read earlier.
        uploaded_file.seek(0)
        if uploaded_file.name.lower().endswith(".csv"):
            import pandas as pd  # local import: only needed for CSV uploads

            try:
                frame = pd.read_csv(uploaded_file)
            except pd.errors.EmptyDataError:
                frame = pd.DataFrame()
            if frame.shape[1] == 0:
                data = []
            else:
                # Assume the first column holds the text; drop empty cells
                # and coerce the rest to str so the tokenizer accepts them.
                data = frame.iloc[:, 0].dropna().astype(str).tolist()
        else:
            text = uploaded_file.read().decode("utf-8", errors="replace")
            data = [text] if text.strip() else []
        if not data:
            raise ValueError("Uploaded file contains no text to train on.")
    else:
        # Legacy behaviour for callers that pass no file: placeholder text.
        data = ["Loaded data from uploaded text file."]
    return prepare_dataset(data, tokenizer)
# Train Model Function (the progress bar itself is rendered by main)
def train_model(model, train_dataset, tokenizer, epochs=3, batch_size=4):
    """Fine-tune ``model`` on ``train_dataset`` with the Hugging Face Trainer.

    Args:
        model: Model to train (updated in place by the trainer).
        train_dataset: Tokenized dataset to train on.
        tokenizer: Tokenizer used by the language-modeling data collator.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Checkpoints are written under ``./results`` and logs under ``./logs``.
    """
    run_config = {
        "output_dir": "./results",
        "overwrite_output_dir": True,
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "save_steps": 10000,
        "save_total_limit": 2,
        "logging_dir": "./logs",
        "logging_steps": 100,
    }
    arguments = TrainingArguments(**run_config)
    # mlm=False selects the non-masked language-modeling collation mode.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    Trainer(
        model=model,
        args=arguments,
        data_collator=collator,
        train_dataset=train_dataset,
    ).train()
# Main App Logic
def _read_uploaded_texts(uploaded_file):
    """Return the training strings contained in an uploaded .txt/.csv file.

    An empty list means the file held no usable text.
    """
    # Rewind defensively in case the buffer was already read on an earlier run.
    uploaded_file.seek(0)
    if uploaded_file.name.lower().endswith(".csv"):
        import pandas as pd  # local import: only needed for CSV uploads

        try:
            frame = pd.read_csv(uploaded_file)
        except pd.errors.EmptyDataError:
            return []
        if frame.shape[1] == 0:
            return []
        # Assume the first column is the text; drop empty cells and coerce
        # everything else to str so the tokenizer accepts it.
        return frame.iloc[:, 0].dropna().astype(str).tolist()
    text = uploaded_file.read().decode("utf-8", errors="replace")
    return [text] if text.strip() else []


def main():
    """Render the training hub UI and fine-tune the selected model on request.

    The sidebar collects the Hugging Face token, training parameters, model
    choice and data source. Training starts only when the "Start Training"
    button is pressed, because Streamlit re-executes this function on every
    widget interaction.
    """
    setup_cyberpunk_style()
    st.markdown('<h1 class="main-title">Cyberpunk Neural Training Hub</h1>', unsafe_allow_html=True)

    # Bound up front so the "demo" path never touches an undefined name.
    uploaded_file = None

    # Sidebar Configuration with Additional Options
    with st.sidebar:
        st.markdown("### Configuration Panel")
        # Hugging Face API Token Input
        hf_token = st.text_input("Enter your Hugging Face Token", type="password")
        if hf_token:
            # HfApi takes the token at construction; the old
            # set_access_token() helper is not available on current releases.
            api = HfApi(token=hf_token)
            st.success("Hugging Face token added successfully!")
        # Training Parameters
        training_epochs = st.slider("Training Epochs", min_value=1, max_value=5, value=3)
        batch_size = st.slider("Batch Size", min_value=2, max_value=8, value=4)
        model_choice = st.selectbox("Model Selection", ("gpt2", "distilgpt2", "gpt2-medium"))
        # Dataset Source Selection
        data_source = st.selectbox("Data Source", ("demo", "uploaded file"))
        if data_source == "uploaded file":
            uploaded_file = st.file_uploader("Upload a text file", type=["txt", "csv"])
        # Explicit format so values of this magnitude are not shown as 0.00.
        custom_learning_rate = st.slider(
            "Learning Rate",
            min_value=1e-6,
            max_value=5e-4,
            value=3e-5,
            step=1e-6,
            format="%.6f",
        )
        # Advanced Settings Toggle
        advanced_toggle = st.checkbox("Advanced Training Settings")
        if advanced_toggle:
            warmup_steps = st.slider("Warmup Steps", min_value=0, max_value=500, value=100)
            weight_decay = st.slider("Weight Decay", min_value=0.0, max_value=0.1, step=0.01, value=0.01)
        else:
            warmup_steps = 100
            weight_decay = 0.01
        # NOTE(review): custom_learning_rate, warmup_steps and weight_decay
        # are collected here, but train_model() has no parameters for them,
        # so they do not influence training yet.

    # Start Training with Progress Bar
    progress_placeholder = st.empty()
    st.markdown("### Model Training Progress")

    # Without this gate every slider move would reload the model and retrain.
    if not st.button("Start Training"):
        return

    # Validate the data source before paying for the model download.
    texts = None
    if data_source != "demo":
        if uploaded_file is None:
            st.warning("No file uploaded. Please upload a dataset.")
            return
        texts = _read_uploaded_texts(uploaded_file)
        if not texts:
            st.warning("The uploaded file contains no usable text.")
            return

    # Initialize model and tokenizer for the model picked in the sidebar.
    model, tokenizer = initialize_model(model_choice)

    # Load Dataset
    if texts is None:
        train_dataset = load_dataset("demo", tokenizer)
    else:
        train_dataset = prepare_dataset(texts, tokenizer)

    for epoch in range(training_epochs):
        # One single-epoch training call per iteration so the bar can advance
        # between epochs.
        train_model(model, train_dataset, tokenizer, epochs=1, batch_size=batch_size)
        # Update Progress Bar
        progress = (epoch + 1) / training_epochs * 100
        # The markup stays flush-left so Markdown does not treat it as code.
        progress_placeholder.markdown(f"""
<div class="progress-bar-container">
<div class="progress-bar" style="width: {progress}%;"></div>
</div>
""", unsafe_allow_html=True)
    st.success("Training Complete!")


# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()