""" | |
Quantumaurora: Advanced Transformer-based Language Model | |
Version: 1.0.0 | |
Created: 2025 | |
""" | |
import numpy as np | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from torch.utils.data import Dataset, DataLoader | |
from transformers import PreTrainedTokenizerFast | |
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders | |
import math | |
from typing import Optional, Dict, List, Tuple | |
from torch.cuda.amp import autocast, GradScaler | |
from torch.nn.parallel import DistributedDataParallel | |
import torch.distributed as dist | |
import torch.multiprocessing as mp | |
from torch.utils.checkpoint import checkpoint | |
import json | |
import os | |
from datetime import datetime | |
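

# NOTE: PositionalEncoding, TransformerBlock and PreTrainingObjectives are used
# below but their definitions are not included in this file. The sketches that
# follow are minimal, assumed implementations so the module runs end to end.
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding.

    Assumed implementation: called on embeddings of shape
    (batch, seq_len, d_model); max_len defaults to the model's default
    max_sequence_length of 2048.
    """

    def __init__(self, d_model: int, max_len: int = 2048):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Add positional information for the first seq_len positions
        return x + self.pe[:, :x.size(1)]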


class QuantumauroraConfig:
    """Configuration class for Quantumaurora model"""

    def __init__(self,
                 vocab_size: int = 50000,
                 d_model: int = 512,
                 num_heads: int = 8,
                 num_layers: int = 6,
                 d_ff: int = 2048,
                 dropout: float = 0.1,
                 attention_type: str = "full",
                 use_checkpointing: bool = True,
                 max_sequence_length: int = 2048,
                 model_version: str = "1.0.0"):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout
        self.attention_type = attention_type
        self.use_checkpointing = use_checkpointing
        self.max_sequence_length = max_sequence_length
        self.model_version = model_version
        self.model_type = "quantumaurora"

    def save(self, path: str):
        """Save configuration to JSON file"""
        # Copy so the timestamp is not written back onto the config instance
        config_dict = dict(self.__dict__)
        config_dict['timestamp'] = datetime.now().isoformat()
        with open(path, 'w') as f:
            json.dump(config_dict, f, indent=2)

    @classmethod
    def load(cls, path: str) -> 'QuantumauroraConfig':
        """Load configuration from JSON file"""
        with open(path, 'r') as f:
            config_dict = json.load(f)
        # Drop keys written by save() that are not __init__ arguments
        config_dict.pop('timestamp', None)
        config_dict.pop('model_type', None)
        return cls(**config_dict)
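

class TransformerBlock(nn.Module):
    """Single pre-norm transformer block.

    Assumed implementation: the original file does not include this class. This
    sketch uses standard full self-attention via nn.MultiheadAttention; the
    "sparse"/"local" attention patterns mentioned in the model docstring are not
    reproduced here, and attention_type is only recorded.
    """

    def __init__(self, d_model: int, num_heads: int, d_ff: int,
                 dropout: float = 0.1, attention_type: str = "full"):
        super().__init__()
        self.attention_type = attention_type
        self.attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Self-attention sub-layer with residual connection
        normed = self.norm1(x)
        attn_out, _ = self.attn(normed, normed, normed, attn_mask=mask, need_weights=False)
        x = x + self.dropout(attn_out)
        # Feed-forward sub-layer with residual connection
        x = x + self.dropout(self.ff(self.norm2(x)))
        return x


class PreTrainingObjectives(nn.Module):
    """Output heads for the pre-training objectives.

    Assumed implementation: the original file does not include this class. This
    sketch provides a single language-modelling head that projects hidden states
    to vocabulary logits under the key 'lm_logits'; the "multiple pre-training
    objectives" mentioned in the model docstring are not reproduced here.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, hidden_states: torch.Tensor) -> Dict[str, torch.Tensor]:
        return {'lm_logits': self.lm_head(hidden_states)}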


class Quantumaurora(nn.Module):
    """
    Quantumaurora: Advanced Transformer-based Language Model

    A state-of-the-art language model featuring:
    - Multi-head attention with sparse/local patterns
    - Multiple pre-training objectives
    - Gradient checkpointing
    - Mixed precision training
    - Distributed training support
    """

    def __init__(self, config: QuantumauroraConfig):
        super().__init__()
        self.config = config

        # Model components
        self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.positional_encoding = PositionalEncoding(config.d_model)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(
                config.d_model,
                config.num_heads,
                config.d_ff,
                config.dropout,
                config.attention_type
            ) for _ in range(config.num_layers)
        ])
        self.pretraining_objectives = PreTrainingObjectives(
            config.d_model,
            config.vocab_size
        )
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        x = self.token_embedding(x)
        x = self.positional_encoding(x)
        x = self.dropout(x)

        for transformer_block in self.transformer_blocks:
            if self.config.use_checkpointing and self.training:
                # Recompute activations in the backward pass to save memory
                x = checkpoint(transformer_block, x, mask, use_reentrant=False)
            else:
                x = transformer_block(x, mask)

        return self.pretraining_objectives(x)

    def save_pretrained(self, path: str):
        """Save model and configuration"""
        os.makedirs(path, exist_ok=True)

        # Save configuration
        config_path = os.path.join(path, 'config.json')
        self.config.save(config_path)

        # Save model weights
        model_path = os.path.join(path, 'model.pt')
        torch.save(self.state_dict(), model_path)

        # Save tokenizer if available
        if hasattr(self, 'tokenizer'):
            tokenizer_path = os.path.join(path, 'tokenizer.json')
            if hasattr(self.tokenizer, 'backend_tokenizer'):
                # PreTrainedTokenizerFast wraps a tokenizers.Tokenizer
                self.tokenizer.backend_tokenizer.save(tokenizer_path)
            else:
                # Raw tokenizers.Tokenizer
                self.tokenizer.save(tokenizer_path)

    @classmethod
    def from_pretrained(cls, path: str) -> 'Quantumaurora':
        """Load pretrained model and configuration"""
        config = QuantumauroraConfig.load(os.path.join(path, 'config.json'))
        model = cls(config)

        model_path = os.path.join(path, 'model.pt')
        model.load_state_dict(torch.load(model_path, map_location='cpu'))

        # Load tokenizer if available
        tokenizer_path = os.path.join(path, 'tokenizer.json')
        if os.path.exists(tokenizer_path):
            model.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)

        return model


class QuantumauroraTrainer:
    """Training manager for Quantumaurora model"""

    def __init__(self,
                 model: Quantumaurora,
                 train_dataloader: DataLoader,
                 optimizer: torch.optim.Optimizer,
                 device: str = "cuda",
                 use_mixed_precision: bool = True,
                 distributed: bool = True):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        self.device = device
        self.use_mixed_precision = use_mixed_precision
        self.distributed = distributed

        if use_mixed_precision:
            self.scaler = GradScaler()

        if distributed:
            self.model = DistributedDataParallel(self.model)

    def train(self, num_epochs: int, save_dir: Optional[str] = None):
        """Main training loop"""
        best_loss = float('inf')

        for epoch in range(num_epochs):
            losses = self.train_epoch(epoch)

            # Save checkpoint if this is the best model so far
            if save_dir and losses['total'] < best_loss:
                best_loss = losses['total']
                # Unwrap DistributedDataParallel before saving, if needed
                model_to_save = self.model.module if self.distributed else self.model
                model_to_save.save_pretrained(os.path.join(save_dir, f'checkpoint-{epoch}'))

            print(f"Epoch {epoch+1}/{num_epochs}")
            for loss_name, loss_value in losses.items():
                print(f"{loss_name}: {loss_value:.4f}")


def main():
    """Example usage of Quantumaurora"""
    # Initialize configuration
    config = QuantumauroraConfig(
        vocab_size=50000,
        d_model=768,
        num_heads=12,
        num_layers=12,
        attention_type="sparse"
    )

    # Initialize model
    model = Quantumaurora(config)

    # NOTE: a real tokenized corpus is assumed here; this random-token tensor is
    # only a placeholder so the example is self-contained.
    dataset = torch.randint(0, config.vocab_size, (1024, 128))
    train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Multi-GPU training if available
    world_size = torch.cuda.device_count()
    if world_size > 1:
        mp.spawn(
            train_distributed,
            args=(world_size, model, dataset),
            nprocs=world_size,
            join=True
        )
    else:
        # Single-GPU (or CPU) training
        device = "cuda" if torch.cuda.is_available() else "cpu"
        trainer = QuantumauroraTrainer(
            model=model,
            train_dataloader=train_dataloader,
            optimizer=torch.optim.Adam(model.parameters()),
            device=device,
            use_mixed_precision=torch.cuda.is_available(),
            distributed=False
        )
        trainer.train(
            num_epochs=10,
            save_dir='quantumaurora_checkpoints'
        )


if __name__ == "__main__":
    main()