import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import load_dataset
import gradio as gr

# Step 1: Load and prepare the datasets.
# NOTE: depending on the version of `datasets`, some of these IDs may require
# an explicit configuration name (bAbI in particular is published under several IDs).
datasets = [
    load_dataset('squad'),
    load_dataset('conll2003'),
    load_dataset('glue', 'mrpc'),
    load_dataset('trec'),
    load_dataset('babi')
]

# Step 2: Load the model and tokenizer.
# The classification head of DistilBertForSequenceClassification is randomly
# initialized, so it only produces meaningful scores after fine-tuning
# (see the Trainer sketch at the end of the file).
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# Step 3: Tokenize the datasets.
# The datasets above store their text under different column names ("question"
# for SQuAD, "sentence1" for MRPC, "text" for TREC, pre-split "tokens" for
# CoNLL-2003), so the function looks up a matching column instead of assuming
# a universal "text" field. Nested formats such as bAbI stories need their own
# preprocessing and are not handled here.
def tokenize_function(examples):
    if "tokens" in examples:  # CoNLL-2003 provides pre-split words
        return tokenizer(examples["tokens"], is_split_into_words=True,
                         padding="max_length", truncation=True)
    for column in ("text", "question", "sentence1", "sentence"):
        if column in examples:
            return tokenizer(examples[column], padding="max_length", truncation=True)
    raise KeyError(f"No known text column among {list(examples.keys())}")

tokenized_datasets = []
for ds in datasets:
    tokenized_ds = ds.map(tokenize_function, batched=True)
    tokenized_datasets.append(tokenized_ds)

# Step 4: Optimize the model with dynamic quantization
# (Linear-layer weights are stored as int8, activations stay in floating point).
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
model.eval()

# Step 5: A text-classification function for inference.
def classify_text(text):
    tokens = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    return torch.nn.functional.softmax(outputs.logits, dim=-1).tolist()

# Step 6: Set up the Gradio interface.
interface = gr.Interface(fn=classify_text, inputs="text", outputs="json")
interface.launch()
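
# --- Fine-tuning sketch (illustrative, not part of the original pipeline) ---
# The pipeline above never trains the classification head, so classify_text
# returns scores from randomly initialized weights. The helper below is a
# minimal sketch of how Trainer/TrainingArguments could be wired up, using
# GLUE MRPC because its binary labels match the default 2-label head. The
# hyperparameters and output_dir are assumed values, and fine-tuning must run
# on the float model, i.e. before the dynamic quantization in Step 4.
from transformers import Trainer, TrainingArguments

def finetune_on_mrpc(model, tokenizer):
    mrpc = load_dataset('glue', 'mrpc')

    def tokenize_pairs(examples):
        # MRPC is a sentence-pair task, so both sentences are fed to the tokenizer.
        return tokenizer(examples['sentence1'], examples['sentence2'],
                         padding='max_length', truncation=True)

    encoded = mrpc.map(tokenize_pairs, batched=True)
    args = TrainingArguments(
        output_dir='./mrpc-distilbert',   # assumed output directory
        num_train_epochs=1,               # assumed: a single short epoch for the sketch
        per_device_train_batch_size=16,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=encoded['train'],
        eval_dataset=encoded['validation'],
    )
    trainer.train()
    return trainer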