# Repository page header (kept as a comment so it does not break the module):
#   detrina-grad / app.py
#   Last commit: "Update app.py" (8d2aba0, verified) by portalniy-dev
#   File size: 1.59 kB
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
import gradio as gr
# Step 1: load the datasets.
# Every entry is downloaded (or read from the local cache) at import time, in
# the order listed.
# NOTE(review): these corpora presumably expose different column schemas;
# verify that downstream code reading a "text" column works for each of them.
datasets = [
    load_dataset('squad'),
    load_dataset('conll2003'),
    load_dataset('glue', 'mrpc'),
    load_dataset('trec'),
    # bAbI is published on the Hub as "babi_qa" and needs an explicit config;
    # the bare name 'babi' does not resolve to a dataset.
    # NOTE(review): 'en-qa1' (English, task 1) is an assumed choice — confirm
    # which bAbI task/variant is actually wanted.
    load_dataset('babi_qa', 'en-qa1'),
]
# Step 2: load the tokenizer and the sequence-classification model.
# NOTE(review): 'distilbert-base-uncased' looks like a base checkpoint, so the
# classification head is presumably freshly initialised — confirm before
# relying on the predicted probabilities.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = DistilBertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
# Step 3: tokenization (the original comment also mentions model training,
# but no training call appears in the visible source).
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Args:
        examples: Mapping-like batch; its ``"text"`` entry is passed to the
            module-level ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns for the batch, with padding to the
        maximum length and truncation enabled.

    Raises:
        KeyError: presumably, when the batch has no ``"text"`` entry
            (depends on the mapping type passed in).
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")
# Apply the tokenizer to every loaded dataset in batched mode, keeping the
# results in the same order as `datasets`.
tokenized_datasets = [
    dataset.map(tokenize_function, batched=True) for dataset in datasets
]
# Step 4: optimise the model with dynamic quantization.
# Only torch.nn.Linear modules are targeted, using the qint8 dtype; the
# module-level `model` name is rebound to the quantized result.
model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)
# Step 5: text-classification function used by the UI.
def classify_text(text):
    """Return softmax class probabilities for ``text``.

    Args:
        text: Raw input string to classify.

    Returns:
        The softmax over the last dimension of the model's logits, converted
        to nested Python lists (presumably one inner list per input sequence,
        i.e. a single inner list here).
    """
    # Truncate to the tokenizer's maximum length so very long inputs do not
    # exceed what the model can process in a single forward pass.
    tokens = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**tokens)
    return torch.nn.functional.softmax(outputs.logits, dim=-1).tolist()
# Step 6: configure the Gradio interface (text in, JSON out) and launch it.
interface = gr.Interface(
    fn=classify_text,
    inputs="text",
    outputs="json",
)
interface.launch()