### Splitting the training and test sets

```python
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/训练数据.csv')
# Map the sentiment labels to 1 (positive) and 0 (negative)
df['class'] = df['class'].str.replace('正面', '1')
df['class'] = df['class'].str.replace('负面', '0')
df = df.rename(columns={'class': 'label'})
# The column names must stay text/label, matching the example dataset, so that trainer.train() later recognizes the labels without errors
df = df.loc[:, ['text', 'label']]

train_df, test_df = train_test_split(df, test_size=0.2)
train_df.to_csv('../data/训练数据/train.csv', index=False)
test_df.to_csv('../data/训练数据/test.csv', index=False)
```

### Loading the base tokenizer, model, and dataset for training

```python
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer_path = 'model/tokenizer-roberta-base-finetuned-dianping-chinese'
model_path = 'model/roberta-base-finetuned-dianping-chinese'

# Load the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Load the model; the labels fall into two classes, 1 and 0 (positive/negative)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Load the dataset from local files or from the Hub
dataset = load_dataset('../data/训练数据')
# dataset = load_dataset('Fearao/guba_eastmoney')
```

```
loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file model/roberta-base-finetuned-dianping-chinese/config.json
Model config BertConfig {
  "_name_or_path": "model/roberta-base-finetuned-dianping-chinese",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative (stars 1, 2 and 3)",
    "1": "positive (stars 4 and 5)"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative (stars 1, 2 and 3)": 0,
    "positive (stars 4 and 5)": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

loading weights file model/roberta-base-finetuned-dianping-chinese/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at model/roberta-base-finetuned-dianping-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
```

### Tokenizing with the tokenizer

```python
def tokenize_function(examples):
    # Always pass max_length here; otherwise the tokenizer assumes no max_length and training may later fail because of mismatched sequence lengths
    return tokenizer(examples["text"], max_length=256, padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
```

```
Using custom data configuration 训练数据_test-3c60bc9557b87513
Downloading and preparing dataset csv/训练数据_test to /root/.cache/huggingface/datasets/csv/训练数据_test-3c60bc9557b87513/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...
/home/fearao/anaconda3/envs/pytorch/lib/python3.8/site-packages/datasets/download/streaming_download_manager.py:714: FutureWarning: the 'mangle_dupe_cols' keyword is deprecated and will be removed in a future version. Please take steps to stop the use of 'mangle_dupe_cols'
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/训练数据_test-3c60bc9557b87513/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.
```
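Padding every example to the full `max_length` keeps all sequences the same size but spends compute on padding tokens for short comments. As a possible alternative (not what the run above used), dynamic padding with `DataCollatorWithPadding` pads each batch only to the length of its longest example; a minimal sketch:

```python
# Sketch only: dynamic padding as an alternative to padding='max_length' (assumption, not part of the original run)
from transformers import DataCollatorWithPadding

def tokenize_function(examples):
    # Truncate only; leave padding to the collator at batch-assembly time
    return tokenizer(examples["text"], max_length=256, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Pass data_collator=data_collator to the Trainer below so batches are padded on the fly
```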
```python
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

import numpy as np
import evaluate

metric = evaluate.load("accuracy")
```

```
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
```

```python
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
```

```python
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
```

```
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
```
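Only `output_dir` and `evaluation_strategy` are set here, so everything else falls back to the `Trainer` defaults (for example, 3 epochs and a per-device batch size of 8, as the training log below shows). A sketch of a more explicit configuration, with illustrative values rather than the ones actually used:

```python
# Sketch only: commonly tuned TrainingArguments; the values below are illustrative assumptions
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",       # evaluate on the test split after every epoch
    save_strategy="epoch",             # save a checkpoint after every epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    load_best_model_at_end=True,       # keep the checkpoint with the best metric
    metric_for_best_model="accuracy",
)
```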
```python
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
trainer.train()
```

```
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.
/home/fearao/anaconda3/envs/pytorch/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
***** Running training *****
  Num examples = 7087
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2658
  Number of trainable parameters = 102269186
```

| Epoch | Training Loss | Validation Loss |
|---|---|---|
```
Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1772
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1772
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1772
  Batch size = 8

Training completed. Do not forget to share your model on huggingface.co/models =)
```

```
TrainOutput(global_step=2658, training_loss=0.20413605049196448, metrics={'train_runtime': 2680.4781, 'train_samples_per_second': 7.932, 'train_steps_per_second': 0.992, 'total_flos': 2797002074004480.0, 'train_loss': 0.20413605049196448, 'epoch': 3.0})
```
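The `TrainOutput` above only reports the training loss. A minimal sketch of a final pass over the held-out split to obtain the accuracy computed by `compute_metrics` (its actual numbers are not reproduced here):

```python
# Sketch only: evaluate the fine-tuned model on the eval_dataset that was passed to the Trainer
eval_results = trainer.evaluate()
print(eval_results)  # a dict such as {'eval_loss': ..., 'eval_accuracy': ..., 'epoch': 3.0}
```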
```python
trainer.save_model('model/fearao/roberta_based_on_eastmoney_guba_comments')  # Save the fine-tuned model
```

```
Saving model checkpoint to model/fearao/roberta_based_on_eastmoney_guba_comments
Configuration saved in model/fearao/roberta_based_on_eastmoney_guba_comments/config.json
Model weights saved in model/fearao/roberta_based_on_eastmoney_guba_comments/pytorch_model.bin
```

### Testing

```python
from transformers import pipeline

# Load the model from local files or from the Hub
model_test = AutoModelForSequenceClassification.from_pretrained('model/fearao/roberta_based_on_eastmoney_guba_comments')
# model_test = AutoModelForSequenceClassification.from_pretrained('Fearao/RoBERTa_based_on_eastmoney_guba_comments')

# Use the original tokenizer
tokenizer = AutoTokenizer.from_pretrained('uer/roberta-base-finetuned-chinanews-chinese')
# tokenizer = AutoTokenizer.from_pretrained('model/tokenizer-roberta-base-finetuned-dianping-chinese')

text_classification = pipeline('sentiment-analysis', model=model_test, tokenizer=tokenizer)
text_classification('又跳水了')
```

```
loading configuration file model/fearao/roberta_based_on_eastmoney_guba_comments/config.json
Model config BertConfig {
  "_name_or_path": "model/fearao/roberta_based_on_eastmoney_guba_comments",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative (stars 1, 2 and 3)",
    "1": "positive (stars 4 and 5)"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative (stars 1, 2 and 3)": 0,
    "positive (stars 4 and 5)": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

loading weights file model/fearao/roberta_based_on_eastmoney_guba_comments/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at model/fearao/roberta_based_on_eastmoney_guba_comments.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
```
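For convenience, the tokenizer used during training could also be saved into the same directory as the fine-tuned weights, so that inference only needs a single local path instead of loading a separate tokenizer. A minimal sketch under that assumption (not part of the original run); the pipeline also accepts a list of texts:

```python
# Sketch only (assumption): save the training tokenizer next to the fine-tuned weights,
# so model and tokenizer can both be loaded from one local path
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

train_tokenizer = AutoTokenizer.from_pretrained('model/tokenizer-roberta-base-finetuned-dianping-chinese')
train_tokenizer.save_pretrained('model/fearao/roberta_based_on_eastmoney_guba_comments')

model_dir = 'model/fearao/roberta_based_on_eastmoney_guba_comments'
text_classification = pipeline(
    'sentiment-analysis',
    model=AutoModelForSequenceClassification.from_pretrained(model_dir),
    tokenizer=AutoTokenizer.from_pretrained(model_dir),
)
# Passing a list runs batch prediction; the second comment is an illustrative input
text_classification(['又跳水了', '涨停了'])
```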