cardiffnlp committed on
Commit
3f3ed66
1 Parent(s): 27ca1af

Adding tweeteval classifier

Browse files
Files changed (6) hide show
  1. README.md +83 -0
  2. config.json +37 -0
  3. pytorch_model.bin +3 -0
  4. special_tokens_map.json +1 -0
  5. tf_model.h5 +3 -0
  6. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ widget:
4
+ - text: "Covid cases are increasing fast!"
5
+ - text: "🤗"
6
+ - text: "I hate you 🤮"
7
+ ---
8
+
9
+
10
+ # Twitter-roBERTa-base for Sentiment Analysis
11
+
12
+ This is a roBERTa-base model trained on ~200M tweets and fine-tuned for sentiment analysis with the TweetEval benchmark. This model is suitable for English-language text.
13
+
14
+ - Reference Paper: [_TweetEval_ (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf).
15
+ - Git Repo: [TweetEval official repository](https://github.com/cardiffnlp/tweeteval).
16
+
17
+ <b>Labels</b>:
18
+ 0 -> Negative;
19
+ 1 -> Neutral;
20
+ 2 -> Positive
21
+
22
+ ## Example Pipeline
23
+ ```python
24
+ from transformers import pipeline
25
+ sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
26
+ sentiment_task("Covid cases are increasing fast!")
27
+ ```
28
+ ```
29
+ [{'label': 'Negative', 'score': 0.7236}]
30
+ ```
31
+
32
+ ## Full classification example
33
+
34
+ ```python
35
+ from transformers import AutoModelForSequenceClassification
36
+ from transformers import TFAutoModelForSequenceClassification
37
+ from transformers import AutoTokenizer, AutoConfig
38
+ import numpy as np
39
+ from scipy.special import softmax
40
+ # Preprocess text (replace usernames and links with placeholders)
41
+ def preprocess(text):
42
+ new_text = []
43
+ for t in text.split(" "):
44
+ t = '@user' if t.startswith('@') and len(t) > 1 else t
45
+ t = 'http' if t.startswith('http') else t
46
+ new_text.append(t)
47
+ return " ".join(new_text)
48
+ MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
49
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
50
+ config = AutoConfig.from_pretrained(MODEL)
51
+ # PT
52
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
53
+ #model.save_pretrained(MODEL)
54
+ text = "Covid cases are increasing fast!"
55
+ text = preprocess(text)
56
+ encoded_input = tokenizer(text, return_tensors='pt')
57
+ output = model(**encoded_input)
58
+ scores = output[0][0].detach().numpy()
59
+ scores = softmax(scores)
60
+ # # TF
61
+ # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
62
+ # model.save_pretrained(MODEL)
63
+ # text = "Covid cases are increasing fast!"
64
+ # encoded_input = tokenizer(text, return_tensors='tf')
65
+ # output = model(encoded_input)
66
+ # scores = output[0][0].numpy()
67
+ # scores = softmax(scores)
68
+ # Print labels and scores
69
+ ranking = np.argsort(scores)
70
+ ranking = ranking[::-1]
71
+ for i in range(scores.shape[0]):
72
+ l = config.id2label[ranking[i]]
73
+ s = scores[ranking[i]]
74
+ print(f"{i+1}) {l} {np.round(float(s), 4)}")
75
+ ```
76
+
77
+ Output:
78
+
79
+ ```
80
+ 1) Negative 0.7236
81
+ 2) Neutral 0.2287
82
+ 3) Positive 0.0477
83
+ ```
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/jupyter/misc/tweeteval/TweetEval_models/sentiment/sentiment_latest_2021/",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "Negative",
16
+ "1": "Neutral",
17
+ "2": "Positive"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "Negative": 0,
23
+ "Neutral": 1,
24
+ "Positive": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.13.0.dev0",
35
+ "type_vocab_size": 1,
36
+ "vocab_size": 50265
37
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d24a3e32a88ed1c4e5b789fc6644e2e767500554e954b27dccf52a8e762cbae
3
+ size 501045531
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:682358ffb3869b08a144d5e59325534335729720fe64d5f2b3a543f8e5d14a9e
3
+ size 498845224
vocab.json ADDED
The diff for this file is too large to render. See raw diff