vamossyd committed on
Commit
32347fd
·
1 Parent(s): 57ddac8

Upload Inference.ipynb

Browse files
Files changed (1) hide show
  1. Inference.ipynb +305 -0
Inference.ipynb ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "750fed8c",
6
+ "metadata": {},
7
+ "source": [
8
+ "Must run the following:"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "ccad76ec",
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stdout",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\n"
22
+ ]
23
+ }
24
+ ],
25
+ "source": [
26
+ "!git clone https://github.com/dvamossy/EmTract.git\n",
27
+ "%cd EmTract\n",
28
+ "!pip install -r requirements.txt "
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "id": "2551adee",
34
+ "metadata": {},
35
+ "source": [
36
+ "Text Cleaner for unprocessed text"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 2,
42
+ "id": "687995ef",
43
+ "metadata": {},
44
+ "outputs": [
45
+ {
46
+ "name": "stderr",
47
+ "output_type": "stream",
48
+ "text": [
49
+ "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\\emtract\\processors\\cleaning.py:68: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
50
+ " symspell_list = pd.read_csv(\n"
51
+ ]
52
+ },
53
+ {
54
+ "data": {
55
+ "text/plain": [
56
+ "'soo well'"
57
+ ]
58
+ },
59
+ "execution_count": 2,
60
+ "metadata": {},
61
+ "output_type": "execute_result"
62
+ }
63
+ ],
64
+ "source": [
65
+ "from emtract.processors.cleaning import clean_text\n",
66
+ "# Illustrate text cleaning\n",
67
+ "clean_text(\"soooooo well\", segment_words=False)"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "markdown",
72
+ "id": "6b81c0cd",
73
+ "metadata": {},
74
+ "source": [
75
+ "Option I"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "0ca68eb1",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "from transformers import pipeline\n",
86
+ "classifier = pipeline(\"text-classification\", model=\"vamossyd/emtract-distilbert-base-uncased-emotion\", return_all_scores=True)\n",
87
+ "classifier(\"i love this!\")"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "markdown",
92
+ "id": "0b9cd58f",
93
+ "metadata": {},
94
+ "source": [
95
+ "Option II"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "id": "524cb5d6",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "import torch\n",
106
+ "import pandas as pd\n",
107
+ "import numpy as np\n",
108
+ "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer\n",
109
+ "\n",
110
+ "# Create class for data preparation\n",
111
+ "class SimpleDataset:\n",
112
+ " def __init__(self, tokenized_texts):\n",
113
+ " self.tokenized_texts = tokenized_texts\n",
114
+ " \n",
115
+ " def __len__(self):\n",
116
+ " return len(self.tokenized_texts[\"input_ids\"])\n",
117
+ " \n",
118
+ " def __getitem__(self, idx):\n",
119
+ " return {k: v[idx] for k, v in self.tokenized_texts.items()}"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "id": "1f9f01f4",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "input_path = \"PROVIDE_PATH_TO_DATA\"\n",
130
+ "# data = pd.read_csv(input_path) # ASSUMING DATA IS IN CSV\n",
131
+ "\n",
132
+ "# If text is already cleaned:\n",
133
+ "# texts = data.text.tolist() \n",
134
+ "\n",
135
+ "# Otherwise:\n",
136
+ "# texts = data['text'].apply(clean_text).tolist() # \n",
137
+ "\n",
138
+ "# As an example:\n",
139
+ "texts = ['i love this', 'i do not love you', 'to the moon 🚀']"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": null,
145
+ "id": "04ce5528",
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "# If the model fails to load, clone it with git (command below) and set model_name to the local folder emtract-distilbert-base-uncased-emotion\n",
150
+ "\n",
151
+ "#!git clone https://huggingface.co/vamossyd/emtract-distilbert-base-uncased-emotion"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "id": "839cd230",
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "# load tokenizer and model, create trainer\n",
162
+ "model_name = \"vamossyd/emtract-distilbert-base-uncased-emotion\"\n",
163
+ "# model_name = \"emtract-distilbert-base-uncased-emotion\" # in case the model does not load\n",
164
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
165
+ "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
166
+ "trainer = Trainer(model=model)\n",
167
+ "\n",
168
+ "# Tokenize texts and create prediction data set\n",
169
+ "tokenized_texts = tokenizer(texts, truncation=True, padding=True)\n",
170
+ "pred_dataset = SimpleDataset(tokenized_texts)\n",
171
+ "predictions = trainer.predict(pred_dataset)"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "id": "3d903549",
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": [
181
+ "# softmax over the raw prediction scores to get per-class probabilities\n",
182
+ "temp = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True))\n",
183
+ "preds = predictions.predictions.argmax(-1)\n",
184
+ "labels = pd.Series(preds).map(model.config.id2label)\n",
185
+ "\n",
186
+ "# container\n",
187
+ "anger = []\n",
188
+ "disgust = []\n",
189
+ "fear = []\n",
190
+ "happy = []\n",
191
+ "neutral = []\n",
192
+ "sadness = []\n",
193
+ "surprise = []\n",
194
+ "\n",
195
+ "# extract scores (one entry per element of texts)\n",
196
+ "for i in range(len(texts)):\n",
197
+ " anger.append(temp[i][3])\n",
198
+ " disgust.append(temp[i][4])\n",
199
+ " fear.append(temp[i][6])\n",
200
+ " happy.append(temp[i][1])\n",
201
+ " neutral.append(temp[i][0])\n",
202
+ " sadness.append(temp[i][2])\n",
203
+ " surprise.append(temp[i][5])\n",
204
+ " \n",
205
+ "df = pd.DataFrame(list(zip(texts, labels, anger, disgust, fear, happy, neutral, sadness, surprise)), columns=['text','pred_label', 'anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'])\n",
206
+ "df.head()"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "577f10b8",
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "# save results to csv\n",
217
+ "output_path = \"YOUR_FILENAME_EMOTIONS.csv\" # name your output file\n",
218
+ "# df.to_csv(output_path)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "markdown",
223
+ "id": "ddd22317",
224
+ "metadata": {},
225
+ "source": [
226
+ "Option III\n",
227
+ "\n",
228
+ "Batch prediction in case data is too large."
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "id": "6f39375b",
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "# Specify batch size\n",
239
+ "batch_size = 100000\n",
240
+ "\n",
241
+ "# Split the texts into batches\n",
242
+ "text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]\n",
243
+ "\n",
244
+ "# Store the predictions\n",
245
+ "all_predictions = []\n",
246
+ "\n",
247
+ "# Iterate through batches (NOTE: tqdm is never imported in this notebook; run `from tqdm.auto import tqdm` first)\n",
248
+ "for batch in tqdm(text_batches):\n",
249
+ " # Tokenize texts and create prediction dataset\n",
250
+ " tokenized_texts = tokenizer(batch, truncation=True, padding=True)\n",
251
+ " pred_dataset = SimpleDataset(tokenized_texts)\n",
252
+ " predictions = trainer.predict(pred_dataset)[0]\n",
253
+ " all_predictions.extend(predictions)\n",
254
+ "\n",
255
+ "all_predictions = np.array(all_predictions)\n",
256
+ "\n",
257
+ "# softmax over the raw prediction scores to get per-class probabilities\n",
258
+ "temp = (np.exp(all_predictions)/np.exp(all_predictions).sum(-1,keepdims=True))\n",
259
+ "\n",
260
+ "# container\n",
261
+ "anger = []\n",
262
+ "disgust = []\n",
263
+ "fear = []\n",
264
+ "happy = []\n",
265
+ "neutral = []\n",
266
+ "sadness = []\n",
267
+ "surprise = []\n",
268
+ "\n",
269
+ "# extract scores (one entry per element of texts)\n",
270
+ "for i in range(len(texts)):\n",
271
+ " anger.append(temp[i][3])\n",
272
+ " disgust.append(temp[i][4])\n",
273
+ " fear.append(temp[i][6])\n",
274
+ " happy.append(temp[i][1])\n",
275
+ " neutral.append(temp[i][0])\n",
276
+ " sadness.append(temp[i][2])\n",
277
+ " surprise.append(temp[i][5])\n",
278
+ " \n",
279
+ "df = pd.DataFrame(list(zip(texts, anger, disgust, fear, happy, neutral, sadness, surprise)), columns=['text', 'anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'])\n",
280
+ "df.head()"
281
+ ]
282
+ }
283
+ ],
284
+ "metadata": {
285
+ "kernelspec": {
286
+ "display_name": "Python 3 (ipykernel)",
287
+ "language": "python",
288
+ "name": "python3"
289
+ },
290
+ "language_info": {
291
+ "codemirror_mode": {
292
+ "name": "ipython",
293
+ "version": 3
294
+ },
295
+ "file_extension": ".py",
296
+ "mimetype": "text/x-python",
297
+ "name": "python",
298
+ "nbconvert_exporter": "python",
299
+ "pygments_lexer": "ipython3",
300
+ "version": "3.10.9"
301
+ }
302
+ },
303
+ "nbformat": 4,
304
+ "nbformat_minor": 5
305
+ }