vamossyd committed
Commit b8f33ae • 1 Parent(s): 54c9ed4

Upload Inference.ipynb


Adding a notebook for simple Hugging Face inference.

Files changed (1)
  1. Inference.ipynb +230 -0
Inference.ipynb ADDED
@@ -0,0 +1,230 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "750fed8c",
+ "metadata": {},
+ "source": [
+ "Must run the following:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "ccad76ec",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\n"
+ ]
+ }
+ ],
+ "source": [
+ "!git clone https://github.com/dvamossy/EmTract.git\n",
+ "%cd EmTract\n",
+ "!pip install -r requirements.txt "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2551adee",
+ "metadata": {},
+ "source": [
+ "Text Cleaner for unprocessed text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "687995ef",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\\emtract\\processors\\cleaning.py:68: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
+ " symspell_list = pd.read_csv(\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'soo well'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from emtract.processors.cleaning import clean_text\n",
+ "# Illustrate text cleaning\n",
+ "clean_text(\"soooooo well\", segment_words=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b81c0cd",
+ "metadata": {},
+ "source": [
+ "Option I"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ca68eb1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "classifier = pipeline(\"text-classification\", model=\"vamossyd/emtract-distilbert-base-uncased-emotion\", return_all_scores=True)\n",
+ "classifier(\"i love this!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b9cd58f",
+ "metadata": {},
+ "source": [
+ "Option II"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "524cb5d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer\n",
+ "\n",
+ "# Create class for data preparation\n",
+ "class SimpleDataset:\n",
+ "    def __init__(self, tokenized_texts):\n",
+ "        self.tokenized_texts = tokenized_texts\n",
+ "    \n",
+ "    def __len__(self):\n",
+ "        return len(self.tokenized_texts[\"input_ids\"])\n",
+ "    \n",
+ "    def __getitem__(self, idx):\n",
+ "        return {k: v[idx] for k, v in self.tokenized_texts.items()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f9f01f4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_path = \"PROVIDE_PATH_TO_DATA\"\n",
+ "# data = pd.read_csv(input_path) # ASSUMING DATA IS IN CSV\n",
+ "\n",
+ "# If text is already cleaned:\n",
+ "# texts = data.text.tolist() \n",
+ "\n",
+ "# Otherwise:\n",
+ "# texts = data['text'].apply(clean_text).tolist()\n",
+ "\n",
+ "# As an example:\n",
+ "texts = ['i love this', 'i do not love you', 'to the moon 🚀']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "839cd230",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load tokenizer and model, create trainer\n",
+ "model_name = \"vamossyd/emtract-distilbert-base-uncased-emotion\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
+ "trainer = Trainer(model=model)\n",
+ "\n",
+ "# Tokenize texts and create prediction data set\n",
+ "tokenized_texts = tokenizer(texts, truncation=True, padding=True)\n",
+ "pred_dataset = SimpleDataset(tokenized_texts)\n",
+ "predictions = trainer.predict(pred_dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3d903549",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# scores raw\n",
+ "temp = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True))\n",
+ "preds = predictions.predictions.argmax(-1)\n",
+ "labels = pd.Series(preds).map(model.config.id2label)\n",
+ "\n",
+ "# container\n",
+ "anger = []\n",
+ "disgust = []\n",
+ "fear = []\n",
+ "happy = []\n",
+ "neutral = []\n",
+ "sadness = []\n",
+ "surprise = []\n",
+ "\n",
+ "# extract scores (one entry per item in texts)\n",
+ "for i in range(len(texts)):\n",
+ "    anger.append(temp[i][3])\n",
+ "    disgust.append(temp[i][4])\n",
+ "    fear.append(temp[i][6])\n",
+ "    happy.append(temp[i][1])\n",
+ "    neutral.append(temp[i][0])\n",
+ "    sadness.append(temp[i][2])\n",
+ "    surprise.append(temp[i][5])\n",
+ "\n",
+ "df = pd.DataFrame(list(zip(texts, labels, anger, disgust, fear, happy, neutral, sadness, surprise)), columns=['text','pred_label', 'anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'])\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "577f10b8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save results to csv\n",
+ "output_path = \"YOUR_FILENAME_EMOTIONS.csv\" # name your output file\n",
+ "# df.to_csv(output_path)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
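
A note on the last two code cells of the notebook: the per-emotion columns are built by hard-coding the position of each label in the logits (e.g. anger at index 3, fear at index 6). A minimal alternative sketch, not part of the committed notebook and assuming only the model id and example texts already shown in the diff, derives the column names and their order directly from `model.config.id2label`, so the mapping cannot drift if the label order ever changes:

```python
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "vamossyd/emtract-distilbert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

texts = ["i love this", "i do not love you", "to the moon 🚀"]

# Tokenize and run a plain forward pass (no Trainer needed for inference)
inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Softmax over the label dimension; same normalization as the notebook's np.exp step
probs = torch.softmax(logits, dim=-1).numpy()

# One probability column per label, named from the model config rather than fixed positions
columns = [model.config.id2label[i] for i in range(probs.shape[1])]
df = pd.DataFrame(probs, columns=columns)
df.insert(0, "text", texts)
df.insert(1, "pred_label", [columns[i] for i in probs.argmax(-1)])
print(df.head())
```

This keeps the output identical in spirit to the notebook's final DataFrame (text, predicted label, one score per emotion) while letting the model config, not the script, define which column is which.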