vamossyd committed on
Commit
8d8fbd5
β€’
1 Parent(s): 09fec06

Delete Inference.ipynb

Browse files
Files changed (1) hide show
  1. Inference.ipynb +0 -230
Inference.ipynb DELETED
@@ -1,230 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "750fed8c",
6
- "metadata": {},
7
- "source": [
8
- "Must run the following:"
9
- ]
10
- },
11
- {
12
- "cell_type": "code",
13
- "execution_count": 1,
14
- "id": "ccad76ec",
15
- "metadata": {},
16
- "outputs": [
17
- {
18
- "name": "stdout",
19
- "output_type": "stream",
20
- "text": [
21
- "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\n"
22
- ]
23
- }
24
- ],
25
- "source": [
26
- "!git clone https://github.com/dvamossy/EmTract.git\n",
27
- "%cd EmTract\n",
28
- "!pip install -r requirements.txt "
29
- ]
30
- },
31
- {
32
- "cell_type": "markdown",
33
- "id": "2551adee",
34
- "metadata": {},
35
- "source": [
36
- "Text Cleaner for unprocessed text"
37
- ]
38
- },
39
- {
40
- "cell_type": "code",
41
- "execution_count": 2,
42
- "id": "687995ef",
43
- "metadata": {},
44
- "outputs": [
45
- {
46
- "name": "stderr",
47
- "output_type": "stream",
48
- "text": [
49
- "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\\emtract\\processors\\cleaning.py:68: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
50
- " symspell_list = pd.read_csv(\n"
51
- ]
52
- },
53
- {
54
- "data": {
55
- "text/plain": [
56
- "'soo well'"
57
- ]
58
- },
59
- "execution_count": 2,
60
- "metadata": {},
61
- "output_type": "execute_result"
62
- }
63
- ],
64
- "source": [
65
- "from emtract.processors.cleaning import clean_text\n",
66
- "# Illustrate text cleaning\n",
67
- "clean_text(\"soooooo well\", segment_words=False)"
68
- ]
69
- },
70
- {
71
- "cell_type": "markdown",
72
- "id": "6b81c0cd",
73
- "metadata": {},
74
- "source": [
75
- "Option I"
76
- ]
77
- },
78
- {
79
- "cell_type": "code",
80
- "execution_count": null,
81
- "id": "0ca68eb1",
82
- "metadata": {},
83
- "outputs": [],
84
- "source": [
85
- "from transformers import pipeline\n",
86
- "classifier = pipeline(\"text-classification\", model=\"vamossyd/emtract-distilbert-base-uncased-emotion\", return_all_scores=True)\n",
87
- "classifier(\"i love this!\")"
88
- ]
89
- },
90
- {
91
- "cell_type": "markdown",
92
- "id": "0b9cd58f",
93
- "metadata": {},
94
- "source": [
95
- "Option II"
96
- ]
97
- },
98
- {
99
- "cell_type": "code",
100
- "execution_count": null,
101
- "id": "524cb5d6",
102
- "metadata": {},
103
- "outputs": [],
104
- "source": [
105
- "import torch\n",
106
- "import pandas as pd\n",
107
- "import numpy as np\n",
108
- "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer\n",
109
- "\n",
110
- "# Create class for data preparation\n",
111
- "class SimpleDataset:\n",
112
- " def __init__(self, tokenized_texts):\n",
113
- " self.tokenized_texts = tokenized_texts\n",
114
- " \n",
115
- " def __len__(self):\n",
116
- " return len(self.tokenized_texts[\"input_ids\"])\n",
117
- " \n",
118
- " def __getitem__(self, idx):\n",
119
- " return {k: v[idx] for k, v in self.tokenized_texts.items()}"
120
- ]
121
- },
122
- {
123
- "cell_type": "code",
124
- "execution_count": null,
125
- "id": "1f9f01f4",
126
- "metadata": {},
127
- "outputs": [],
128
- "source": [
129
- "input_path = \"PROVIDE_PATH_TO_DATA\"\n",
130
- "# data = pd.read_csv(input_path) # ASSUMING DATA IS IN CSV\n",
131
- "\n",
132
- "# If text is already cleaned:\n",
133
- "# texts = data.text.tolist() \n",
134
- "\n",
135
- "# Otherwise:\n",
136
- "# texts = data['text'].apply(clean_text).tolist() # \n",
137
- "\n",
138
- "# As an example:\n",
139
- "texts = ['i love this', 'i do not love you', 'to the moon πŸš€']"
140
- ]
141
- },
142
- {
143
- "cell_type": "code",
144
- "execution_count": null,
145
- "id": "839cd230",
146
- "metadata": {},
147
- "outputs": [],
148
- "source": [
149
- "# load tokenizer and model, create trainer\n",
150
- "model_name = \"vamossyd/emtract-distilbert-base-uncased-emotion\"\n",
151
- "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
152
- "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
153
- "trainer = Trainer(model=model)\n",
154
- "\n",
155
- "# Tokenize texts and create prediction data set\n",
156
- "tokenized_texts = tokenizer(texts, truncation=True, padding=True)\n",
157
- "pred_dataset = SimpleDataset(tokenized_texts)\n",
158
- "predictions = trainer.predict(pred_dataset)"
159
- ]
160
- },
161
- {
162
- "cell_type": "code",
163
- "execution_count": null,
164
- "id": "3d903549",
165
- "metadata": {},
166
- "outputs": [],
167
- "source": [
168
- "# scores raw\n",
169
- "temp = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True))\n",
170
- "preds = predictions.predictions.argmax(-1)\n",
171
- "labels = pd.Series(preds).map(model.config.id2label)\n",
172
- "\n",
173
- "# container\n",
174
- "anger = []\n",
175
- "disgust = []\n",
176
- "fear = []\n",
177
- "happy = []\n",
178
- "neutral = []\n",
179
- "sadness = []\n",
180
- "surprise = []\n",
181
- "\n",
182
- "# extract scores (as many entries as exist in pred_texts)\n",
183
- "for i in range(len(texts)):\n",
184
- " anger.append(temp[i][3])\n",
185
- " disgust.append(temp[i][4])\n",
186
- " fear.append(temp[i][6])\n",
187
- " happy.append(temp[i][1])\n",
188
- " neutral.append(temp[i][0])\n",
189
- " sadness.append(temp[i][2])\n",
190
- " surprise.append(temp[i][5])\n",
191
- " \n",
192
- "df = pd.DataFrame(list(zip(texts, labels, anger, disgust, fear, happy, neutral, sadness, surprise)), columns=['text','pred_label', 'anger', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'])\n",
193
- "df.head()"
194
- ]
195
- },
196
- {
197
- "cell_type": "code",
198
- "execution_count": null,
199
- "id": "577f10b8",
200
- "metadata": {},
201
- "outputs": [],
202
- "source": [
203
- "# save results to csv\n",
204
- "output_path = \"YOUR_FILENAME_EMOTIONS.csv\" # name your output file\n",
205
- "# df.to_csv(YOUR_FILENAME)"
206
- ]
207
- }
208
- ],
209
- "metadata": {
210
- "kernelspec": {
211
- "display_name": "Python 3 (ipykernel)",
212
- "language": "python",
213
- "name": "python3"
214
- },
215
- "language_info": {
216
- "codemirror_mode": {
217
- "name": "ipython",
218
- "version": 3
219
- },
220
- "file_extension": ".py",
221
- "mimetype": "text/x-python",
222
- "name": "python",
223
- "nbconvert_exporter": "python",
224
- "pygments_lexer": "ipython3",
225
- "version": "3.10.9"
226
- }
227
- },
228
- "nbformat": 4,
229
- "nbformat_minor": 5
230
- }