vamossyd
/

emtract-distilbert-base-uncased-emotion

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "750fed8c",
+   "metadata": {},
+   "source": [
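+    "## Setup\n",
+    "\n",
+    "Run the following cell first: it clones the EmTract repository, changes into it, and installs its requirements."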
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "ccad76ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Clone the EmTract repository and make it the working directory\n",
+    "!git clone https://github.com/dvamossy/EmTract.git\n",
+    "%cd EmTract\n",
+    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
+    "%pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2551adee",
+   "metadata": {},
+   "source": [
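+    "## Text cleaner for unprocessed text\n",
+    "\n",
+    "Use `clean_text` on raw text before prediction; skip it if your text is already cleaned."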
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "687995ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\\emtract\\processors\\cleaning.py:68: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
+      "  symspell_list = pd.read_csv(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'soo well'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from emtract.processors.cleaning import clean_text\n",
+    "\n",
+    "# Demonstrate text cleaning on a raw string, called with segment_words=False\n",
+    "raw_example = \"soooooo well\"\n",
+    "clean_text(raw_example, segment_words=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b81c0cd",
+   "metadata": {},
+   "source": [
+    "## Option I: `transformers` pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ca68eb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import pipeline\n",
+    "\n",
+    "# top_k=None returns the score of every label; it replaces the deprecated return_all_scores=True.\n",
+    "# NOTE(review): requires a transformers release that supports `top_k` for text classification - confirm against requirements.txt.\n",
+    "classifier = pipeline(\"text-classification\", model=\"vamossyd/emtract-distilbert-base-uncased-emotion\", top_k=None)\n",
+    "classifier(\"i love this!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b9cd58f",
+   "metadata": {},
+   "source": [
+    "## Option II: `Trainer.predict` with per-emotion scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "524cb5d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer\n",
+    "\n",
+    "# Dataset wrapper used to hand tokenized texts to the Trainer\n",
+    "class SimpleDataset:\n",
+    "    \"\"\"Index-addressable view over a mapping of tokenizer outputs.\"\"\"\n",
+    "\n",
+    "    def __init__(self, tokenized_texts):\n",
+    "        # Mapping from field name (must include \"input_ids\") to a per-text sequence\n",
+    "        self.tokenized_texts = tokenized_texts\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        \"\"\"Return the number of entries stored under \"input_ids\".\"\"\"\n",
+    "        input_ids = self.tokenized_texts[\"input_ids\"]\n",
+    "        return len(input_ids)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        \"\"\"Return a dict holding element `idx` of every field.\"\"\"\n",
+    "        item = {}\n",
+    "        for field, values in self.tokenized_texts.items():\n",
+    "            item[field] = values[idx]\n",
+    "        return item"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f9f01f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_path = \"PROVIDE_PATH_TO_DATA\"\n",
+    "# Load your own data (assuming a CSV file with a `text` column):\n",
+    "# data = pd.read_csv(input_path)\n",
+    "\n",
+    "# If the text is already cleaned:\n",
+    "# texts = data.text.tolist()\n",
+    "\n",
+    "# Otherwise, clean it first:\n",
+    "# texts = data['text'].apply(clean_text).tolist()\n",
+    "\n",
+    "# Small built-in example:\n",
+    "texts = [\n",
+    "    \"i love this\",\n",
+    "    \"i do not love you\",\n",
+    "    \"to the moon 🚀\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "04ce5528",
+   "metadata": {},
+   "outputs": [],
+   "source": [
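+    "# If the model fails to load from the Hugging Face Hub, clone it locally by uncommenting the git command below,\n",
+    "# then set model_name to \"emtract-distilbert-base-uncased-emotion\" (the cloned folder) when loading the model.\n",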
+    "\n",
+    "#!git clone https://huggingface.co/vamossyd/emtract-distilbert-base-uncased-emotion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "839cd230",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model identifier on the Hugging Face Hub\n",
+    "model_name = \"vamossyd/emtract-distilbert-base-uncased-emotion\"\n",
+    "# model_name = \"emtract-distilbert-base-uncased-emotion\" # in case the model does not load\n",
+    "\n",
+    "# Load the tokenizer and the classification model, then wrap the model in a Trainer\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
+    "trainer = Trainer(model=model)\n",
+    "\n",
+    "# Tokenize the texts, wrap them in a dataset, and run prediction\n",
+    "tokenized_texts = tokenizer(texts, padding=True, truncation=True)\n",
+    "pred_dataset = SimpleDataset(tokenized_texts)\n",
+    "predictions = trainer.predict(test_dataset=pred_dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d903549",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Softmax over the label axis; subtracting the row maximum first avoids overflow in np.exp\n",
+    "logits = predictions.predictions\n",
+    "shifted = logits - logits.max(axis=-1, keepdims=True)\n",
+    "temp = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)\n",
+    "\n",
+    "# Most likely label per text, mapped to its name through the model config\n",
+    "preds = logits.argmax(-1)\n",
+    "labels = pd.Series(preds).map(model.config.id2label)\n",
+    "\n",
+    "# Output column name -> index of that emotion in the model output.\n",
+    "# NOTE(review): these indices are hard-coded; confirm them against model.config.id2label.\n",
+    "emotion_columns = {'anger': 3, 'disgust': 4, 'fear': 6, 'happy': 1, 'neutral': 0, 'sad': 2, 'surprise': 5}\n",
+    "\n",
+    "# One row per entry in `texts`: the text, its predicted label, and the score of each emotion\n",
+    "df = pd.DataFrame({'text': texts, 'pred_label': labels, **{name: temp[:, idx] for name, idx in emotion_columns.items()}})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "577f10b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save results to csv\n",
+    "output_path = \"YOUR_FILENAME_EMOTIONS.csv\"  # name your output file\n",
+    "# df.to_csv(output_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ddd22317",
+   "metadata": {},
+   "source": [
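+    "## Option III: batch prediction\n",
+    "\n",
+    "Use this when the data is too large to tokenize and predict in a single pass. It reuses `tokenizer`, `trainer`, and `SimpleDataset` from Option II, so run those cells first."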
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f39375b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Progress bar for the batch loop below (tqdm is not imported anywhere else in this notebook)\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "# Number of texts tokenized and predicted per iteration\n",
+    "batch_size = 100000\n",
+    "\n",
+    "# Split the texts into consecutive batches\n",
+    "text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]\n",
+    "\n",
+    "# Collect one array of raw model outputs per batch\n",
+    "all_predictions = []\n",
+    "\n",
+    "for batch in tqdm(text_batches):\n",
+    "    # Tokenize the batch and wrap it for the Trainer\n",
+    "    tokenized_texts = tokenizer(batch, truncation=True, padding=True)\n",
+    "    pred_dataset = SimpleDataset(tokenized_texts)\n",
+    "    # Batch-local name, so the `predictions` object created in Option II is not overwritten\n",
+    "    batch_logits = trainer.predict(pred_dataset).predictions\n",
+    "    all_predictions.append(batch_logits)\n",
+    "\n",
+    "# Join the batches along the first axis; keep a 2-D array even when `texts` is empty\n",
+    "if all_predictions:\n",
+    "    all_predictions = np.concatenate(all_predictions, axis=0)\n",
+    "else:\n",
+    "    all_predictions = np.empty((0, model.config.num_labels))\n",
+    "\n",
+    "# Softmax over the label axis; subtracting the row maximum first avoids overflow in np.exp\n",
+    "shifted = all_predictions - all_predictions.max(axis=-1, keepdims=True)\n",
+    "temp = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)\n",
+    "\n",
+    "# Output column name -> index of that emotion in the model output.\n",
+    "# NOTE(review): these indices are hard-coded; confirm them against model.config.id2label.\n",
+    "emotion_columns = {'anger': 3, 'disgust': 4, 'fear': 6, 'happy': 1, 'neutral': 0, 'sad': 2, 'surprise': 5}\n",
+    "\n",
+    "# One row per entry in `texts`: the text and the score of each emotion\n",
+    "df = pd.DataFrame({'text': texts, **{name: temp[:, idx] for name, idx in emotion_columns.items()}})\n",
+    "df.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}