vamossyd
/

emtract-distilbert-base-uncased-emotion

@@ -1,230 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "750fed8c",
-   "metadata": {},
-   "source": [
-    "Setup (required): run the following cell first to clone the EmTract repository and install its requirements:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "ccad76ec",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Clone the EmTract repository, move into it, and install its requirements.\n",
-    "!git clone https://github.com/dvamossy/EmTract.git\n",
-    "%cd EmTract\n",
-    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
-    "%pip install -r requirements.txt"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2551adee",
-   "metadata": {},
-   "source": [
-    "Text cleaning: use `clean_text` on raw (unprocessed) text before classifying it"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "687995ef",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "D:\\Research\\FinancialMarkets\\Emotions\\Emtract\\Training\\EmTract\\emtract\\processors\\cleaning.py:68: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
-      "  symspell_list = pd.read_csv(\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'soo well'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from emtract.processors.cleaning import clean_text\n",
-    "# Demo: run the cleaner on one sample string (word segmentation switched off)\n",
-    "sample_text = \"soooooo well\"\n",
-    "clean_text(sample_text, segment_words=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6b81c0cd",
-   "metadata": {},
-   "source": [
-    "Option I: classify text with the `transformers` `pipeline` API"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0ca68eb1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from transformers import pipeline\n",
-    "\n",
-    "# Build a text-classification pipeline backed by the EmTract DistilBERT checkpoint.\n",
-    "# top_k=None requests the score of every label; it replaces the deprecated\n",
-    "# return_all_scores=True flag.\n",
-    "# NOTE(review): the nesting/order of the returned list may differ from the legacy flag -\n",
-    "# confirm against the installed transformers version.\n",
-    "classifier = pipeline(\"text-classification\", model=\"vamossyd/emtract-distilbert-base-uncased-emotion\", top_k=None)\n",
-    "classifier(\"i love this!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0b9cd58f",
-   "metadata": {},
-   "source": [
-    "Option II: batch prediction with `AutoTokenizer`, `AutoModelForSequenceClassification` and `Trainer`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "524cb5d6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer\n",
-    "\n",
-    "# Thin dataset wrapper exposing tokenizer output through len() and integer indexing\n",
-    "class SimpleDataset:\n",
-    "    \"\"\"Index-addressable view over a mapping of tokenized fields.\n",
-    "\n",
-    "    `tokenized_texts` must offer an \"input_ids\" entry (used for the length)\n",
-    "    and an `.items()` method yielding (field name, indexable values) pairs.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    def __init__(self, tokenized_texts):\n",
-    "        # Stored as-is; nothing is copied or converted.\n",
-    "        self.tokenized_texts = tokenized_texts\n",
-    "\n",
-    "    def __len__(self):\n",
-    "        # The number of examples is the number of input-id rows.\n",
-    "        input_ids = self.tokenized_texts[\"input_ids\"]\n",
-    "        return len(input_ids)\n",
-    "\n",
-    "    def __getitem__(self, idx):\n",
-    "        # Collect the idx-th entry of every field into a fresh dict.\n",
-    "        example = {}\n",
-    "        for field, values in self.tokenized_texts.items():\n",
-    "            example[field] = values[idx]\n",
-    "        return example"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1f9f01f4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Location of the input data; replace the placeholder with a real path before loading.\n",
-    "input_path = \"PROVIDE_PATH_TO_DATA\"\n",
-    "# Uncomment to load the data (assumes a CSV file with a `text` column):\n",
-    "# data = pd.read_csv(input_path)\n",
-    "\n",
-    "# If the text is already cleaned:\n",
-    "# texts = data.text.tolist()\n",
-    "\n",
-    "# Otherwise, clean it first with clean_text:\n",
-    "# texts = data['text'].apply(clean_text).tolist()\n",
-    "\n",
-    "# As an example, a small hard-coded list of texts:\n",
-    "texts = ['i love this', 'i do not love you', 'to the moon 🚀']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "839cd230",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load the tokenizer and the classification model from the same checkpoint name,\n",
-    "# then wrap the model in a Trainer (no other Trainer arguments are supplied).\n",
-    "model_name = \"vamossyd/emtract-distilbert-base-uncased-emotion\"\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
-    "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
-    "trainer = Trainer(model=model)\n",
-    "\n",
-    "# Tokenize all texts in one call (truncation and padding enabled), wrap the\n",
-    "# result in SimpleDataset, and run prediction over it.\n",
-    "tokenized_texts = tokenizer(texts, truncation=True, padding=True)\n",
-    "pred_dataset = SimpleDataset(tokenized_texts)\n",
-    "# The result's `.predictions` field is consumed by the next cell.\n",
-    "predictions = trainer.predict(pred_dataset)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3d903549",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Convert the raw model outputs into per-class probabilities with a softmax.\n",
-    "# Subtracting the row-wise maximum first keeps np.exp from overflowing on large\n",
-    "# values and does not change the result.\n",
-    "logits = predictions.predictions\n",
-    "shifted = logits - logits.max(axis=-1, keepdims=True)\n",
-    "exp_shifted = np.exp(shifted)\n",
-    "probs = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)\n",
-    "\n",
-    "# Most likely class per text, translated to its label name.\n",
-    "preds = logits.argmax(-1)\n",
-    "labels = pd.Series(preds).map(model.config.id2label)\n",
-    "\n",
-    "# Output column -> class index in the model output (same indices and column order as before).\n",
-    "# NOTE(review): presumably mirrors model.config.id2label - confirm against the model config.\n",
-    "score_columns = {\n",
-    "    'anger': 3,\n",
-    "    'disgust': 4,\n",
-    "    'fear': 6,\n",
-    "    'happy': 1,\n",
-    "    'neutral': 0,\n",
-    "    'sad': 2,\n",
-    "    'surprise': 5,\n",
-    "}\n",
-    "\n",
-    "# One row per entry of `texts`: the text, its predicted label, then one score column per emotion.\n",
-    "df = pd.DataFrame({'text': texts, 'pred_label': labels})\n",
-    "for column, class_idx in score_columns.items():\n",
-    "    df[column] = probs[:, class_idx]\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "577f10b8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Save the results to CSV\n",
-    "output_path = \"YOUR_FILENAME_EMOTIONS.csv\"  # replace with the name of your output file\n",
-    "# Uncomment to write the file:\n",
-    "# df.to_csv(output_path)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}