File size: 5,571 Bytes
22c9072
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0bac3852",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-12-06 02:00:53.739133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'label': 'LABEL_1', 'score': 0.7463672161102295}]\n"
     ]
    }
   ],
   "source": [
    "!pip install -q transformers torch\n",
    "from transformers import pipeline\n",
    "\n",
    "model_name = \"XerOpred/twitter-climate-sentiment-model\"\n",
    "classifier = pipeline('sentiment-analysis', model=model_name)\n",
    "\n",
    "text = \"some power and authority u can not spell, let alone define and wield, thas just more evidence of ur arrogant IGNORANCE same as u apply to ur climate change denial THEORY as if u know shit u do not, TOLD U DareDevil does not mean what the hell u think it does, HELL?? been there\"\n",
    "result = classifier(text)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "34f09a3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
    "import torch\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('data/combined-usa.csv' )\n",
    "\n",
    "model_name = \"XerOpred/twitter-climate-sentiment-model\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
    "\n",
    "def sentiment_analysis(model, tokenizer, text):\n",
    "    if not isinstance(text, str):\n",
    "        raise ValueError(\"Input text must be a string.\")\n",
    "\n",
    "    # tokenize and get model predictions\n",
    "    inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=512)\n",
    "    with torch.no_grad():\n",
    "        outputs = model(**inputs)\n",
    "\n",
    "    # extract logits and apply softmax to get probabilities\n",
    "    logits = outputs.logits\n",
    "    probabilities = torch.nn.functional.softmax(logits, dim=-1)\n",
    "\n",
    "    # determine label and its confidence\n",
    "    predicted_label_idx = torch.argmax(probabilities, dim=1).item()\n",
    "    confidence = probabilities[0][predicted_label_idx].item()\n",
    "    labels = model.config.id2label\n",
    "    predicted_label = labels[predicted_label_idx]\n",
    "\n",
    "    # sentiment score = positive logit - negative logit\n",
    "    sentiment_score = logits[0][1] - logits[0][0]\n",
    "\n",
    "    return predicted_label, confidence, sentiment_score.item(), logits[0].tolist()\n",
    "\n",
    "def process_in_batches(df, model, tokenizer, batch_size=1000):\n",
    "    batches = [df[i:i + batch_size] for i in range(0, df.shape[0], batch_size)]\n",
    "\n",
    "    results = []\n",
    "    for batch in batches:\n",
    "        batch_results = batch['Content'].apply(\n",
    "            lambda x: pd.Series(sentiment_analysis(model, tokenizer, str(x)))\n",
    "        )\n",
    "        batch_results.index = batch.index\n",
    "        results.append(batch_results)\n",
    "        \n",
    "    return pd.concat(results)\n",
    "\n",
    "# Apply the batch processing function\n",
    "df[['Label', 'Confidence', 'SentimentScore', 'Logits']] = process_in_batches(df, model, tokenizer, batch_size=1000)\n",
    "\n",
    "# Save the DataFrame to a CSV file\n",
    "df.to_csv('data/distilbert-sentiment-usa-FINAL.csv', index=False)\n",
    "\n",
    "# sample_text = \"AnnCoulter Global Warming? Climate Change https://t.co/TYleYPslqu Looks like global warming's the trend Now it's climate change, they changed it again These scientists, they get grants from the gov Theyll say anything, or lose that money they love https://t.co/odBcgDMIfp\"\n",
    "# predicted_label, confidence, sentiment_score, logits = sentiment_analysis(model, tokenizer, sample_text)\n",
    "\n",
    "# print(f\"Label: {predicted_label}\")\n",
    "# print(f\"Confidence: {confidence}\")\n",
    "# print(f\"Sentiment score for the text: {sentiment_score}\")\n",
    "# print(f\"Logits: {logits}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "4703a4cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop duplicates\n",
    "duplicates_df = pd.read_csv('data/distilbert-sentiment-usa-FINAL.csv', lineterminator='\\n', low_memory=False)\n",
    "\n",
    "duplicates_df = duplicates_df.drop_duplicates(subset=['Username', 'Content'], keep='first')\n",
    "duplicates_df.to_csv('data/distilbert-usa.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "408484cf",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}