{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0bac3852",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-12-06 02:00:53.739133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'label': 'LABEL_1', 'score': 0.7463672161102295}]\n"
]
}
],
"source": [
"!pip install -q transformers torch\n",
"from transformers import pipeline\n",
"\n",
"model_name = \"XerOpred/twitter-climate-sentiment-model\"\n",
"classifier = pipeline('sentiment-analysis', model=model_name)\n",
"\n",
"text = \"some power and authority u can not spell, let alone define and wield, thas just more evidence of ur arrogant IGNORANCE same as u apply to ur climate change denial THEORY as if u know shit u do not, TOLD U DareDevil does not mean what the hell u think it does, HELL?? been there\"\n",
"result = classifier(text)\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "34f09a3e",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
"import torch\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv('data/combined-usa.csv' )\n",
"\n",
"model_name = \"XerOpred/twitter-climate-sentiment-model\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
"\n",
"def sentiment_analysis(model, tokenizer, text):\n",
" if not isinstance(text, str):\n",
" raise ValueError(\"Input text must be a string.\")\n",
"\n",
" # tokenize and get model predictions\n",
" inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=512)\n",
" with torch.no_grad():\n",
" outputs = model(**inputs)\n",
"\n",
" # extract logits and apply softmax to get probabilities\n",
" logits = outputs.logits\n",
" probabilities = torch.nn.functional.softmax(logits, dim=-1)\n",
"\n",
" # determine label and its confidence\n",
" predicted_label_idx = torch.argmax(probabilities, dim=1).item()\n",
" confidence = probabilities[0][predicted_label_idx].item()\n",
" labels = model.config.id2label\n",
" predicted_label = labels[predicted_label_idx]\n",
"\n",
" # sentiment score = positive logit - negative logit\n",
" sentiment_score = logits[0][1] - logits[0][0]\n",
"\n",
" return predicted_label, confidence, sentiment_score.item(), logits[0].tolist()\n",
"\n",
"def process_in_batches(df, model, tokenizer, batch_size=1000):\n",
" batches = [df[i:i + batch_size] for i in range(0, df.shape[0], batch_size)]\n",
"\n",
" results = []\n",
" for batch in batches:\n",
" batch_results = batch['Content'].apply(\n",
" lambda x: pd.Series(sentiment_analysis(model, tokenizer, str(x)))\n",
" )\n",
" batch_results.index = batch.index\n",
" results.append(batch_results)\n",
" \n",
" return pd.concat(results)\n",
"\n",
"# Apply the batch processing function\n",
"df[['Label', 'Confidence', 'SentimentScore', 'Logits']] = process_in_batches(df, model, tokenizer, batch_size=1000)\n",
"\n",
"# Save the DataFrame to a CSV file\n",
"df.to_csv('data/distilbert-sentiment-usa-FINAL.csv', index=False)\n",
"\n",
"# sample_text = \"AnnCoulter Global Warming? Climate Change https://t.co/TYleYPslqu Looks like global warming's the trend Now it's climate change, they changed it again These scientists, they get grants from the gov Theyll say anything, or lose that money they love https://t.co/odBcgDMIfp\"\n",
"# predicted_label, confidence, sentiment_score, logits = sentiment_analysis(model, tokenizer, sample_text)\n",
"\n",
"# print(f\"Label: {predicted_label}\")\n",
"# print(f\"Confidence: {confidence}\")\n",
"# print(f\"Sentiment score for the text: {sentiment_score}\")\n",
"# print(f\"Logits: {logits}\")"
]
},
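{
"cell_type": "code",
"execution_count": null,
"id": "7f3c9d2a",
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not part of the original run): print the model's label mappings so the\n",
"# sentiment-score convention above (logit index 1 minus index 0) can be checked against\n",
"# the actual id2label / label2id configuration. Assumes `model` from the previous cell.\n",
"print(model.config.id2label)\n",
"print(model.config.label2id)"
]
},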
{
"cell_type": "code",
"execution_count": 54,
"id": "4703a4cf",
"metadata": {},
"outputs": [],
"source": [
"# drop duplicates\n",
"duplicates_df = pd.read_csv('data/distilbert-sentiment-usa-FINAL.csv', lineterminator='\\n', low_memory=False)\n",
"\n",
"duplicates_df = duplicates_df.drop_duplicates(subset=['Username', 'Content'], keep='first')\n",
"duplicates_df.to_csv('data/distilbert-usa.csv', index=False)"
]
},
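{
"cell_type": "code",
"execution_count": null,
"id": "b2e41c7d",
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not part of the original run): quick look at the predicted label counts and\n",
"# sentiment-score summary after deduplication. Assumes `duplicates_df` from the previous\n",
"# cell, which carries the 'Label' and 'SentimentScore' columns written earlier.\n",
"print(duplicates_df['Label'].value_counts())\n",
"print(duplicates_df['SentimentScore'].describe())"
]
},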
{
"cell_type": "code",
"execution_count": null,
"id": "408484cf",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}