calibr234 committed on
Commit c71268c · verified · 1 Parent(s): 9a7f2e8

Update app.py

Files changed (1)
  1. app.py +884 -5
app.py CHANGED
@@ -1,7 +1,886 @@
- import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import gradio as gr
+ import pandas as pd
+ import os
+ import shutil
+ import subprocess
+ import sys
+
+ # Install spaCy model
+ os.system("python -m spacy download en_core_web_sm")
+
+ def process_tweets(files, reset_processing=False):
+     # Save uploaded files
+     file_paths = []
+     for file in files:
+         if file.name.endswith('.csv'):
+             # Ensure directory exists
+             os.makedirs("projects_twitter_post", exist_ok=True)
+
+             # Save file to the directory
+             dest_path = f"projects_twitter_post/{os.path.basename(file.name)}"
+             shutil.copy(file.name, dest_path)
+             file_paths.append(dest_path)
+
+     if not file_paths:
+         # Return an empty file list as well, since the click handler expects two outputs
+         return "No CSV files uploaded. Please upload CSV files containing tweet data.", []
+
+     # Run the processing script
+     reset_flag = "--reset" if reset_processing else ""
+     result = subprocess.run(
+         f"python process_tweet_huggingface.py {reset_flag}",
+         shell=True,
+         capture_output=True,
+         text=True
+     )
+
+     # Check if output files were created
+     output_files = []
+     for file_path in file_paths:
+         base_name = os.path.basename(file_path).replace('.csv', '')
+         processed_path = f"projects_twitter_post/{base_name}_processed.csv"
+         analysis_path = f"projects_twitter_post/{base_name}_analysis.csv"
+
+         if os.path.exists(processed_path):
+             output_files.append(processed_path)
+         if os.path.exists(analysis_path):
+             output_files.append(analysis_path)
+
+     return_files = [f for f in output_files if os.path.exists(f)]
+
+     log_output = result.stdout + "\n" + result.stderr
+
+     return log_output, return_files
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Crypto Tweet Processor")
+     gr.Markdown("Upload CSV files containing tweet data to process")
+
+     with gr.Row():
+         files_input = gr.File(file_count="multiple", label="Upload CSV Files")
+         reset_checkbox = gr.Checkbox(label="Reset Processing", value=False)
+
+     process_btn = gr.Button("Process Tweets")
+
+     output_text = gr.Textbox(label="Processing Log")
+     output_files = gr.File(label="Processed Files", file_count="multiple")
+
+     process_btn.click(
+         process_tweets,
+         inputs=[files_input, reset_checkbox],
+         outputs=[output_text, output_files]
+     )
+
+ # Add the modified processing script code here (written as a raw string so regex
+ # escapes such as \b and literal "\n" sequences inside the script are preserved verbatim)
+ with open("process_tweet_huggingface.py", "w") as f:
+     f.write(r'''#!/usr/bin/env python3
+ """
+ Tweet Processing Script for Google Colab - Enhanced with NLP and Sentiment Analysis
+ This version is optimized for Google Colab with GPU acceleration and Google Drive integration.
+ """
+
+ import os
+ import re
+ import json
+ import pandas as pd
+ import numpy as np
+ import torch
+ import math
+ import gc
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
+ import spacy
+
+ # ==============================================
+ # COLAB SETUP - Run these cells first in Colab
+ # ==============================================
+
+ # Uncomment and run this cell to mount your Google Drive
+ """
+ from google.colab import drive
+ drive.mount('/content/drive')
+ """
+
+ # Uncomment and run this cell to install required packages
+ """
+ !pip install pandas tqdm transformers spacy
+ !python -m spacy download en_core_web_sm
+ """
+
+ # Uncomment and run this cell to verify GPU availability
+ """
+ import torch
+ print(f"GPU available: {torch.cuda.is_available()}")
+ print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
+ """
+
+ # ==============================================
+ # Constants - Update these paths for your setup
+ # ==============================================
+
+ # Update this to your Google Drive path
+ DRIVE_PATH = "./projects_twitter_post"
+ OUTPUT_FOLDER = f"{DRIVE_PATH}"
+ CHECKPOINT_FILE = f"{OUTPUT_FOLDER}/processing_checkpoint.json"
+ BATCH_SIZE = 500  # Reduced batch size for GPU memory management
+
+ # Create output folder if it doesn't exist
+ if not os.path.exists(OUTPUT_FOLDER):
+     os.makedirs(OUTPUT_FOLDER)
+
+ # ==============================================
+ # Model Initialization with GPU Acceleration
+ # ==============================================
+
+ print("Loading RoBERTa model...")
+ model_name = "roberta-base"
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # Initialize with GPU acceleration
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
+ nlp_pipeline = pipeline("fill-mask", model=model_name, device=0 if torch.cuda.is_available() else -1)
+
+ # Initialize sentiment analysis pipeline
+ print("Loading sentiment analysis model...")
+ try:
+     # Using a Twitter-specific sentiment model for better results on social media text
+     sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment"
+     sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, device=0 if torch.cuda.is_available() else -1)
+     SENTIMENT_AVAILABLE = True
+ except Exception as e:
+     print(f"Error loading sentiment model: {e}")
+     # Fallback to a simpler sentiment model if the Twitter-specific one fails
+     try:
+         sentiment_pipeline = pipeline("sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
+         SENTIMENT_AVAILABLE = True
+     except:
+         print("Sentiment analysis not available. Continuing without sentiment analysis.")
+         SENTIMENT_AVAILABLE = False
+
+ # Try to load spaCy for basic text preprocessing
+ try:
+     import spacy
+     spacy_nlp = spacy.load("en_core_web_sm")
+     SPACY_AVAILABLE = True
+ except:
+     SPACY_AVAILABLE = False
+     print("SpaCy not available. Using basic text processing instead.")
+
+ # Crypto-specific keywords with hierarchical categories
+ CRYPTO_TAXONOMY = {
+     "COIN": {
+         "MAJOR": [
+             "bitcoin", "ethereum", "btc", "eth", "bnb", "xrp", "sol", "doge",
+             "cardano", "polkadot", "dot", "avalanche", "avax", "solana", "polygon", "matic"
+         ],
+         "STABLECOIN": [
+             "tether", "usdt", "usdc", "busd", "dai", "frax", "tusd", "usdd", "lusd", "gusd", "husd"
+         ],
+         "ALTCOIN": [
+             "litecoin", "ltc", "chainlink", "link", "stellar", "xlm", "dogecoin", "shib",
+             "tron", "trx", "cosmos", "atom", "near", "algo", "fantom", "ftm", "monero", "xmr"
+         ],
+         "DEFI": [
+             "uniswap", "uni", "aave", "sushi", "cake", "comp", "maker", "mkr", "curve", "crv",
+             "yearn", "yfi", "compound", "balancer", "bal", "synthetix", "snx"
+         ],
+         "UTILITY": [
+             "filecoin", "fil", "the graph", "grt", "arweave", "ar", "chainlink", "link",
+             "helium", "hnt", "theta", "icp"
+         ],
+         "NFT": [
+             "enjin", "enj", "decentraland", "mana", "sandbox", "sand", "axie", "axs",
+             "gala", "apecoin", "ape", "flow", "ens", "stepn", "gmt"
+         ]
+     },
+
+     "TECH": {
+         "CONCEPTS": [
+             "blockchain", "defi", "nft", "dao", "smart contract", "web3", "dapp", "protocol",
+             "consensus", "tokenomics", "tokenization"
+         ],
+         "CHAIN_TYPES": [
+             "layer1", "layer2", "rollup", "sidechain", "mainnet", "testnet", "devnet",
+             "pow", "pos", "poh", "pbft", "dpos"
+         ],
+         "PRIVACY": [
+             "zk", "zk-rollups", "zero-knowledge", "zkp", "zksnark", "zkstark", "mpc",
+             "privacy", "private", "anonymous", "confidential", "encrypted"
+         ],
+         "SECTORS": [
+             "defi", "cefi", "gamefi", "metaverse", "socialfi", "fintech", "realfi",
+             "play-to-earn", "move-to-earn", "learn-to-earn", "x-to-earn", "defai", "depin", "desci",
+             "refi", "did", "dedata", "dedao", "deid", "deai", "degov", "decloud", "dehealth",
+             "decex", "deinsurance", "deworkplace", "public goods", "zk", "ordinals", "soulbound",
+             "onchain gaming", "ai agents", "infrastructure", "credentials", "restaking", "modular blockchain",
+             "liquid staking", "real world assets", "rwa", "synthetic assets", "account abstraction"
+         ]
+     },
+
+     "ACTION": {
+         "TRADING": [
+             "buy", "sell", "long", "short", "margin", "leverage", "trade", "swap",
+             "arbitrage", "dca", "ape", "pump", "dump", "moon", "ath", "atl", "breakout",
+             "correction", "consolidation", "accumulate", "distribute", "front run", "front runner",
+             "front running", "mev", "sandwich attack"
+         ],
+         "DEFI": [
+             "stake", "yield", "farm", "lend", "borrow", "supply", "withdraw", "claim",
+             "harvest", "flash loan", "liquidate", "collateralize", "wrap", "unwrap", "bridge",
+             "provide liquidity", "withdraw liquidity", "impermanent loss"
+         ],
+         "GOVERNANCE": [
+             "delegate", "vote", "propose", "governance", "dao", "snapshot", "quorum",
+             "execution", "timelock", "veto"
+         ],
+         "NFT": [
+             "mint", "airdrop", "whitelist", "burn", "floor price", "rarity", "trait", "pfp",
+             "collection", "secondary", "flip"
+         ],
+         "DEVELOPMENT": [
+             "deploy", "audit", "fork", "bootstrap", "initiate", "merge", "split",
+             "rebase", "optimize", "gas optimization", "implement", "compile"
+         ]
+     },
+
+     "PLATFORM": {
+         "EXCHANGE": [
+             "coinbase", "binance", "kraken", "kucoin", "ftx", "okx", "bybit", "bitfinex",
+             "huobi", "gate", "gemini", "bitstamp", "bittrex", "crypto.com", "cex", "dex"
+         ],
+         "WALLET": [
+             "metamask", "phantom", "trust wallet", "ledger", "trezor", "argent", "rainbow",
+             "wallet", "hot wallet", "cold storage", "hardware wallet", "seed phrase"
+         ],
+         "NFT_MARKET": [
+             "opensea", "rarible", "foundation", "superrare", "looksrare", "blur", "magic eden",
+             "nifty gateway", "zora", "x2y2", "element"
+         ],
+         "INFRA": [
+             "alchemy", "infura", "moralis", "quicknode", "ceramic", "arweave", "ipfs",
+             "node", "rpc", "api", "indexer", "subgraph"
+         ]
+     },
+
+     "NETWORK": {
+         "LAYER1": [
+             "ethereum", "bitcoin", "solana", "avalanche", "polygon", "bnb chain", "bsc",
+             "cardano", "polkadot", "cosmos", "algorand", "tezos", "flow", "near", "tron"
+         ],
+         "LAYER2": [
+             "arbitrum", "optimism", "zksync", "starknet", "base", "polygon", "loopring",
+             "immutablex", "metis", "boba", "aztec", "validium", "zkevm"
+         ],
+         "INTEROPERABILITY": [
+             "cosmos", "polkadot", "kusama", "moonbeam", "moonriver", "parachains", "relay chain",
+             "ibc", "cross-chain", "bridge"
+         ]
+     },
+
+     "EVENTS": {
+         "MARKET": [
+             "bull market", "bear market", "bull run", "bear trap", "bull trap", "halving",
+             "capitulation", "golden cross", "death cross", "breakout", "resistance", "support"
+         ],
+         "SECURITY": [
+             "hack", "exploit", "vulnerability", "scam", "phishing", "rug pull", "honeypot",
+             "flash crash", "attack", "51% attack", "front running", "sandwich attack", "mev extraction"
+         ],
+         "TOKEN_EVENTS": [
+             "airdrop", "token unlock", "vesting", "ico", "ido", "ito", "ieo", "fair launch",
+             "private sale", "seed round", "listing", "delisting"
+         ]
+     },
+
+     "METRICS": {
+         "FINANCIAL": [
+             "apy", "apr", "roi", "tvl", "market cap", "mcap", "volume", "liquidity", "supply",
+             "circulating supply", "total supply", "max supply", "inflation", "deflation",
+             "volatility", "dominance"
+         ],
+         "TECHNICAL": [
+             "gas fee", "gas price", "gas limit", "slippage", "impermanent loss", "yield",
+             "hashrate", "difficulty", "tps", "latency", "finality", "block time", "block size",
+             "block reward"
+         ]
+     },
+
+     "COMMUNITY": {
+         "ROLES": [
+             "whale", "degen", "anon", "influencer", "kol", "thought leader", "ambassador",
+             "advocate", "og", "contributor", "dev", "builder", "founder", "investor", "vc",
+             "angel", "team", "core team", "front runner", "mev bot", "searcher", "validator",
+             "miner", "node operator", "liquidity provider", "market maker", "arbitrageur"
+         ],
+         "SLANG": [
+             "diamond hands", "paper hands", "wagmi", "ngmi", "gm", "gn", "ser", "based",
+             "crypto twitter", "ct", "alpha", "dyor", "fomo", "fud", "hodl", "rekt"
+         ]
+     }
+ }
+
+ # ==============================================
+ # Helper Functions
+ # ==============================================
+
+ def clear_gpu_memory():
+     """Clear GPU memory to prevent OOM errors"""
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+     gc.collect()
+
+ def load_checkpoint():
+     """Load processing checkpoint if it exists"""
+     if os.path.exists(CHECKPOINT_FILE):
+         with open(CHECKPOINT_FILE, 'r') as f:
+             return json.load(f)
+     return {'last_processed_index': 0}
+
+ def save_checkpoint(index):
+     """Save the current processing index to a checkpoint file"""
+     with open(CHECKPOINT_FILE, 'w') as f:
+         json.dump({'last_processed_index': index}, f)
+
+ def identify_crypto_entities(text: str) -> list:
+     """
+     Identify crypto-specific entities in text using the hierarchical taxonomy.
+
+     Args:
+         text (str): Text to analyze
+
+     Returns:
+         list: List of tuples (entity, main_category, sub_category)
+     """
+     if not isinstance(text, str):
+         return []
+
+     text_lower = text.lower()
+     found_entities = []
+
+     # Search for each entity in the taxonomy
+     for main_cat, subcats in CRYPTO_TAXONOMY.items():
+         for subcat, terms in subcats.items():
+             for term in terms:
+                 # Avoid matching partial words (ensure word boundaries)
+                 pattern = r'\b' + re.escape(term) + r'\b'
+                 if re.search(pattern, text_lower):
+                     found_entities.append((term, main_cat, subcat))
+
+     return found_entities
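+
+ # Illustrative example (hypothetical tweet text; note that the same term can match
+ # more than one taxonomy branch):
+ #   identify_crypto_entities("bitcoin just hit a new ath")
+ #   -> [("bitcoin", "COIN", "MAJOR"), ("ath", "ACTION", "TRADING"), ("bitcoin", "NETWORK", "LAYER1")]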
+
+ def clean_text(text: str) -> str:
+     """Clean text while preserving mentions and hashtags"""
+     if not isinstance(text, str):
+         return ""
+
+     # Remove URLs
+     text = re.sub(r'http\S+', '', text)
+
+     # Remove non-alphanumeric characters (except mentions, hashtags, and spaces)
+     text = re.sub(r'[^\w\s@#]', ' ', text)
+
+     # Remove extra whitespace
+     text = re.sub(r'\s+', ' ', text).strip()
+
+     return text.lower()
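+
+ # Illustrative example (hypothetical input):
+ #   clean_text("Check https://t.co/xyz $BTC to the moon!!! @alice #bitcoin")
+ #   -> "check btc to the moon @alice #bitcoin"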
+
+ def process_nlp_text(text: str) -> str:
+     """Process text with advanced NLP (lemmatization, stopword removal)"""
+     if not isinstance(text, str):
+         return ""
+
+     # Basic cleaning
+     text = clean_text(text)
+
+     if SPACY_AVAILABLE:
+         # Process with spaCy for advanced NLP
+         doc = spacy_nlp(text)
+
+         # Lemmatize and remove stopwords
+         processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
+
+         return " ".join(processed_tokens)
+     else:
+         # Fallback to basic cleaning if spaCy is not available
+         return text
+
+ def extract_mentions(text: str) -> list:
+     """Extract @mentions from text"""
+     if not isinstance(text, str):
+         return []
+     return re.findall(r'@(\w+)', text)
+
+ def extract_hashtags(text: str) -> list:
+     """Extract #hashtags from text"""
+     if not isinstance(text, str):
+         return []
+     return re.findall(r'#(\w+)', text)
+
+ def extract_urls(text: str) -> list:
+     """Extract URLs from text"""
+     if not isinstance(text, str):
+         return []
+     urls = re.findall(r'(https?://\S+)', text)
+     return urls
+
+ def analyze_sentiment(text: str) -> dict:
+     """
+     Analyze the sentiment of a text using the sentiment analysis pipeline.
+
+     Args:
+         text (str): The text to analyze
+
+     Returns:
+         dict: A dictionary containing sentiment label and score
+     """
+     if not SENTIMENT_AVAILABLE or not text.strip():
+         return {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
+
+     try:
+         # Pre-process the text to improve sentiment analysis accuracy
+         # Limit text length to avoid errors with very long tweets
+         truncated_text = text[:512] if len(text) > 512 else text
+
+         # Get sentiment prediction
+         sentiment_result = sentiment_pipeline(truncated_text)[0]
+         label = sentiment_result['label']
+         score = sentiment_result['score']
+
+         # Map to standardized format (positive, negative, neutral)
+         sentiment_mapping = {
+             'LABEL_0': 'negative',
+             'LABEL_1': 'neutral',
+             'LABEL_2': 'positive',
+             'NEGATIVE': 'negative',
+             'NEUTRAL': 'neutral',
+             'POSITIVE': 'positive'
+         }
+
+         standardized_sentiment = sentiment_mapping.get(label, label.lower())
+
+         # Calculate magnitude (confidence) - useful for filtering high-confidence sentiments
+         magnitude = abs(score - 0.5) * 2 if standardized_sentiment != 'neutral' else score
+
+         return {
+             "sentiment": standardized_sentiment,
+             "sentiment_score": score,
+             "sentiment_magnitude": magnitude
+         }
+     except Exception as e:
+         print(f"Error in sentiment analysis: {e}")
+         return {"sentiment": "error", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
+
+ def process_with_nlp(text: str) -> dict:
+     """
+     Process text with NLP to extract named entities, key phrases, etc.
+
+     Args:
+         text (str): The text to process
+
+     Returns:
+         dict: A dictionary containing NLP processing results
+     """
+     results = {
+         "named_entities": [],
+         "pos_tags": [],
+         "lemmatized_tokens": [],
+         "key_phrases": [],
+         "important_nouns": [],
+         "sentiment_analysis": {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
+     }
+
+     if not text or text.isspace():
+         return results
+
+     # First, analyze sentiment
+     results["sentiment_analysis"] = analyze_sentiment(text)
+
+     try:
+         # Use spaCy for advanced NLP if available
+         if SPACY_AVAILABLE:
+             doc = spacy_nlp(text)
+
+             # Extract named entities (excluding crypto entities which are handled separately)
+             results["named_entities"] = [(ent.text, ent.label_) for ent in doc.ents]
+
+             # Extract POS tags for content words
+             results["pos_tags"] = [(token.text, token.pos_) for token in doc
+                                    if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"] and not token.is_stop]
+
+             # Get lemmatized tokens (normalized words)
+             results["lemmatized_tokens"] = [token.lemma_ for token in doc
+                                             if not token.is_stop and not token.is_punct and token.text.strip()]
+
+             # Extract important nouns (potential topics)
+             results["important_nouns"] = [token.text for token in doc
+                                           if token.pos_ == "NOUN" and not token.is_stop]
+
+             # Try to extract key phrases using noun chunks
+             results["key_phrases"] = [chunk.text for chunk in doc.noun_chunks
+                                       if len(chunk.text.split()) > 1]
+
+             # If key phrases are empty, use RoBERTa to attempt extraction
+             if not results["key_phrases"] and len(text.split()) > 3:
+                 try:
+                     # Create a masked sentence from the text
+                     words = text.split()
+                     if len(words) > 5:
+                         # Get 3 random positions to mask
+                         import random
+                         positions = sorted(random.sample(range(len(words)), min(3, len(words))))
+
+                         # Create masked sentences
+                         key_terms = []
+                         for pos in positions:
+                             words_copy = words.copy()
+                             words_copy[pos] = tokenizer.mask_token
+                             masked_text = " ".join(words_copy)
+
+                             # Get predictions for the masked token
+                             predictions = nlp_pipeline(masked_text, top_k=2)
+                             for prediction in predictions:
+                                 key_terms.append(prediction["token_str"].strip())
+
+                         results["key_phrases"].extend(key_terms)
+                 except Exception as e:
+                     print(f"Error in key phrase extraction: {e}")
+
+         # Ensure all results are strings for CSV output
+         results["named_entities"] = ";".join([f"{ent[0]}:{ent[1]}" for ent in results["named_entities"]])
+         results["pos_tags"] = ";".join([f"{tag[0]}:{tag[1]}" for tag in results["pos_tags"]])
+         results["lemmatized_tokens"] = ";".join(results["lemmatized_tokens"])
+         results["key_phrases"] = ";".join(list(set(results["key_phrases"])))  # Remove duplicates
+         results["important_nouns"] = ";".join(list(set(results["important_nouns"])))  # Remove duplicates
+
+     except Exception as e:
+         print(f"Error in NLP processing: {e}")
+
+     # Clear GPU memory after processing
+     if (results["named_entities"].count(";") > 100) or (len(text) > 1000):
+         clear_gpu_memory()
+
+     return results
+
+ def process_tweet(text: str) -> tuple:
+     """
+     Process a tweet to extract mentions, hashtags, URLs, crypto entities, and perform NLP analysis.
+     Also performs sentiment analysis.
+
+     Args:
+         text (str): The tweet text to process
+
+     Returns:
+         tuple: A tuple containing mentions, hashtags, URLs, NLP results, and sentiment analysis
+     """
+     if not text or not isinstance(text, str):
+         return [], [], [], "", "", {}, {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
+
+     # Clean the text while preserving mentions and hashtags
+     cleaned_text = clean_text(text)
+
+     # Process text with NLP
+     processed_text = process_nlp_text(text)
+
+     # Extract mentions, hashtags, and URLs
+     mentions = extract_mentions(text)
+     hashtags = extract_hashtags(text)
+     urls = extract_urls(text)
+
+     # Identify crypto entities
+     crypto_entities = identify_crypto_entities(text)
+
+     # Process with NLP models
+     nlp_results = process_with_nlp(text)
+
+     # Ensure we have the sentiment analysis results
+     sentiment_results = nlp_results.pop("sentiment_analysis", {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0})
+
+     # Add crypto entities to the named entities
+     formatted_crypto_entities = [f"{entity}:{main_cat}.{sub_cat}" for entity, main_cat, sub_cat in crypto_entities]
+
+     # If named_entities is a string (joined with semicolons), we need to handle differently
+     if isinstance(nlp_results.get("named_entities", ""), str):
+         nlp_results["named_entities"] = nlp_results.get("named_entities", "")
+         if nlp_results["named_entities"] and formatted_crypto_entities:
+             nlp_results["named_entities"] += ";" + ";".join(formatted_crypto_entities)
+         elif formatted_crypto_entities:
+             nlp_results["named_entities"] = ";".join(formatted_crypto_entities)
+
+     return mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results
+
+ def process_batch(df_batch):
+     """Process a batch of tweets"""
+     processed_data = []
+
+     for idx, row in df_batch.iterrows():
+         text = row.get('text', '')
+
+         # Process the tweet
+         mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results = process_tweet(text)
+
+         # Create a dictionary with the results
+         result = {
+             'id': row.get('id', ''),
+             'original_text': text,  # Store the original text
+             'cleaned_text': cleaned_text,
+             'nlp_processed_text': processed_text,
+             'extracted_mentions': ';'.join(mentions),
+             'extracted_hashtags': ';'.join(hashtags),
+             'extracted_urls': ';'.join(urls),
+             'named_entities': nlp_results.get('named_entities', ''),
+             'pos_tags': nlp_results.get('pos_tags', ''),
+             'lemmatized_tokens': nlp_results.get('lemmatized_tokens', ''),
+             'key_phrases': nlp_results.get('key_phrases', ''),
+             'important_nouns': nlp_results.get('important_nouns', ''),
+             'sentiment': sentiment_results.get('sentiment', 'unknown'),
+             'sentiment_score': sentiment_results.get('sentiment_score', 0.0),
+             'sentiment_magnitude': sentiment_results.get('sentiment_magnitude', 0.0)
+         }
+
+         processed_data.append(result)
+
+     return pd.DataFrame(processed_data)
+
+ # ==============================================
+ # Main Processing Function
+ # ==============================================
+
+ def main(reset_checkpoint=False, input_file=None):
+     """
+     Main function to process tweets
+
+     Args:
+         reset_checkpoint (bool): Whether to reset the checkpoint and process all data
+         input_file (str): Optional specific input file to process, otherwise processes all CSV files
+     """
+     if reset_checkpoint and os.path.exists(CHECKPOINT_FILE):
+         os.remove(CHECKPOINT_FILE)
+         print("Checkpoint reset. Will process all data from the beginning.")
+
+     # Get list of CSV files to process
+     if input_file:
+         # Process a specific file
+         input_files = [input_file]
+     else:
+         # Find all CSV files in the OUTPUT_FOLDER
+         import glob
+         input_files = glob.glob(f"{OUTPUT_FOLDER}/*.csv")
+
+         # Exclude our output files
+         input_files = [f for f in input_files if not any(x in f for x in ["_processed.csv", "_analysis.csv"])]
+
+     if not input_files:
+         print(f"No input CSV files found in {OUTPUT_FOLDER}")
+         return
+
+     print(f"Found {len(input_files)} files to process: {[os.path.basename(f) for f in input_files]}")
+
+     # Process each file
+     for input_csv in input_files:
+         print(f"\nProcessing file: {os.path.basename(input_csv)}")
+
+         print("Loading dataset...")
+         # Check if input file exists
+         if not os.path.exists(input_csv):
+             print(f"Input file {input_csv} not found. Skipping.")
+             continue
+
+         # Load the dataset
+         try:
+             df = pd.read_csv(input_csv)
+             print(f"Loaded dataset with {len(df)} records and {len(df.columns)} columns.")
+         except Exception as e:
+             print(f"Error loading {input_csv}: {e}")
+             continue
+
+         # Load checkpoint if it exists
+         checkpoint = load_checkpoint()
+         start_idx = checkpoint['last_processed_index']
+
+         # For simplicity, reset checkpoints between files
+         start_idx = 0
+         save_checkpoint(0)
+
+         print("\nProcessing tweets...")
+         print(f"Starting from index {start_idx}")
+
+         # Filter to only unprocessed rows
+         df_to_process = df.iloc[start_idx:]
+
+         if len(df_to_process) == 0:
+             print("No new data to process in this file.")
+             continue
+
+         # Process in batches for memory efficiency
+         batch_size = BATCH_SIZE
+         num_batches = math.ceil(len(df_to_process) / batch_size)
+         print(f"Processing in {num_batches} batches of {batch_size} records each")
+
+         processed_batches = []
+
+         # Create progress bar
+         for i in tqdm(range(num_batches)):
+             batch_start = i * batch_size
+             batch_end = min((i + 1) * batch_size, len(df_to_process))
+
+             # Get current batch
+             df_batch = df_to_process.iloc[batch_start:batch_end]
+
+             # Process the batch
+             processed_batch = process_batch(df_batch)
+             processed_batches.append(processed_batch)
+
+             # Save checkpoint
+             save_checkpoint(start_idx + batch_end)
+
+             # Save intermediate results every 5 batches to prevent data loss in case of session timeout
+             if i % 5 == 0 and i > 0:
+                 file_basename = os.path.splitext(os.path.basename(input_csv))[0]
+                 interim_df = pd.concat(processed_batches, ignore_index=True)
+                 interim_file = f"{OUTPUT_FOLDER}/{file_basename}_interim_{i}.csv"
+                 interim_df.to_csv(interim_file, index=False)
+                 print(f"\nSaved interim results to {interim_file}")
+
+             # Clear memory
+             clear_gpu_memory()
+
+         # Combine all batches
+         if processed_batches:
+             file_basename = os.path.splitext(os.path.basename(input_csv))[0]
+
+             final_df = pd.concat(processed_batches, ignore_index=True)
+
+             # Calculate statistics columns
+             final_df["mention_count"] = final_df["extracted_mentions"].str.count(";") + (final_df["extracted_mentions"] != "").astype(int)
+             final_df["hashtag_count"] = final_df["extracted_hashtags"].str.count(";") + (final_df["extracted_hashtags"] != "").astype(int)
+             final_df["entity_count"] = final_df["named_entities"].str.count(";") + (final_df["named_entities"] != "").astype(int)
+
+             # Save the full processed dataset
+             output_file = f"{OUTPUT_FOLDER}/{file_basename}_processed.csv"
+             final_df.to_csv(output_file, index=False)
+             print(f"Processed data saved to {output_file}")
+
+             # Create a lighter version with just the analysis
+             analysis_columns = [
+                 "id", "original_text", "cleaned_text", "nlp_processed_text",
+                 "extracted_mentions", "extracted_hashtags", "extracted_urls",
+                 "named_entities", "key_phrases", "important_nouns",
+                 "sentiment", "sentiment_score", "sentiment_magnitude",
+                 "mention_count", "hashtag_count", "entity_count"
+             ]
+
+             # Ensure all columns exist before subsetting
+             available_columns = [col for col in analysis_columns if col in final_df.columns]
+             analysis_df = final_df[available_columns]
+             analysis_file = f"{OUTPUT_FOLDER}/{file_basename}_analysis.csv"
+             analysis_df.to_csv(analysis_file, index=False)
+             print(f"Analysis results saved to {analysis_file}")
+
+             # Print statistics
+             print(f"\nAnalysis completed successfully!")
+             print(f"Total records: {len(final_df)}")
+             print(f"Tweets with Mentions: {(final_df['extracted_mentions'] != '').sum()}")
+             print(f"Tweets with Hashtags: {(final_df['extracted_hashtags'] != '').sum()}")
+             print(f"Tweets with Named Entities: {(final_df['named_entities'] != '').sum()}")
+
+             # Print sentiment statistics
+             sentiment_counts = final_df['sentiment'].value_counts()
+             print("\nSentiment Distribution:")
+             for sentiment, count in sentiment_counts.items():
+                 percentage = (count / len(final_df)) * 100
+                 print(f" {sentiment}: {count} tweets ({percentage:.1f}%)")
+
+             # Get average sentiment scores
+             avg_score = final_df['sentiment_score'].mean()
+             avg_magnitude = final_df['sentiment_magnitude'].mean()
+             print(f"\nAverage sentiment score: {avg_score:.3f}")
+             print(f"Average sentiment magnitude: {avg_magnitude:.3f}")
+
+             # Get top entities by sentiment
+             positive_entities = []
+             for idx, row in final_df[final_df['sentiment'] == 'positive'].iterrows():
+                 entities = row['named_entities'].split(';') if isinstance(row['named_entities'], str) and row['named_entities'] else []
+                 for entity in entities:
+                     if entity and ':' in entity:
+                         entity_name = entity.split(':')[0]
+                         positive_entities.append(entity_name)
+
+             # Get the most common positive entities
+             from collections import Counter
+             top_positive = Counter(positive_entities).most_common(5)
+             if top_positive:
+                 print("\nTop entities with positive sentiment:")
+                 for entity, count in top_positive:
+                     print(f" {entity}: {count} mentions")
+
+             # Print sample results
+             print("\nSample of processing results:")
+             for i, row in analysis_df.head(3).iterrows():
+                 print(f"\nOriginal Text: {row['original_text']}")
+                 print(f"Cleaned Text: {row['cleaned_text']}")
+                 print(f"NLP Processed Text: {row['nlp_processed_text']}")
+                 print(f"Mentions: {row['extracted_mentions']}")
+                 print(f"Hashtags: {row['extracted_hashtags']}")
+                 print(f"Named Entities: {row['named_entities']}")
+                 print(f"Key Phrases: {row['key_phrases']}")
+                 print(f"Sentiment: {row['sentiment']} (Score: {row['sentiment_score']:.3f}, Magnitude: {row['sentiment_magnitude']:.3f})")
+                 print("-" * 80)
+
+             # Delete interim files
+             import glob
+             interim_files = glob.glob(f"{OUTPUT_FOLDER}/{file_basename}_interim_*.csv")
+             for f in interim_files:
+                 try:
+                     os.remove(f)
+                     print(f"Deleted interim file: {os.path.basename(f)}")
+                 except:
+                     pass
+
+             # Clear memory after processing each file
+             clear_gpu_memory()
+         else:
+             print("No data processed for this file.")
+
+     # Clean up checkpoint file after successful processing
+     if os.path.exists(CHECKPOINT_FILE):
+         os.remove(CHECKPOINT_FILE)
+     print("\nAll files processed successfully!")
+
+ # ==============================================
+ # Colab Usage Example
+ # ==============================================
+
+ """
+ # EXAMPLE USAGE IN COLAB:
+
+ # 1. Install packages and mount drive
+ from google.colab import drive
+ drive.mount('/content/drive')
+
+ # 2. Process one specific file
+ input_file = "/content/drive/MyDrive/projects_twitter_post/zilliqa.csv"
+ main(reset_checkpoint=True, input_file=input_file)
+
+ # 3. Process all files
+ main(reset_checkpoint=True)
+ """
+
+ if __name__ == "__main__":
+     import sys
+
+     # Check if --reset flag is provided
+     reset_checkpoint = "--reset" in sys.argv
+
+     # Check if --file flag is provided
+     input_file = None
+     if "--file" in sys.argv:
+         try:
+             input_file = sys.argv[sys.argv.index("--file") + 1]
+         except IndexError:
+             print("Error: --file flag requires a filename argument")
+             sys.exit(1)
+
+     # Run the main function
+     main(reset_checkpoint=reset_checkpoint, input_file=input_file)''')
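+
+ # The generated script can also be run on its own, e.g. (paths are illustrative):
+ #   python process_tweet_huggingface.py --reset
+ #   python process_tweet_huggingface.py --file projects_twitter_post/sample.csv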
+
+ demo.launch()