import gradio as gr
import pandas as pd
import os
import shutil
import subprocess
import sys

# Install the spaCy model needed by the processing script
os.system("python -m spacy download en_core_web_sm")

def process_tweets(files, reset_processing=False):
    # Save uploaded files
    file_paths = []
    for file in files:
        if file.name.endswith('.csv'):
            # Ensure directory exists
            os.makedirs("projects_twitter_post", exist_ok=True)
            # Save file to the directory
            dest_path = f"projects_twitter_post/{os.path.basename(file.name)}"
            shutil.copy(file.name, dest_path)
            file_paths.append(dest_path)

    if not file_paths:
        # Return values for both outputs (log textbox and files component)
        return "No CSV files uploaded. Please upload CSV files containing tweet data.", []

    # Run the processing script
    cmd = [sys.executable, "process_tweet_huggingface.py"]
    if reset_processing:
        cmd.append("--reset")
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Check if output files were created
    output_files = []
    for file_path in file_paths:
        base_name = os.path.basename(file_path).replace('.csv', '')
        processed_path = f"projects_twitter_post/{base_name}_processed.csv"
        analysis_path = f"projects_twitter_post/{base_name}_analysis.csv"
        if os.path.exists(processed_path):
            output_files.append(processed_path)
        if os.path.exists(analysis_path):
            output_files.append(analysis_path)

    return_files = [f for f in output_files if os.path.exists(f)]
    log_output = result.stdout + "\n" + result.stderr
    return log_output, return_files
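# Note (from how the processing script reads rows): each uploaded CSV is expected to
# contain at least a 'text' column with the tweet body; an 'id' column is used when present.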
with gr.Blocks() as demo:
    gr.Markdown("# Crypto Tweet Processor")
    gr.Markdown("Upload CSV files containing tweet data to process")

    with gr.Row():
        files_input = gr.File(file_count="multiple", label="Upload CSV Files")
        reset_checkbox = gr.Checkbox(label="Reset Processing", value=False)

    process_btn = gr.Button("Process Tweets")
    output_text = gr.Textbox(label="Processing Log")
    output_files = gr.File(label="Processed Files", file_count="multiple")

    process_btn.click(
        process_tweets,
        inputs=[files_input, reset_checkbox],
        outputs=[output_text, output_files]
    )
# The modified processing script is embedded below as a single string and written to
# disk so the subprocess call in process_tweets() can run it. It is wrapped in
# single-quoted triple quotes because the script itself contains """ docstrings.
PROCESSING_SCRIPT = r'''
import os
import re
import json
import numpy as np
import pandas as pd
import torch
import math
import gc
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import spacy
# ==============================================
# COLAB SETUP - Run these cells first in Colab
# ==============================================

# Uncomment and run this cell to mount your Google Drive
"""
from google.colab import drive
drive.mount('/content/drive')
"""

# Uncomment and run this cell to install required packages
"""
!pip install pandas tqdm transformers spacy
!python -m spacy download en_core_web_sm
"""

# Uncomment and run this cell to verify GPU availability
"""
import torch
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
"""
# ==============================================
# Constants - Update these paths for your setup
# ==============================================
# Update this to your Google Drive path
DRIVE_PATH = "./projects_twitter_post"
OUTPUT_FOLDER = f"{DRIVE_PATH}"
CHECKPOINT_FILE = f"{OUTPUT_FOLDER}/processing_checkpoint.json"
BATCH_SIZE = 500  # Reduced batch size for GPU memory management

# Create output folder if it doesn't exist
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
# ==============================================
# Model Initialization with GPU Acceleration
# ==============================================
print("Loading RoBERTa model...")
model_name = "roberta-base"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize with GPU acceleration
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
nlp_pipeline = pipeline("fill-mask", model=model_name, device=0 if torch.cuda.is_available() else -1)

# Initialize sentiment analysis pipeline
print("Loading sentiment analysis model...")
try:
    # Use a Twitter-specific sentiment model for better results on social media text
    sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment"
    sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, device=0 if torch.cuda.is_available() else -1)
    SENTIMENT_AVAILABLE = True
except Exception as e:
    print(f"Error loading sentiment model: {e}")
    # Fall back to the default sentiment model if the Twitter-specific one fails
    try:
        sentiment_pipeline = pipeline("sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
        SENTIMENT_AVAILABLE = True
    except Exception:
        print("Sentiment analysis not available. Continuing without sentiment analysis.")
        SENTIMENT_AVAILABLE = False

# Try to load spaCy for basic text preprocessing
try:
    spacy_nlp = spacy.load("en_core_web_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    print("spaCy not available. Using basic text processing instead.")
# Crypto-specific keywords with hierarchical categories
CRYPTO_TAXONOMY = {
    "COIN": {
        "MAJOR": [
            "bitcoin", "ethereum", "btc", "eth", "bnb", "xrp", "sol", "doge",
            "cardano", "polkadot", "dot", "avalanche", "avax", "solana", "polygon", "matic"
        ],
        "STABLECOIN": [
            "tether", "usdt", "usdc", "busd", "dai", "frax", "tusd", "usdd", "lusd", "gusd", "husd"
        ],
        "ALTCOIN": [
            "litecoin", "ltc", "chainlink", "link", "stellar", "xlm", "dogecoin", "shib",
            "tron", "trx", "cosmos", "atom", "near", "algo", "fantom", "ftm", "monero", "xmr"
        ],
        "DEFI": [
            "uniswap", "uni", "aave", "sushi", "cake", "comp", "maker", "mkr", "curve", "crv",
            "yearn", "yfi", "compound", "balancer", "bal", "synthetix", "snx"
        ],
        "UTILITY": [
            "filecoin", "fil", "the graph", "grt", "arweave", "ar", "chainlink", "link",
            "helium", "hnt", "theta", "icp"
        ],
        "NFT": [
            "enjin", "enj", "decentraland", "mana", "sandbox", "sand", "axie", "axs",
            "gala", "apecoin", "ape", "flow", "ens", "stepn", "gmt"
        ]
    },
    "TECH": {
        "CONCEPTS": [
            "blockchain", "defi", "nft", "dao", "smart contract", "web3", "dapp", "protocol",
            "consensus", "tokenomics", "tokenization"
        ],
        "CHAIN_TYPES": [
            "layer1", "layer2", "rollup", "sidechain", "mainnet", "testnet", "devnet",
            "pow", "pos", "poh", "pbft", "dpos"
        ],
        "PRIVACY": [
            "zk", "zk-rollups", "zero-knowledge", "zkp", "zksnark", "zkstark", "mpc",
            "privacy", "private", "anonymous", "confidential", "encrypted"
        ],
        "SECTORS": [
            "defi", "cefi", "gamefi", "metaverse", "socialfi", "fintech", "realfi",
            "play-to-earn", "move-to-earn", "learn-to-earn", "x-to-earn", "defai", "depin", "desci",
            "refi", "did", "dedata", "dedao", "deid", "deai", "degov", "decloud", "dehealth",
            "decex", "deinsurance", "deworkplace", "public goods", "zk", "ordinals", "soulbound",
            "onchain gaming", "ai agents", "infrastructure", "credentials", "restaking", "modular blockchain",
            "liquid staking", "real world assets", "rwa", "synthetic assets", "account abstraction"
        ]
    },
    "ACTION": {
        "TRADING": [
            "buy", "sell", "long", "short", "margin", "leverage", "trade", "swap",
            "arbitrage", "dca", "ape", "pump", "dump", "moon", "ath", "atl", "breakout",
            "correction", "consolidation", "accumulate", "distribute", "front run", "front runner",
            "front running", "mev", "sandwich attack"
        ],
        "DEFI": [
            "stake", "yield", "farm", "lend", "borrow", "supply", "withdraw", "claim",
            "harvest", "flash loan", "liquidate", "collateralize", "wrap", "unwrap", "bridge",
            "provide liquidity", "withdraw liquidity", "impermanent loss"
        ],
        "GOVERNANCE": [
            "delegate", "vote", "propose", "governance", "dao", "snapshot", "quorum",
            "execution", "timelock", "veto"
        ],
        "NFT": [
            "mint", "airdrop", "whitelist", "burn", "floor price", "rarity", "trait", "pfp",
            "collection", "secondary", "flip"
        ],
        "DEVELOPMENT": [
            "deploy", "audit", "fork", "bootstrap", "initiate", "merge", "split",
            "rebase", "optimize", "gas optimization", "implement", "compile"
        ]
    },
    "PLATFORM": {
        "EXCHANGE": [
            "coinbase", "binance", "kraken", "kucoin", "ftx", "okx", "bybit", "bitfinex",
            "huobi", "gate", "gemini", "bitstamp", "bittrex", "crypto.com", "cex", "dex"
        ],
        "WALLET": [
            "metamask", "phantom", "trust wallet", "ledger", "trezor", "argent", "rainbow",
            "wallet", "hot wallet", "cold storage", "hardware wallet", "seed phrase"
        ],
        "NFT_MARKET": [
            "opensea", "rarible", "foundation", "superrare", "looksrare", "blur", "magic eden",
            "nifty gateway", "zora", "x2y2", "element"
        ],
        "INFRA": [
            "alchemy", "infura", "moralis", "quicknode", "ceramic", "arweave", "ipfs",
            "node", "rpc", "api", "indexer", "subgraph"
        ]
    },
    "NETWORK": {
        "LAYER1": [
            "ethereum", "bitcoin", "solana", "avalanche", "polygon", "bnb chain", "bsc",
            "cardano", "polkadot", "cosmos", "algorand", "tezos", "flow", "near", "tron"
        ],
        "LAYER2": [
            "arbitrum", "optimism", "zksync", "starknet", "base", "polygon", "loopring",
            "immutablex", "metis", "boba", "aztec", "validium", "zkevm"
        ],
        "INTEROPERABILITY": [
            "cosmos", "polkadot", "kusama", "moonbeam", "moonriver", "parachains", "relay chain",
            "ibc", "cross-chain", "bridge"
        ]
    },
    "EVENTS": {
        "MARKET": [
            "bull market", "bear market", "bull run", "bear trap", "bull trap", "halving",
            "capitulation", "golden cross", "death cross", "breakout", "resistance", "support"
        ],
        "SECURITY": [
            "hack", "exploit", "vulnerability", "scam", "phishing", "rug pull", "honeypot",
            "flash crash", "attack", "51% attack", "front running", "sandwich attack", "mev extraction"
        ],
        "TOKEN_EVENTS": [
            "airdrop", "token unlock", "vesting", "ico", "ido", "ito", "ieo", "fair launch",
            "private sale", "seed round", "listing", "delisting"
        ]
    },
    "METRICS": {
        "FINANCIAL": [
            "apy", "apr", "roi", "tvl", "market cap", "mcap", "volume", "liquidity", "supply",
            "circulating supply", "total supply", "max supply", "inflation", "deflation",
            "volatility", "dominance"
        ],
        "TECHNICAL": [
            "gas fee", "gas price", "gas limit", "slippage", "impermanent loss", "yield",
            "hashrate", "difficulty", "tps", "latency", "finality", "block time", "block size",
            "block reward"
        ]
    },
    "COMMUNITY": {
        "ROLES": [
            "whale", "degen", "anon", "influencer", "kol", "thought leader", "ambassador",
            "advocate", "og", "contributor", "dev", "builder", "founder", "investor", "vc",
            "angel", "team", "core team", "front runner", "mev bot", "searcher", "validator",
            "miner", "node operator", "liquidity provider", "market maker", "arbitrageur"
        ],
        "SLANG": [
            "diamond hands", "paper hands", "wagmi", "ngmi", "gm", "gn", "ser", "based",
            "crypto twitter", "ct", "alpha", "dyor", "fomo", "fud", "hodl", "rekt"
        ]
    }
}
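# The taxonomy above is a two-level mapping: main category -> sub-category -> list of
# lower-cased terms. identify_crypto_entities() below walks it and reports each term
# found in a tweet as a (term, main_category, sub_category) tuple, e.g.
# ("eth", "COIN", "MAJOR").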
# ==============================================
# Helper Functions
# ==============================================
def clear_gpu_memory():
    """Clear GPU memory to prevent OOM errors"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

def load_checkpoint():
    """Load processing checkpoint if it exists"""
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    return {'last_processed_index': 0}

def save_checkpoint(index):
    """Save the current processing index to a checkpoint file"""
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'last_processed_index': index}, f)
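# The checkpoint file written above is a small JSON document of the form
# {"last_processed_index": <row index>}, so processing can resume after a crash
# or session timeout instead of starting over.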
def identify_crypto_entities(text: str) -> list:
    """
    Identify crypto-specific entities in text using the hierarchical taxonomy.

    Args:
        text (str): Text to analyze

    Returns:
        list: List of tuples (entity, main_category, sub_category)
    """
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    found_entities = []

    # Search for each entity in the taxonomy
    for main_cat, subcats in CRYPTO_TAXONOMY.items():
        for subcat, terms in subcats.items():
            for term in terms:
                # Avoid matching partial words (ensure word boundaries)
                pattern = r'\b' + re.escape(term) + r'\b'
                if re.search(pattern, text_lower):
                    found_entities.append((term, main_cat, subcat))

    return found_entities
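# Illustrative example (not executed): for the tweet
# "Bridging ETH to arbitrum before the next bull run", the matcher would return
# tuples such as ("eth", "COIN", "MAJOR"), ("arbitrum", "NETWORK", "LAYER2") and
# ("bull run", "EVENTS", "MARKET"). Matching lower-cases the text and requires
# \b word boundaries, so partial-word hits are ignored.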
def clean_text(text: str) -> str:
    """Clean text while preserving mentions and hashtags"""
    if not isinstance(text, str):
        return ""

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove non-alphanumeric characters (except mentions, hashtags, and spaces)
    text = re.sub(r'[^\w\s@#]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()
def process_nlp_text(text: str) -> str:
    """Process text with advanced NLP (lemmatization, stopword removal)"""
    if not isinstance(text, str):
        return ""

    # Basic cleaning
    text = clean_text(text)

    if SPACY_AVAILABLE:
        # Process with spaCy for advanced NLP
        doc = spacy_nlp(text)
        # Lemmatize and remove stopwords
        processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        return " ".join(processed_tokens)
    else:
        # Fall back to basic cleaning if spaCy is not available
        return text

def extract_mentions(text: str) -> list:
    """Extract @mentions from text"""
    if not isinstance(text, str):
        return []
    return re.findall(r'@(\w+)', text)

def extract_hashtags(text: str) -> list:
    """Extract #hashtags from text"""
    if not isinstance(text, str):
        return []
    return re.findall(r'#(\w+)', text)

def extract_urls(text: str) -> list:
    """Extract URLs from text"""
    if not isinstance(text, str):
        return []
    return re.findall(r'(https?://\S+)', text)
def analyze_sentiment(text: str) -> dict:
    """
    Analyze the sentiment of a text using the sentiment analysis pipeline.

    Args:
        text (str): The text to analyze

    Returns:
        dict: A dictionary containing sentiment label and score
    """
    if not SENTIMENT_AVAILABLE or not text.strip():
        return {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}

    try:
        # Pre-process the text to improve sentiment analysis accuracy
        # Limit text length to avoid errors with very long tweets
        truncated_text = text[:512] if len(text) > 512 else text

        # Get sentiment prediction
        sentiment_result = sentiment_pipeline(truncated_text)[0]
        label = sentiment_result['label']
        score = sentiment_result['score']

        # Map to standardized format (positive, negative, neutral)
        sentiment_mapping = {
            'LABEL_0': 'negative',
            'LABEL_1': 'neutral',
            'LABEL_2': 'positive',
            'NEGATIVE': 'negative',
            'NEUTRAL': 'neutral',
            'POSITIVE': 'positive'
        }
        standardized_sentiment = sentiment_mapping.get(label, label.lower())

        # Calculate magnitude (confidence) - useful for filtering high-confidence sentiments
        magnitude = abs(score - 0.5) * 2 if standardized_sentiment != 'neutral' else score

        return {
            "sentiment": standardized_sentiment,
            "sentiment_score": score,
            "sentiment_magnitude": magnitude
        }
    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return {"sentiment": "error", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
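# Note: cardiffnlp/twitter-roberta-base-sentiment emits LABEL_0/LABEL_1/LABEL_2
# (negative/neutral/positive), while the generic fallback pipeline emits
# NEGATIVE/POSITIVE, so both naming schemes are covered by the mapping above.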
def process_with_nlp(text: str) -> dict:
    """
    Process text with NLP to extract named entities, key phrases, etc.

    Args:
        text (str): The text to process

    Returns:
        dict: A dictionary containing NLP processing results
    """
    results = {
        "named_entities": [],
        "pos_tags": [],
        "lemmatized_tokens": [],
        "key_phrases": [],
        "important_nouns": [],
        "sentiment_analysis": {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
    }

    if not text or text.isspace():
        return results

    # First, analyze sentiment
    results["sentiment_analysis"] = analyze_sentiment(text)

    try:
        # Use spaCy for advanced NLP if available
        if SPACY_AVAILABLE:
            doc = spacy_nlp(text)
            # Extract named entities (excluding crypto entities which are handled separately)
            results["named_entities"] = [(ent.text, ent.label_) for ent in doc.ents]
            # Extract POS tags for content words
            results["pos_tags"] = [(token.text, token.pos_) for token in doc
                                   if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"] and not token.is_stop]
            # Get lemmatized tokens (normalized words)
            results["lemmatized_tokens"] = [token.lemma_ for token in doc
                                            if not token.is_stop and not token.is_punct and token.text.strip()]
            # Extract important nouns (potential topics)
            results["important_nouns"] = [token.text for token in doc
                                          if token.pos_ == "NOUN" and not token.is_stop]
            # Try to extract key phrases using noun chunks
            results["key_phrases"] = [chunk.text for chunk in doc.noun_chunks
                                      if len(chunk.text.split()) > 1]

        # If key phrases are empty, use RoBERTa to attempt extraction
        if not results["key_phrases"] and len(text.split()) > 3:
            try:
                # Create a masked sentence from the text
                words = text.split()
                if len(words) > 5:
                    # Get 3 random positions to mask
                    import random
                    positions = sorted(random.sample(range(len(words)), min(3, len(words))))
                    key_terms = []
                    for pos in positions:
                        words_copy = words.copy()
                        words_copy[pos] = tokenizer.mask_token
                        masked_text = " ".join(words_copy)
                        # Get predictions for the masked token
                        predictions = nlp_pipeline(masked_text, top_k=2)
                        for prediction in predictions:
                            key_terms.append(prediction["token_str"].strip())
                    results["key_phrases"].extend(key_terms)
            except Exception as e:
                print(f"Error in key phrase extraction: {e}")

        # Ensure all results are strings for CSV output
        results["named_entities"] = ";".join([f"{ent[0]}:{ent[1]}" for ent in results["named_entities"]])
        results["pos_tags"] = ";".join([f"{tag[0]}:{tag[1]}" for tag in results["pos_tags"]])
        results["lemmatized_tokens"] = ";".join(results["lemmatized_tokens"])
        results["key_phrases"] = ";".join(list(set(results["key_phrases"])))  # Remove duplicates
        results["important_nouns"] = ";".join(list(set(results["important_nouns"])))  # Remove duplicates
    except Exception as e:
        print(f"Error in NLP processing: {e}")

    # Clear GPU memory after processing
    if (results["named_entities"].count(";") > 100) or (len(text) > 1000):
        clear_gpu_memory()

    return results
def process_tweet(text: str) -> tuple:
    """
    Process a tweet to extract mentions, hashtags, URLs, crypto entities, and perform NLP analysis.
    Also performs sentiment analysis.

    Args:
        text (str): The tweet text to process

    Returns:
        tuple: mentions, hashtags, URLs, cleaned text, NLP-processed text,
               NLP results, and sentiment analysis
    """
    if not text or not isinstance(text, str):
        return [], [], [], "", "", {}, {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}

    # Clean the text while preserving mentions and hashtags
    cleaned_text = clean_text(text)

    # Process text with NLP
    processed_text = process_nlp_text(text)

    # Extract mentions, hashtags, and URLs
    mentions = extract_mentions(text)
    hashtags = extract_hashtags(text)
    urls = extract_urls(text)

    # Identify crypto entities
    crypto_entities = identify_crypto_entities(text)

    # Process with NLP models
    nlp_results = process_with_nlp(text)

    # Ensure we have the sentiment analysis results
    sentiment_results = nlp_results.pop("sentiment_analysis", {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0})

    # Add crypto entities to the named entities
    formatted_crypto_entities = [f"{entity}:{main_cat}.{sub_cat}" for entity, main_cat, sub_cat in crypto_entities]

    # named_entities is a semicolon-joined string at this point, so append accordingly
    if isinstance(nlp_results.get("named_entities", ""), str):
        nlp_results["named_entities"] = nlp_results.get("named_entities", "")
        if nlp_results["named_entities"] and formatted_crypto_entities:
            nlp_results["named_entities"] += ";" + ";".join(formatted_crypto_entities)
        elif formatted_crypto_entities:
            nlp_results["named_entities"] = ";".join(formatted_crypto_entities)

    return mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results
def process_batch(df_batch):
    """Process a batch of tweets"""
    processed_data = []

    for idx, row in df_batch.iterrows():
        text = row.get('text', '')

        # Process the tweet
        mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results = process_tweet(text)

        # Create a dictionary with the results
        result = {
            'id': row.get('id', ''),
            'original_text': text,  # Store the original text
            'cleaned_text': cleaned_text,
            'nlp_processed_text': processed_text,
            'extracted_mentions': ';'.join(mentions),
            'extracted_hashtags': ';'.join(hashtags),
            'extracted_urls': ';'.join(urls),
            'named_entities': nlp_results.get('named_entities', ''),
            'pos_tags': nlp_results.get('pos_tags', ''),
            'lemmatized_tokens': nlp_results.get('lemmatized_tokens', ''),
            'key_phrases': nlp_results.get('key_phrases', ''),
            'important_nouns': nlp_results.get('important_nouns', ''),
            'sentiment': sentiment_results.get('sentiment', 'unknown'),
            'sentiment_score': sentiment_results.get('sentiment_score', 0.0),
            'sentiment_magnitude': sentiment_results.get('sentiment_magnitude', 0.0)
        }
        processed_data.append(result)

    return pd.DataFrame(processed_data)
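# Each batch yields one row per tweet, with every list-valued field flattened to a
# ';'-joined string so the result serializes cleanly to CSV.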
# ==============================================
# Main Processing Function
# ==============================================
def main(reset_checkpoint=False, input_file=None):
    """
    Main function to process tweets

    Args:
        reset_checkpoint (bool): Whether to reset the checkpoint and process all data
        input_file (str): Optional specific input file to process, otherwise processes all CSV files
    """
    if reset_checkpoint and os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)
        print("Checkpoint reset. Will process all data from the beginning.")

    # Get list of CSV files to process
    if input_file:
        # Process a specific file
        input_files = [input_file]
    else:
        # Find all CSV files in the OUTPUT_FOLDER
        import glob
        input_files = glob.glob(f"{OUTPUT_FOLDER}/*.csv")
        # Exclude our output files
        input_files = [f for f in input_files if not any(x in f for x in ["_processed.csv", "_analysis.csv"])]

    if not input_files:
        print(f"No input CSV files found in {OUTPUT_FOLDER}")
        return
    print(f"Found {len(input_files)} files to process: {[os.path.basename(f) for f in input_files]}")

    # Process each file
    for input_csv in input_files:
        print(f"\nProcessing file: {os.path.basename(input_csv)}")
        print("Loading dataset...")

        # Check if input file exists
        if not os.path.exists(input_csv):
            print(f"Input file {input_csv} not found. Skipping.")
            continue

        # Load the dataset
        try:
            df = pd.read_csv(input_csv)
            print(f"Loaded dataset with {len(df)} records and {len(df.columns)} columns.")
        except Exception as e:
            print(f"Error loading {input_csv}: {e}")
            continue

        # Load checkpoint if it exists
        checkpoint = load_checkpoint()
        start_idx = checkpoint['last_processed_index']

        # For simplicity, reset checkpoints between files
        start_idx = 0
        save_checkpoint(0)

        print("\nProcessing tweets...")
        print(f"Starting from index {start_idx}")

        # Filter to only unprocessed rows
        df_to_process = df.iloc[start_idx:]
        if len(df_to_process) == 0:
            print("No new data to process in this file.")
            continue

        # Process in batches for memory efficiency
        batch_size = BATCH_SIZE
        num_batches = math.ceil(len(df_to_process) / batch_size)
        print(f"Processing in {num_batches} batches of {batch_size} records each")

        processed_batches = []

        # Create progress bar
        for i in tqdm(range(num_batches)):
            batch_start = i * batch_size
            batch_end = min((i + 1) * batch_size, len(df_to_process))

            # Get current batch
            df_batch = df_to_process.iloc[batch_start:batch_end]

            # Process the batch
            processed_batch = process_batch(df_batch)
            processed_batches.append(processed_batch)

            # Save checkpoint
            save_checkpoint(start_idx + batch_end)

            # Save intermediate results every 5 batches to prevent data loss in case of session timeout
            if i % 5 == 0 and i > 0:
                file_basename = os.path.splitext(os.path.basename(input_csv))[0]
                interim_df = pd.concat(processed_batches, ignore_index=True)
                interim_file = f"{OUTPUT_FOLDER}/{file_basename}_interim_{i}.csv"
                interim_df.to_csv(interim_file, index=False)
                print(f"\nSaved interim results to {interim_file}")

            # Clear memory
            clear_gpu_memory()
        # Combine all batches
        if processed_batches:
            file_basename = os.path.splitext(os.path.basename(input_csv))[0]
            final_df = pd.concat(processed_batches, ignore_index=True)

            # Calculate statistics columns
            final_df["mention_count"] = final_df["extracted_mentions"].str.count(";") + (final_df["extracted_mentions"] != "").astype(int)
            final_df["hashtag_count"] = final_df["extracted_hashtags"].str.count(";") + (final_df["extracted_hashtags"] != "").astype(int)
            final_df["entity_count"] = final_df["named_entities"].str.count(";") + (final_df["named_entities"] != "").astype(int)

            # Save the full processed dataset
            output_file = f"{OUTPUT_FOLDER}/{file_basename}_processed.csv"
            final_df.to_csv(output_file, index=False)
            print(f"Processed data saved to {output_file}")

            # Create a lighter version with just the analysis
            analysis_columns = [
                "id", "original_text", "cleaned_text", "nlp_processed_text",
                "extracted_mentions", "extracted_hashtags", "extracted_urls",
                "named_entities", "key_phrases", "important_nouns",
                "sentiment", "sentiment_score", "sentiment_magnitude",
                "mention_count", "hashtag_count", "entity_count"
            ]
            # Ensure all columns exist before subsetting
            available_columns = [col for col in analysis_columns if col in final_df.columns]
            analysis_df = final_df[available_columns]

            analysis_file = f"{OUTPUT_FOLDER}/{file_basename}_analysis.csv"
            analysis_df.to_csv(analysis_file, index=False)
            print(f"Analysis results saved to {analysis_file}")

            # Print statistics
            print("\nAnalysis completed successfully!")
            print(f"Total records: {len(final_df)}")
            print(f"Tweets with Mentions: {(final_df['extracted_mentions'] != '').sum()}")
            print(f"Tweets with Hashtags: {(final_df['extracted_hashtags'] != '').sum()}")
            print(f"Tweets with Named Entities: {(final_df['named_entities'] != '').sum()}")

            # Print sentiment statistics
            sentiment_counts = final_df['sentiment'].value_counts()
            print("\nSentiment Distribution:")
            for sentiment, count in sentiment_counts.items():
                percentage = (count / len(final_df)) * 100
                print(f" {sentiment}: {count} tweets ({percentage:.1f}%)")

            # Get average sentiment scores
            avg_score = final_df['sentiment_score'].mean()
            avg_magnitude = final_df['sentiment_magnitude'].mean()
            print(f"\nAverage sentiment score: {avg_score:.3f}")
            print(f"Average sentiment magnitude: {avg_magnitude:.3f}")

            # Get top entities by sentiment
            positive_entities = []
            for idx, row in final_df[final_df['sentiment'] == 'positive'].iterrows():
                entities = row['named_entities'].split(';') if isinstance(row['named_entities'], str) and row['named_entities'] else []
                for entity in entities:
                    if entity and ':' in entity:
                        entity_name = entity.split(':')[0]
                        positive_entities.append(entity_name)

            # Get the most common positive entities
            from collections import Counter
            top_positive = Counter(positive_entities).most_common(5)
            if top_positive:
                print("\nTop entities with positive sentiment:")
                for entity, count in top_positive:
                    print(f" {entity}: {count} mentions")

            # Print sample results
            print("\nSample of processing results:")
            for i, row in analysis_df.head(3).iterrows():
                print(f"\nOriginal Text: {row['original_text']}")
                print(f"Cleaned Text: {row['cleaned_text']}")
                print(f"NLP Processed Text: {row['nlp_processed_text']}")
                print(f"Mentions: {row['extracted_mentions']}")
                print(f"Hashtags: {row['extracted_hashtags']}")
                print(f"Named Entities: {row['named_entities']}")
                print(f"Key Phrases: {row['key_phrases']}")
                print(f"Sentiment: {row['sentiment']} (Score: {row['sentiment_score']:.3f}, Magnitude: {row['sentiment_magnitude']:.3f})")
                print("-" * 80)

            # Delete interim files
            import glob
            interim_files = glob.glob(f"{OUTPUT_FOLDER}/{file_basename}_interim_*.csv")
            for f in interim_files:
                try:
                    os.remove(f)
                    print(f"Deleted interim file: {os.path.basename(f)}")
                except OSError:
                    pass

            # Clear memory after processing each file
            clear_gpu_memory()
        else:
            print("No data processed for this file.")

    # Clean up checkpoint file after successful processing
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)

    print("\nAll files processed successfully!")
# ==============================================
# Colab Usage Example
# ==============================================
"""
# EXAMPLE USAGE IN COLAB:

# 1. Install packages and mount drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Process one specific file
input_file = "/content/drive/MyDrive/projects_twitter_post/zilliqa.csv"
main(reset_checkpoint=True, input_file=input_file)

# 3. Process all files
main(reset_checkpoint=True)
"""
if __name__ == "__main__":
    import sys

    # Check if --reset flag is provided
    reset_checkpoint = "--reset" in sys.argv

    # Check if --file flag is provided
    input_file = None
    if "--file" in sys.argv:
        try:
            input_file = sys.argv[sys.argv.index("--file") + 1]
        except IndexError:
            print("Error: --file flag requires a filename argument")
            sys.exit(1)

    # Run the main function
    main(reset_checkpoint=reset_checkpoint, input_file=input_file)
'''

# Write the embedded processing script to disk so the Gradio handler above can run it.
with open("process_tweet_huggingface.py", "w") as f:
    f.write(PROCESSING_SCRIPT)
demo.launch()