import os
import shutil
import subprocess
import sys

import gradio as gr
import pandas as pd

# Download the spaCy model at startup (e.g. when running as a Hugging Face Space)
os.system("python -m spacy download en_core_web_sm")


def process_tweets(files, reset_processing=False):
    """Copy uploaded CSVs into the working folder, run the processing script,
    and return its log output together with any generated result files."""
    # Save uploaded files
    file_paths = []
    for file in files or []:
        if file.name.endswith('.csv'):
            # Ensure the working directory exists
            os.makedirs("projects_twitter_post", exist_ok=True)
            # Copy the uploaded file into the directory
            dest_path = f"projects_twitter_post/{os.path.basename(file.name)}"
            shutil.copy(file.name, dest_path)
            file_paths.append(dest_path)

    if not file_paths:
        return "No CSV files uploaded. Please upload CSV files containing tweet data.", []

    # Run the processing script
    cmd = [sys.executable, "process_tweet_huggingface.py"]
    if reset_processing:
        cmd.append("--reset")
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Check if output files were created
    output_files = []
    for file_path in file_paths:
        base_name = os.path.basename(file_path).replace('.csv', '')
        processed_path = f"projects_twitter_post/{base_name}_processed.csv"
        analysis_path = f"projects_twitter_post/{base_name}_analysis.csv"
        if os.path.exists(processed_path):
            output_files.append(processed_path)
        if os.path.exists(analysis_path):
            output_files.append(analysis_path)

    return_files = [f for f in output_files if os.path.exists(f)]
    log_output = result.stdout + "\n" + result.stderr
    return log_output, return_files


with gr.Blocks() as demo:
    gr.Markdown("# Crypto Tweet Processor")
    gr.Markdown("Upload CSV files containing tweet data to process")

    with gr.Row():
        files_input = gr.File(file_count="multiple", label="Upload CSV Files")
        reset_checkbox = gr.Checkbox(label="Reset Processing", value=False)

    process_btn = gr.Button("Process Tweets")
    output_text = gr.Textbox(label="Processing Log")
    output_files = gr.File(label="Processed Files", file_count="multiple")

    process_btn.click(
        process_tweets,
        inputs=[files_input, reset_checkbox],
        outputs=[output_text, output_files]
    )

# Write the processing script to disk so the button can run it as a subprocess.
# The script body is a raw string so regex escapes such as \b and \n survive
# verbatim in the generated file.
with open("process_tweet_huggingface.py", "w") as f:
    f.write(r'''import os
import re
import json
import math
import gc

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
# spaCy is imported conditionally below so the script still runs without it

# ==============================================
# COLAB SETUP - Run these cells first in Colab
# ==============================================

# Uncomment and run this cell to mount your Google Drive
"""
from google.colab import drive
drive.mount('/content/drive')
"""

# Uncomment and run this cell to install required packages
"""
!pip install pandas tqdm transformers spacy
!python -m spacy download en_core_web_sm
"""

# Uncomment and run this cell to verify GPU availability
"""
import torch
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
"""

# ==============================================
# Constants - Update these paths for your setup
# ==============================================

# Update this to your Google Drive path when running in Colab
DRIVE_PATH = "./projects_twitter_post"
OUTPUT_FOLDER = DRIVE_PATH
CHECKPOINT_FILE = f"{OUTPUT_FOLDER}/processing_checkpoint.json"
BATCH_SIZE = 500  # Reduced batch size for GPU memory management

# Create output folder if it doesn't exist
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

# ==============================================
# Model Initialization with GPU Acceleration
# ==============================================
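# The fill-mask model loaded below (roberta-base) is reused further down as a
# fallback for key-phrase extraction, alongside a Twitter-tuned sentiment
# classifier. A minimal, illustrative sketch of that fill-mask usage (the
# sentence is made up; the names match the code in this file):
#
#   masked = f"bitcoin is looking {tokenizer.mask_token} today"
#   print(nlp_pipeline(masked, top_k=2))  # top-2 token predictions with scores
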
print("Loading RoBERTa model...")
model_name = "roberta-base"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize with GPU acceleration
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
nlp_pipeline = pipeline("fill-mask", model=model_name,
                        device=0 if torch.cuda.is_available() else -1)

# Initialize sentiment analysis pipeline
print("Loading sentiment analysis model...")
try:
    # Use a Twitter-specific sentiment model for better results on social media text
    sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment"
    sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model,
                                  device=0 if torch.cuda.is_available() else -1)
    SENTIMENT_AVAILABLE = True
except Exception as e:
    print(f"Error loading sentiment model: {e}")
    # Fall back to the default sentiment model if the Twitter-specific one fails
    try:
        sentiment_pipeline = pipeline("sentiment-analysis",
                                      device=0 if torch.cuda.is_available() else -1)
        SENTIMENT_AVAILABLE = True
    except Exception:
        print("Sentiment analysis not available. Continuing without sentiment analysis.")
        SENTIMENT_AVAILABLE = False

# Try to load spaCy for basic text preprocessing
try:
    import spacy
    spacy_nlp = spacy.load("en_core_web_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    print("SpaCy not available. Using basic text processing instead.")

# Crypto-specific keywords with hierarchical categories
CRYPTO_TAXONOMY = {
    "COIN": {
        "MAJOR": [
            "bitcoin", "ethereum", "btc", "eth", "bnb", "xrp", "sol", "doge",
            "cardano", "polkadot", "dot", "avalanche", "avax", "solana",
            "polygon", "matic"
        ],
        "STABLECOIN": [
            "tether", "usdt", "usdc", "busd", "dai", "frax", "tusd", "usdd",
            "lusd", "gusd", "husd"
        ],
        "ALTCOIN": [
            "litecoin", "ltc", "chainlink", "link", "stellar", "xlm",
            "dogecoin", "shib", "tron", "trx", "cosmos", "atom", "near",
            "algo", "fantom", "ftm", "monero", "xmr"
        ],
        "DEFI": [
            "uniswap", "uni", "aave", "sushi", "cake", "comp", "maker", "mkr",
            "curve", "crv", "yearn", "yfi", "compound", "balancer", "bal",
            "synthetix", "snx"
        ],
        "UTILITY": [
            "filecoin", "fil", "the graph", "grt", "arweave", "ar",
            "chainlink", "link", "helium", "hnt", "theta", "icp"
        ],
        "NFT": [
            "enjin", "enj", "decentraland", "mana", "sandbox", "sand", "axie",
            "axs", "gala", "apecoin", "ape", "flow", "ens", "stepn", "gmt"
        ]
    },
    "TECH": {
        "CONCEPTS": [
            "blockchain", "defi", "nft", "dao", "smart contract", "web3",
            "dapp", "protocol", "consensus", "tokenomics", "tokenization"
        ],
        "CHAIN_TYPES": [
            "layer1", "layer2", "rollup", "sidechain", "mainnet", "testnet",
            "devnet", "pow", "pos", "poh", "pbft", "dpos"
        ],
        "PRIVACY": [
            "zk", "zk-rollups", "zero-knowledge", "zkp", "zksnark", "zkstark",
            "mpc", "privacy", "private", "anonymous", "confidential",
            "encrypted"
        ],
        "SECTORS": [
            "defi", "cefi", "gamefi", "metaverse", "socialfi", "fintech",
            "realfi", "play-to-earn", "move-to-earn", "learn-to-earn",
            "x-to-earn", "defai", "depin", "desci", "refi", "did", "dedata",
            "dedao", "deid", "deai", "degov", "decloud", "dehealth", "decex",
            "deinsurance", "deworkplace", "public goods", "zk", "ordinals",
            "soulbound", "onchain gaming", "ai agents", "infrastructure",
            "credentials", "restaking", "modular blockchain", "liquid staking",
            "real world assets", "rwa", "synthetic assets",
            "account abstraction"
        ]
    },
    "ACTION": {
        "TRADING": [
            "buy", "sell", "long", "short", "margin", "leverage", "trade",
            "swap", "arbitrage", "dca", "ape", "pump", "dump", "moon", "ath",
            "atl", "breakout", "correction", "consolidation",
            "accumulate", "distribute", "front run", "front runner",
            "front running", "mev", "sandwich attack"
        ],
        "DEFI": [
            "stake", "yield", "farm", "lend", "borrow", "supply", "withdraw",
            "claim", "harvest", "flash loan", "liquidate", "collateralize",
            "wrap", "unwrap", "bridge", "provide liquidity",
            "withdraw liquidity", "impermanent loss"
        ],
        "GOVERNANCE": [
            "delegate", "vote", "propose", "governance", "dao", "snapshot",
            "quorum", "execution", "timelock", "veto"
        ],
        "NFT": [
            "mint", "airdrop", "whitelist", "burn", "floor price", "rarity",
            "trait", "pfp", "collection", "secondary", "flip"
        ],
        "DEVELOPMENT": [
            "deploy", "audit", "fork", "bootstrap", "initiate", "merge",
            "split", "rebase", "optimize", "gas optimization", "implement",
            "compile"
        ]
    },
    "PLATFORM": {
        "EXCHANGE": [
            "coinbase", "binance", "kraken", "kucoin", "ftx", "okx", "bybit",
            "bitfinex", "huobi", "gate", "gemini", "bitstamp", "bittrex",
            "crypto.com", "cex", "dex"
        ],
        "WALLET": [
            "metamask", "phantom", "trust wallet", "ledger", "trezor",
            "argent", "rainbow", "wallet", "hot wallet", "cold storage",
            "hardware wallet", "seed phrase"
        ],
        "NFT_MARKET": [
            "opensea", "rarible", "foundation", "superrare", "looksrare",
            "blur", "magic eden", "nifty gateway", "zora", "x2y2", "element"
        ],
        "INFRA": [
            "alchemy", "infura", "moralis", "quicknode", "ceramic", "arweave",
            "ipfs", "node", "rpc", "api", "indexer", "subgraph"
        ]
    },
    "NETWORK": {
        "LAYER1": [
            "ethereum", "bitcoin", "solana", "avalanche", "polygon",
            "bnb chain", "bsc", "cardano", "polkadot", "cosmos", "algorand",
            "tezos", "flow", "near", "tron"
        ],
        "LAYER2": [
            "arbitrum", "optimism", "zksync", "starknet", "base", "polygon",
            "loopring", "immutablex", "metis", "boba", "aztec", "validium",
            "zkevm"
        ],
        "INTEROPERABILITY": [
            "cosmos", "polkadot", "kusama", "moonbeam", "moonriver",
            "parachains", "relay chain", "ibc", "cross-chain", "bridge"
        ]
    },
    "EVENTS": {
        "MARKET": [
            "bull market", "bear market", "bull run", "bear trap",
            "bull trap", "halving", "capitulation", "golden cross",
            "death cross", "breakout", "resistance", "support"
        ],
        "SECURITY": [
            "hack", "exploit", "vulnerability", "scam", "phishing",
            "rug pull", "honeypot", "flash crash", "attack", "51% attack",
            "front running", "sandwich attack", "mev extraction"
        ],
        "TOKEN_EVENTS": [
            "airdrop", "token unlock", "vesting", "ico", "ido", "ito", "ieo",
            "fair launch", "private sale", "seed round", "listing",
            "delisting"
        ]
    },
    "METRICS": {
        "FINANCIAL": [
            "apy", "apr", "roi", "tvl", "market cap", "mcap", "volume",
            "liquidity", "supply", "circulating supply", "total supply",
            "max supply", "inflation", "deflation", "volatility", "dominance"
        ],
        "TECHNICAL": [
            "gas fee", "gas price", "gas limit", "slippage",
            "impermanent loss", "yield", "hashrate", "difficulty", "tps",
            "latency", "finality", "block time", "block size", "block reward"
        ]
    },
    "COMMUNITY": {
        "ROLES": [
            "whale", "degen", "anon", "influencer", "kol", "thought leader",
            "ambassador", "advocate", "og", "contributor", "dev", "builder",
            "founder", "investor", "vc", "angel", "team", "core team",
            "front runner", "mev bot", "searcher", "validator", "miner",
            "node operator", "liquidity provider", "market maker",
            "arbitrageur"
        ],
        "SLANG": [
            "diamond hands", "paper hands", "wagmi", "ngmi", "gm", "gn",
            "ser", "based", "crypto twitter", "ct", "alpha", "dyor", "fomo",
            "fud", "hodl", "rekt"
        ]
    }
}

# ==============================================
# Helper Functions
# ==============================================


def clear_gpu_memory():
    """Clear GPU memory to prevent OOM errors"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


def load_checkpoint():
    """Load processing checkpoint if it exists"""
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    return {'last_processed_index': 0}


def save_checkpoint(index):
    """Save the current processing index to a checkpoint file"""
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'last_processed_index': index}, f)


def identify_crypto_entities(text: str) -> list:
    """
    Identify crypto-specific entities in text using the hierarchical taxonomy.

    Args:
        text (str): Text to analyze

    Returns:
        list: List of tuples (entity, main_category, sub_category)
    """
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    found_entities = []

    # Search for each entity in the taxonomy
    for main_cat, subcats in CRYPTO_TAXONOMY.items():
        for subcat, terms in subcats.items():
            for term in terms:
                # Avoid matching partial words (ensure word boundaries)
                pattern = r'\b' + re.escape(term) + r'\b'
                if re.search(pattern, text_lower):
                    found_entities.append((term, main_cat, subcat))

    return found_entities


def clean_text(text: str) -> str:
    """Clean text while preserving mentions and hashtags"""
    if not isinstance(text, str):
        return ""
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove non-alphanumeric characters (except mentions, hashtags, and spaces)
    text = re.sub(r'[^\w\s@#]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()


def process_nlp_text(text: str) -> str:
    """Process text with advanced NLP (lemmatization, stopword removal)"""
    if not isinstance(text, str):
        return ""
    # Basic cleaning
    text = clean_text(text)

    if SPACY_AVAILABLE:
        # Process with spaCy for advanced NLP
        doc = spacy_nlp(text)
        # Lemmatize and remove stopwords
        processed_tokens = [token.lemma_ for token in doc
                            if not token.is_stop and not token.is_punct]
        return " ".join(processed_tokens)
    else:
        # Fall back to basic cleaning if spaCy is not available
        return text


def extract_mentions(text: str) -> list:
    """Extract @mentions from text"""
    if not isinstance(text, str):
        return []
    return re.findall(r'@(\w+)', text)


def extract_hashtags(text: str) -> list:
    """Extract #hashtags from text"""
    if not isinstance(text, str):
        return []
    return re.findall(r'#(\w+)', text)


def extract_urls(text: str) -> list:
    """Extract URLs from text"""
    if not isinstance(text, str):
        return []
    urls = re.findall(r'(https?://\S+)', text)
    return urls


def analyze_sentiment(text: str) -> dict:
    """
    Analyze the sentiment of a text using the sentiment analysis pipeline.

    Args:
        text (str): The text to analyze

    Returns:
        dict: A dictionary containing sentiment label and score
    """
    if not SENTIMENT_AVAILABLE or not text.strip():
        return {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}

    try:
        # Pre-process the text to improve sentiment analysis accuracy:
        # limit text length to avoid errors with very long tweets
        truncated_text = text[:512] if len(text) > 512 else text

        # Get sentiment prediction
        sentiment_result = sentiment_pipeline(truncated_text)[0]
        label = sentiment_result['label']
        score = sentiment_result['score']

        # Map to standardized format (positive, negative, neutral)
        sentiment_mapping = {
            'LABEL_0': 'negative',
            'LABEL_1': 'neutral',
            'LABEL_2': 'positive',
            'NEGATIVE': 'negative',
            'NEUTRAL': 'neutral',
            'POSITIVE': 'positive'
        }
        standardized_sentiment = sentiment_mapping.get(label, label.lower())

        # Calculate magnitude (confidence) - useful for filtering high-confidence sentiments
        magnitude = abs(score - 0.5) * 2 if standardized_sentiment != 'neutral' else score

        return {
            "sentiment": standardized_sentiment,
            "sentiment_score": score,
            "sentiment_magnitude": magnitude
        }
    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return {"sentiment": "error", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}


def process_with_nlp(text: str) -> dict:
    """
    Process text with NLP to extract named entities, key phrases, etc.

    Args:
        text (str): The text to process

    Returns:
        dict: A dictionary containing NLP processing results
    """
    results = {
        "named_entities": [],
        "pos_tags": [],
        "lemmatized_tokens": [],
        "key_phrases": [],
        "important_nouns": [],
        "sentiment_analysis": {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
    }

    if not text or text.isspace():
        return results

    # First, analyze sentiment
    results["sentiment_analysis"] = analyze_sentiment(text)

    try:
        # Use spaCy for advanced NLP if available
        if SPACY_AVAILABLE:
            doc = spacy_nlp(text)

            # Extract named entities (excluding crypto entities which are handled separately)
            results["named_entities"] = [(ent.text, ent.label_) for ent in doc.ents]

            # Extract POS tags for content words
            results["pos_tags"] = [(token.text, token.pos_) for token in doc
                                   if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"] and not token.is_stop]

            # Get lemmatized tokens (normalized words)
            results["lemmatized_tokens"] = [token.lemma_ for token in doc
                                            if not token.is_stop and not token.is_punct and token.text.strip()]

            # Extract important nouns (potential topics)
            results["important_nouns"] = [token.text for token in doc
                                          if token.pos_ == "NOUN" and not token.is_stop]

            # Try to extract key phrases using noun chunks
            results["key_phrases"] = [chunk.text for chunk in doc.noun_chunks
                                      if len(chunk.text.split()) > 1]

        # If key phrases are empty, use RoBERTa to attempt extraction
        if not results["key_phrases"] and len(text.split()) > 3:
            try:
                # Create masked sentences from the text
                words = text.split()
                if len(words) > 5:
                    # Get 3 random positions to mask
                    import random
                    positions = sorted(random.sample(range(len(words)), min(3, len(words))))

                    key_terms = []
                    for pos in positions:
                        words_copy = words.copy()
                        words_copy[pos] = tokenizer.mask_token
                        masked_text = " ".join(words_copy)

                        # Get predictions for the masked token
                        predictions = nlp_pipeline(masked_text, top_k=2)
                        for prediction in predictions:
                            key_terms.append(prediction["token_str"].strip())

                    results["key_phrases"].extend(key_terms)
            except Exception as e:
                print(f"Error in key phrase extraction: {e}")

        # Ensure all results are strings for CSV output
        results["named_entities"] = ";".join([f"{ent[0]}:{ent[1]}" for ent in results["named_entities"]])
        results["pos_tags"] = ";".join([f"{tag[0]}:{tag[1]}" for tag in results["pos_tags"]])
        results["lemmatized_tokens"] = ";".join(results["lemmatized_tokens"])
        results["key_phrases"] = ";".join(list(set(results["key_phrases"])))  # Remove duplicates
        results["important_nouns"] = ";".join(list(set(results["important_nouns"])))  # Remove duplicates
    except Exception as e:
        print(f"Error in NLP processing: {e}")

    # Clear GPU memory after processing large inputs
    if (results["named_entities"].count(";") > 100) or (len(text) > 1000):
        clear_gpu_memory()

    return results


def process_tweet(text: str) -> tuple:
    """
    Process a tweet to extract mentions, hashtags, URLs, crypto entities, and
    perform NLP analysis. Also performs sentiment analysis.

    Args:
        text (str): The tweet text to process

    Returns:
        tuple: A tuple containing mentions, hashtags, URLs, NLP results, and sentiment analysis
    """
    if not text or not isinstance(text, str):
        return [], [], [], "", "", {}, {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}

    # Clean the text while preserving mentions and hashtags
    cleaned_text = clean_text(text)

    # Process text with NLP
    processed_text = process_nlp_text(text)

    # Extract mentions, hashtags, and URLs
    mentions = extract_mentions(text)
    hashtags = extract_hashtags(text)
    urls = extract_urls(text)

    # Identify crypto entities
    crypto_entities = identify_crypto_entities(text)

    # Process with NLP models
    nlp_results = process_with_nlp(text)

    # Ensure we have the sentiment analysis results
    sentiment_results = nlp_results.pop("sentiment_analysis",
                                        {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0})

    # Add crypto entities to the named entities
    formatted_crypto_entities = [f"{entity}:{main_cat}.{sub_cat}"
                                 for entity, main_cat, sub_cat in crypto_entities]

    # If named_entities is a string (joined with semicolons), handle it differently
    if isinstance(nlp_results.get("named_entities", ""), str):
        nlp_results["named_entities"] = nlp_results.get("named_entities", "")
        if nlp_results["named_entities"] and formatted_crypto_entities:
            nlp_results["named_entities"] += ";" + ";".join(formatted_crypto_entities)
        elif formatted_crypto_entities:
            nlp_results["named_entities"] = ";".join(formatted_crypto_entities)

    return mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results


def process_batch(df_batch):
    """Process a batch of tweets"""
    processed_data = []

    for idx, row in df_batch.iterrows():
        text = row.get('text', '')

        # Process the tweet
        mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results = process_tweet(text)

        # Create a dictionary with the results
        result = {
            'id': row.get('id', ''),
            'original_text': text,  # Store the original text
            'cleaned_text': cleaned_text,
            'nlp_processed_text': processed_text,
            'extracted_mentions': ';'.join(mentions),
            'extracted_hashtags': ';'.join(hashtags),
            'extracted_urls': ';'.join(urls),
            'named_entities': nlp_results.get('named_entities', ''),
            'pos_tags': nlp_results.get('pos_tags', ''),
            'lemmatized_tokens': nlp_results.get('lemmatized_tokens', ''),
            'key_phrases': nlp_results.get('key_phrases', ''),
            'important_nouns': nlp_results.get('important_nouns', ''),
            'sentiment': sentiment_results.get('sentiment', 'unknown'),
            'sentiment_score': sentiment_results.get('sentiment_score', 0.0),
            'sentiment_magnitude': sentiment_results.get('sentiment_magnitude', 0.0)
        }

        processed_data.append(result)

    return pd.DataFrame(processed_data)
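
# Illustrative only (not executed): processing a tiny in-memory batch. The
# column names mirror what the script expects ('id' and 'text'); everything
# else about the example row is made up.
#
#   df_example = pd.DataFrame([
#       {"id": 1, "text": "gm ser, bullish on #bitcoin and eth @whale_alert"},
#   ])
#   print(process_batch(df_example)[["extracted_hashtags", "named_entities", "sentiment"]])
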

# ==============================================
# Main Processing Function
# ==============================================


def main(reset_checkpoint=False, input_file=None):
    """
    Main function to process tweets

    Args:
        reset_checkpoint (bool): Whether to reset the checkpoint and process all data
        input_file (str): Optional specific input file to process, otherwise processes all CSV files
    """
    if reset_checkpoint and os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)
        print("Checkpoint reset. Will process all data from the beginning.")

    # Get list of CSV files to process
    if input_file:
        # Process a specific file
        input_files = [input_file]
    else:
        # Find all CSV files in the OUTPUT_FOLDER
        import glob
        input_files = glob.glob(f"{OUTPUT_FOLDER}/*.csv")
        # Exclude our output files
        input_files = [f for f in input_files
                       if not any(x in f for x in ["_processed.csv", "_analysis.csv"])]

    if not input_files:
        print(f"No input CSV files found in {OUTPUT_FOLDER}")
        return

    print(f"Found {len(input_files)} files to process: {[os.path.basename(f) for f in input_files]}")

    # Process each file
    for input_csv in input_files:
        print(f"\nProcessing file: {os.path.basename(input_csv)}")
        print("Loading dataset...")

        # Check if input file exists
        if not os.path.exists(input_csv):
            print(f"Input file {input_csv} not found. Skipping.")
            continue

        # Load the dataset
        try:
            df = pd.read_csv(input_csv)
            print(f"Loaded dataset with {len(df)} records and {len(df.columns)} columns.")
        except Exception as e:
            print(f"Error loading {input_csv}: {e}")
            continue

        # Load checkpoint if it exists
        checkpoint = load_checkpoint()
        start_idx = checkpoint['last_processed_index']

        # For simplicity, reset checkpoints between files
        start_idx = 0
        save_checkpoint(0)

        print("\nProcessing tweets...")
        print(f"Starting from index {start_idx}")

        # Filter to only unprocessed rows
        df_to_process = df.iloc[start_idx:]

        if len(df_to_process) == 0:
            print("No new data to process in this file.")
            continue

        # Process in batches for memory efficiency
        batch_size = BATCH_SIZE
        num_batches = math.ceil(len(df_to_process) / batch_size)
        print(f"Processing in {num_batches} batches of {batch_size} records each")

        processed_batches = []

        # Iterate over batches with a progress bar
        for i in tqdm(range(num_batches)):
            batch_start = i * batch_size
            batch_end = min((i + 1) * batch_size, len(df_to_process))

            # Get current batch
            df_batch = df_to_process.iloc[batch_start:batch_end]

            # Process the batch
            processed_batch = process_batch(df_batch)
            processed_batches.append(processed_batch)

            # Save checkpoint
            save_checkpoint(start_idx + batch_end)

            # Save intermediate results every 5 batches to prevent data loss in case of session timeout
            if i % 5 == 0 and i > 0:
                file_basename = os.path.splitext(os.path.basename(input_csv))[0]
                interim_df = pd.concat(processed_batches, ignore_index=True)
                interim_file = f"{OUTPUT_FOLDER}/{file_basename}_interim_{i}.csv"
                interim_df.to_csv(interim_file, index=False)
                print(f"\nSaved interim results to {interim_file}")

            # Clear memory
            clear_gpu_memory()

        # Combine all batches
        if processed_batches:
            file_basename = os.path.splitext(os.path.basename(input_csv))[0]
            final_df = pd.concat(processed_batches, ignore_index=True)

            # Calculate statistics columns
            # Each count is (#';' separators) + 1 when the field is non-empty,
            # i.e. the number of extracted items per tweet
            final_df["mention_count"] = final_df["extracted_mentions"].str.count(";") + (final_df["extracted_mentions"] != "").astype(int)
            final_df["hashtag_count"] = final_df["extracted_hashtags"].str.count(";") + (final_df["extracted_hashtags"] != "").astype(int)
            final_df["entity_count"] = final_df["named_entities"].str.count(";") + (final_df["named_entities"] != "").astype(int)

            # Save the full processed dataset
            output_file = f"{OUTPUT_FOLDER}/{file_basename}_processed.csv"
            final_df.to_csv(output_file, index=False)
            print(f"Processed data saved to {output_file}")

            # Create a lighter version with just the analysis
            analysis_columns = [
                "id", "original_text", "cleaned_text", "nlp_processed_text",
                "extracted_mentions", "extracted_hashtags", "extracted_urls",
                "named_entities", "key_phrases", "important_nouns",
                "sentiment", "sentiment_score", "sentiment_magnitude",
                "mention_count", "hashtag_count", "entity_count"
            ]
            # Ensure all columns exist before subsetting
            available_columns = [col for col in analysis_columns if col in final_df.columns]
            analysis_df = final_df[available_columns]

            analysis_file = f"{OUTPUT_FOLDER}/{file_basename}_analysis.csv"
            analysis_df.to_csv(analysis_file, index=False)
            print(f"Analysis results saved to {analysis_file}")

            # Print statistics
            print("\nAnalysis completed successfully!")
            print(f"Total records: {len(final_df)}")
            print(f"Tweets with Mentions: {(final_df['extracted_mentions'] != '').sum()}")
            print(f"Tweets with Hashtags: {(final_df['extracted_hashtags'] != '').sum()}")
            print(f"Tweets with Named Entities: {(final_df['named_entities'] != '').sum()}")

            # Print sentiment statistics
            sentiment_counts = final_df['sentiment'].value_counts()
            print("\nSentiment Distribution:")
            for sentiment, count in sentiment_counts.items():
                percentage = (count / len(final_df)) * 100
                print(f"  {sentiment}: {count} tweets ({percentage:.1f}%)")

            # Get average sentiment scores
            avg_score = final_df['sentiment_score'].mean()
            avg_magnitude = final_df['sentiment_magnitude'].mean()
            print(f"\nAverage sentiment score: {avg_score:.3f}")
            print(f"Average sentiment magnitude: {avg_magnitude:.3f}")

            # Get top entities by sentiment
            positive_entities = []
            for idx, row in final_df[final_df['sentiment'] == 'positive'].iterrows():
                entities = row['named_entities'].split(';') if isinstance(row['named_entities'], str) and row['named_entities'] else []
                for entity in entities:
                    if entity and ':' in entity:
                        entity_name = entity.split(':')[0]
                        positive_entities.append(entity_name)

            # Get the most common positive entities
            from collections import Counter
            top_positive = Counter(positive_entities).most_common(5)
            if top_positive:
                print("\nTop entities with positive sentiment:")
                for entity, count in top_positive:
                    print(f"  {entity}: {count} mentions")

            # Print sample results
            print("\nSample of processing results:")
            for i, row in analysis_df.head(3).iterrows():
                print(f"\nOriginal Text: {row['original_text']}")
                print(f"Cleaned Text: {row['cleaned_text']}")
                print(f"NLP Processed Text: {row['nlp_processed_text']}")
                print(f"Mentions: {row['extracted_mentions']}")
                print(f"Hashtags: {row['extracted_hashtags']}")
                print(f"Named Entities: {row['named_entities']}")
                print(f"Key Phrases: {row['key_phrases']}")
                print(f"Sentiment: {row['sentiment']} (Score: {row['sentiment_score']:.3f}, Magnitude: {row['sentiment_magnitude']:.3f})")
                print("-" * 80)

            # Delete interim files
            import glob
            interim_files = glob.glob(f"{OUTPUT_FOLDER}/{file_basename}_interim_*.csv")
            for interim_path in interim_files:
                try:
                    os.remove(interim_path)
                    print(f"Deleted interim file: {os.path.basename(interim_path)}")
                except Exception:
                    pass

            # Clear memory after processing each file
            clear_gpu_memory()
        else:
            print("No data processed for this file.")

    # Clean up checkpoint file after successful processing
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)

    print("\nAll files processed successfully!")


# ==============================================
# Colab Usage Example
# ==============================================
"""
# EXAMPLE USAGE IN COLAB:

# 1. Install packages and mount drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Process one specific file
input_file = "/content/drive/MyDrive/projects_twitter_post/zilliqa.csv"
main(reset_checkpoint=True, input_file=input_file)

# 3. Process all files
main(reset_checkpoint=True)
"""

if __name__ == "__main__":
    import sys

    # Check if the --reset flag is provided
    reset_checkpoint = "--reset" in sys.argv

    # Check if the --file flag is provided
    input_file = None
    if "--file" in sys.argv:
        try:
            input_file = sys.argv[sys.argv.index("--file") + 1]
        except IndexError:
            print("Error: --file flag requires a filename argument")
            sys.exit(1)

    # Run the main function
    main(reset_checkpoint=reset_checkpoint, input_file=input_file)
''')

demo.launch()
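
# The generated script can also be run directly, outside the Gradio UI, e.g.:
#
#   python process_tweet_huggingface.py --reset
#   python process_tweet_huggingface.py --file projects_twitter_post/zilliqa.csv
#
# The file name above is only illustrative; any CSV with 'id' and 'text'
# columns placed in projects_twitter_post/ will be picked up.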