calibr234 committed on
Commit c71268c · verified · 1 Parent(s): 9a7f2e8

Update app.py

Files changed (1)
  1. app.py +884 -5
app.py CHANGED
@@ -1,7 +1,886 @@
- import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import gradio as gr
+ import pandas as pd
+ import os
+ import shutil
+ import subprocess
+ import sys
+
+ # Install spaCy model
+ os.system("python -m spacy download en_core_web_sm")
+
+ def process_tweets(files, reset_processing=False):
+     # Save uploaded files
+     file_paths = []
+     for file in files:
+         if file.name.endswith('.csv'):
+             # Ensure directory exists
+             os.makedirs("projects_twitter_post", exist_ok=True)
+
+             # Save file to the directory
+             dest_path = f"projects_twitter_post/{os.path.basename(file.name)}"
+             shutil.copy(file.name, dest_path)
+             file_paths.append(dest_path)
+
+     if not file_paths:
+         # Return an empty file list as well, since the click handler expects two outputs
+         return "No CSV files uploaded. Please upload CSV files containing tweet data.", []
+
+     # Run the processing script
+     reset_flag = "--reset" if reset_processing else ""
+     result = subprocess.run(
+         f"python process_tweet_huggingface.py {reset_flag}",
+         shell=True,
+         capture_output=True,
+         text=True
+     )
+
+     # Check if output files were created
+     output_files = []
+     for file_path in file_paths:
+         base_name = os.path.basename(file_path).replace('.csv', '')
+         processed_path = f"projects_twitter_post/{base_name}_processed.csv"
+         analysis_path = f"projects_twitter_post/{base_name}_analysis.csv"
+
+         if os.path.exists(processed_path):
+             output_files.append(processed_path)
+         if os.path.exists(analysis_path):
+             output_files.append(analysis_path)
+
+     return_files = [f for f in output_files if os.path.exists(f)]
+
+     log_output = result.stdout + "\n" + result.stderr
+
+     return log_output, return_files
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Crypto Tweet Processor")
+     gr.Markdown("Upload CSV files containing tweet data to process")
+
+     with gr.Row():
+         files_input = gr.File(file_count="multiple", label="Upload CSV Files")
+         reset_checkbox = gr.Checkbox(label="Reset Processing", value=False)
+
+     process_btn = gr.Button("Process Tweets")
+
+     output_text = gr.Textbox(label="Processing Log")
+     output_files = gr.File(label="Processed Files", file_count="multiple")
+
+     process_btn.click(
+         process_tweets,
+         inputs=[files_input, reset_checkbox],
+         outputs=[output_text, output_files]
+     )
+
+ # Add the modified processing script code here (written as a raw string so regex
+ # escapes such as \b and literal "\n" sequences inside the script are preserved verbatim)
+ with open("process_tweet_huggingface.py", "w") as f:
+     f.write(r'''#!/usr/bin/env python3
+ """
+ Tweet Processing Script for Google Colab - Enhanced with NLP and Sentiment Analysis
+ This version is optimized for Google Colab with GPU acceleration and Google Drive integration.
+ """
+
+ import os
+ import re
+ import json
+ import pandas as pd
+ import numpy as np
+ import torch
+ import math
+ import gc
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
+ import spacy
+
+ # ==============================================
+ # COLAB SETUP - Run these cells first in Colab
+ # ==============================================
+
+ # Uncomment and run this cell to mount your Google Drive
+ """
+ from google.colab import drive
+ drive.mount('/content/drive')
+ """
+
+ # Uncomment and run this cell to install required packages
+ """
+ !pip install pandas tqdm transformers spacy
+ !python -m spacy download en_core_web_sm
+ """
+
+ # Uncomment and run this cell to verify GPU availability
+ """
+ import torch
+ print(f"GPU available: {torch.cuda.is_available()}")
+ print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
+ """
+
+ # ==============================================
+ # Constants - Update these paths for your setup
+ # ==============================================
+
+ # Update this to your Google Drive path
+ DRIVE_PATH = "./projects_twitter_post"
+ OUTPUT_FOLDER = f"{DRIVE_PATH}"
+ CHECKPOINT_FILE = f"{OUTPUT_FOLDER}/processing_checkpoint.json"
+ BATCH_SIZE = 500  # Reduced batch size for GPU memory management
+
+ # Create output folder if it doesn't exist
+ if not os.path.exists(OUTPUT_FOLDER):
+     os.makedirs(OUTPUT_FOLDER)
+
+ # ==============================================
+ # Model Initialization with GPU Acceleration
+ # ==============================================
+
+ print("Loading RoBERTa model...")
+ model_name = "roberta-base"
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # Initialize with GPU acceleration
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
+ nlp_pipeline = pipeline("fill-mask", model=model_name, device=0 if torch.cuda.is_available() else -1)
+
+ # Initialize sentiment analysis pipeline
+ print("Loading sentiment analysis model...")
+ try:
+     # Using a Twitter-specific sentiment model for better results on social media text
+     sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment"
+     sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, device=0 if torch.cuda.is_available() else -1)
+     SENTIMENT_AVAILABLE = True
+ except Exception as e:
+     print(f"Error loading sentiment model: {e}")
+     # Fallback to a simpler sentiment model if the Twitter-specific one fails
+     try:
+         sentiment_pipeline = pipeline("sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
+         SENTIMENT_AVAILABLE = True
+     except:
+         print("Sentiment analysis not available. Continuing without sentiment analysis.")
+         SENTIMENT_AVAILABLE = False
+
+ # Try to load spaCy for basic text preprocessing
+ try:
+     import spacy
+     spacy_nlp = spacy.load("en_core_web_sm")
+     SPACY_AVAILABLE = True
+ except:
+     SPACY_AVAILABLE = False
+     print("SpaCy not available. Using basic text processing instead.")
+
+ # Crypto-specific keywords with hierarchical categories
+ CRYPTO_TAXONOMY = {
+     "COIN": {
+         "MAJOR": [
+             "bitcoin", "ethereum", "btc", "eth", "bnb", "xrp", "sol", "doge",
+             "cardano", "polkadot", "dot", "avalanche", "avax", "solana", "polygon", "matic"
+         ],
+         "STABLECOIN": [
+             "tether", "usdt", "usdc", "busd", "dai", "frax", "tusd", "usdd", "lusd", "gusd", "husd"
+         ],
+         "ALTCOIN": [
+             "litecoin", "ltc", "chainlink", "link", "stellar", "xlm", "dogecoin", "shib",
+             "tron", "trx", "cosmos", "atom", "near", "algo", "fantom", "ftm", "monero", "xmr"
+         ],
+         "DEFI": [
+             "uniswap", "uni", "aave", "sushi", "cake", "comp", "maker", "mkr", "curve", "crv",
+             "yearn", "yfi", "compound", "balancer", "bal", "synthetix", "snx"
+         ],
+         "UTILITY": [
+             "filecoin", "fil", "the graph", "grt", "arweave", "ar", "chainlink", "link",
+             "helium", "hnt", "theta", "icp"
+         ],
+         "NFT": [
+             "enjin", "enj", "decentraland", "mana", "sandbox", "sand", "axie", "axs",
+             "gala", "apecoin", "ape", "flow", "ens", "stepn", "gmt"
+         ]
+     },
+
+     "TECH": {
+         "CONCEPTS": [
+             "blockchain", "defi", "nft", "dao", "smart contract", "web3", "dapp", "protocol",
+             "consensus", "tokenomics", "tokenization"
+         ],
+         "CHAIN_TYPES": [
+             "layer1", "layer2", "rollup", "sidechain", "mainnet", "testnet", "devnet",
+             "pow", "pos", "poh", "pbft", "dpos"
+         ],
+         "PRIVACY": [
+             "zk", "zk-rollups", "zero-knowledge", "zkp", "zksnark", "zkstark", "mpc",
+             "privacy", "private", "anonymous", "confidential", "encrypted"
+         ],
+         "SECTORS": [
+             "defi", "cefi", "gamefi", "metaverse", "socialfi", "fintech", "realfi",
+             "play-to-earn", "move-to-earn", "learn-to-earn", "x-to-earn", "defai", "depin", "desci",
+             "refi", "did", "dedata", "dedao", "deid", "deai", "degov", "decloud", "dehealth",
+             "decex", "deinsurance", "deworkplace", "public goods", "zk", "ordinals", "soulbound",
+             "onchain gaming", "ai agents", "infrastructure", "credentials", "restaking", "modular blockchain",
+             "liquid staking", "real world assets", "rwa", "synthetic assets", "account abstraction"
+         ]
+     },
+
+     "ACTION": {
+         "TRADING": [
+             "buy", "sell", "long", "short", "margin", "leverage", "trade", "swap",
+             "arbitrage", "dca", "ape", "pump", "dump", "moon", "ath", "atl", "breakout",
+             "correction", "consolidation", "accumulate", "distribute", "front run", "front runner",
+             "front running", "mev", "sandwich attack"
+         ],
+         "DEFI": [
+             "stake", "yield", "farm", "lend", "borrow", "supply", "withdraw", "claim",
+             "harvest", "flash loan", "liquidate", "collateralize", "wrap", "unwrap", "bridge",
+             "provide liquidity", "withdraw liquidity", "impermanent loss"
+         ],
+         "GOVERNANCE": [
+             "delegate", "vote", "propose", "governance", "dao", "snapshot", "quorum",
+             "execution", "timelock", "veto"
+         ],
+         "NFT": [
+             "mint", "airdrop", "whitelist", "burn", "floor price", "rarity", "trait", "pfp",
+             "collection", "secondary", "flip"
+         ],
+         "DEVELOPMENT": [
+             "deploy", "audit", "fork", "bootstrap", "initiate", "merge", "split",
+             "rebase", "optimize", "gas optimization", "implement", "compile"
+         ]
+     },
+
+     "PLATFORM": {
+         "EXCHANGE": [
+             "coinbase", "binance", "kraken", "kucoin", "ftx", "okx", "bybit", "bitfinex",
+             "huobi", "gate", "gemini", "bitstamp", "bittrex", "crypto.com", "cex", "dex"
+         ],
+         "WALLET": [
+             "metamask", "phantom", "trust wallet", "ledger", "trezor", "argent", "rainbow",
+             "wallet", "hot wallet", "cold storage", "hardware wallet", "seed phrase"
+         ],
+         "NFT_MARKET": [
+             "opensea", "rarible", "foundation", "superrare", "looksrare", "blur", "magic eden",
+             "nifty gateway", "zora", "x2y2", "element"
+         ],
+         "INFRA": [
+             "alchemy", "infura", "moralis", "quicknode", "ceramic", "arweave", "ipfs",
+             "node", "rpc", "api", "indexer", "subgraph"
+         ]
+     },
+
+     "NETWORK": {
+         "LAYER1": [
+             "ethereum", "bitcoin", "solana", "avalanche", "polygon", "bnb chain", "bsc",
+             "cardano", "polkadot", "cosmos", "algorand", "tezos", "flow", "near", "tron"
+         ],
+         "LAYER2": [
+             "arbitrum", "optimism", "zksync", "starknet", "base", "polygon", "loopring",
+             "immutablex", "metis", "boba", "aztec", "validium", "zkevm"
+         ],
+         "INTEROPERABILITY": [
+             "cosmos", "polkadot", "kusama", "moonbeam", "moonriver", "parachains", "relay chain",
+             "ibc", "cross-chain", "bridge"
+         ]
+     },
+
+     "EVENTS": {
+         "MARKET": [
+             "bull market", "bear market", "bull run", "bear trap", "bull trap", "halving",
+             "capitulation", "golden cross", "death cross", "breakout", "resistance", "support"
+         ],
+         "SECURITY": [
+             "hack", "exploit", "vulnerability", "scam", "phishing", "rug pull", "honeypot",
+             "flash crash", "attack", "51% attack", "front running", "sandwich attack", "mev extraction"
+         ],
+         "TOKEN_EVENTS": [
+             "airdrop", "token unlock", "vesting", "ico", "ido", "ito", "ieo", "fair launch",
+             "private sale", "seed round", "listing", "delisting"
+         ]
+     },
+
+     "METRICS": {
+         "FINANCIAL": [
+             "apy", "apr", "roi", "tvl", "market cap", "mcap", "volume", "liquidity", "supply",
+             "circulating supply", "total supply", "max supply", "inflation", "deflation",
+             "volatility", "dominance"
+         ],
+         "TECHNICAL": [
+             "gas fee", "gas price", "gas limit", "slippage", "impermanent loss", "yield",
+             "hashrate", "difficulty", "tps", "latency", "finality", "block time", "block size",
+             "block reward"
+         ]
+     },
+
+     "COMMUNITY": {
+         "ROLES": [
+             "whale", "degen", "anon", "influencer", "kol", "thought leader", "ambassador",
+             "advocate", "og", "contributor", "dev", "builder", "founder", "investor", "vc",
+             "angel", "team", "core team", "front runner", "mev bot", "searcher", "validator",
+             "miner", "node operator", "liquidity provider", "market maker", "arbitrageur"
+         ],
+         "SLANG": [
+             "diamond hands", "paper hands", "wagmi", "ngmi", "gm", "gn", "ser", "based",
+             "crypto twitter", "ct", "alpha", "dyor", "fomo", "fud", "hodl", "rekt"
+         ]
+     }
+ }
+
+ # ==============================================
+ # Helper Functions
+ # ==============================================
+
+ def clear_gpu_memory():
+     """Clear GPU memory to prevent OOM errors"""
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+     gc.collect()
+
+ def load_checkpoint():
+     """Load processing checkpoint if it exists"""
+     if os.path.exists(CHECKPOINT_FILE):
+         with open(CHECKPOINT_FILE, 'r') as f:
+             return json.load(f)
+     return {'last_processed_index': 0}
+
+ def save_checkpoint(index):
+     """Save the current processing index to a checkpoint file"""
+     with open(CHECKPOINT_FILE, 'w') as f:
+         json.dump({'last_processed_index': index}, f)
+
+ def identify_crypto_entities(text: str) -> list:
+     """
+     Identify crypto-specific entities in text using the hierarchical taxonomy.
+
+     Args:
+         text (str): Text to analyze
+
+     Returns:
+         list: List of tuples (entity, main_category, sub_category)
+     """
+     if not isinstance(text, str):
+         return []
+
+     text_lower = text.lower()
+     found_entities = []
+
+     # Search for each entity in the taxonomy
+     for main_cat, subcats in CRYPTO_TAXONOMY.items():
+         for subcat, terms in subcats.items():
+             for term in terms:
+                 # Avoid matching partial words (ensure word boundaries)
+                 pattern = r'\b' + re.escape(term) + r'\b'
+                 if re.search(pattern, text_lower):
+                     found_entities.append((term, main_cat, subcat))
+
+     return found_entities
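+
+ # Illustrative example (hypothetical tweet text; note that the same term can match
+ # more than one taxonomy branch):
+ #   identify_crypto_entities("bitcoin just hit a new ath")
+ #   -> [("bitcoin", "COIN", "MAJOR"), ("ath", "ACTION", "TRADING"), ("bitcoin", "NETWORK", "LAYER1")]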
+
+ def clean_text(text: str) -> str:
+     """Clean text while preserving mentions and hashtags"""
+     if not isinstance(text, str):
+         return ""
+
+     # Remove URLs
+     text = re.sub(r'http\S+', '', text)
+
+     # Remove non-alphanumeric characters (except mentions, hashtags, and spaces)
+     text = re.sub(r'[^\w\s@#]', ' ', text)
+
+     # Remove extra whitespace
+     text = re.sub(r'\s+', ' ', text).strip()
+
+     return text.lower()
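+
+ # Illustrative example (hypothetical input):
+ #   clean_text("Check https://t.co/xyz $BTC to the moon!!! @alice #bitcoin")
+ #   -> "check btc to the moon @alice #bitcoin"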
+
+ def process_nlp_text(text: str) -> str:
+     """Process text with advanced NLP (lemmatization, stopword removal)"""
+     if not isinstance(text, str):
+         return ""
+
+     # Basic cleaning
+     text = clean_text(text)
+
+     if SPACY_AVAILABLE:
+         # Process with spaCy for advanced NLP
+         doc = spacy_nlp(text)
+
+         # Lemmatize and remove stopwords
+         processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
+
+         return " ".join(processed_tokens)
+     else:
+         # Fallback to basic cleaning if spaCy is not available
+         return text
+
+ def extract_mentions(text: str) -> list:
+     """Extract @mentions from text"""
+     if not isinstance(text, str):
+         return []
+     return re.findall(r'@(\w+)', text)
+
+ def extract_hashtags(text: str) -> list:
+     """Extract #hashtags from text"""
+     if not isinstance(text, str):
+         return []
+     return re.findall(r'#(\w+)', text)
+
+ def extract_urls(text: str) -> list:
+     """Extract URLs from text"""
+     if not isinstance(text, str):
+         return []
+     urls = re.findall(r'(https?://\S+)', text)
+     return urls
+
+ def analyze_sentiment(text: str) -> dict:
+     """
+     Analyze the sentiment of a text using the sentiment analysis pipeline.
+
+     Args:
+         text (str): The text to analyze
+
+     Returns:
+         dict: A dictionary containing sentiment label and score
+     """
+     if not SENTIMENT_AVAILABLE or not text.strip():
+         return {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
+
+     try:
+         # Pre-process the text to improve sentiment analysis accuracy
+         # Limit text length to avoid errors with very long tweets
+         truncated_text = text[:512] if len(text) > 512 else text
+
+         # Get sentiment prediction
+         sentiment_result = sentiment_pipeline(truncated_text)[0]
+         label = sentiment_result['label']
+         score = sentiment_result['score']
+
+         # Map to standardized format (positive, negative, neutral)
+         sentiment_mapping = {
+             'LABEL_0': 'negative',
+             'LABEL_1': 'neutral',
+             'LABEL_2': 'positive',
+             'NEGATIVE': 'negative',
+             'NEUTRAL': 'neutral',
+             'POSITIVE': 'positive'
+         }
+
+         standardized_sentiment = sentiment_mapping.get(label, label.lower())
+
+         # Calculate magnitude (confidence) - useful for filtering high-confidence sentiments
+         magnitude = abs(score - 0.5) * 2 if standardized_sentiment != 'neutral' else score
+
+         return {
+             "sentiment": standardized_sentiment,
+             "sentiment_score": score,
+             "sentiment_magnitude": magnitude
+         }
+     except Exception as e:
+         print(f"Error in sentiment analysis: {e}")
+         return {"sentiment": "error", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
+
+ def process_with_nlp(text: str) -> dict:
+     """
+     Process text with NLP to extract named entities, key phrases, etc.
+
+     Args:
+         text (str): The text to process
+
+     Returns:
+         dict: A dictionary containing NLP processing results
+     """
+     results = {
+         "named_entities": [],
+         "pos_tags": [],
+         "lemmatized_tokens": [],
+         "key_phrases": [],
+         "important_nouns": [],
+         "sentiment_analysis": {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
+     }
+
+     if not text or text.isspace():
+         return results
+
+     # First, analyze sentiment
+     results["sentiment_analysis"] = analyze_sentiment(text)
+
+     try:
+         # Use spaCy for advanced NLP if available
+         if SPACY_AVAILABLE:
+             doc = spacy_nlp(text)
+
+             # Extract named entities (excluding crypto entities which are handled separately)
+             results["named_entities"] = [(ent.text, ent.label_) for ent in doc.ents]
+
+             # Extract POS tags for content words
+             results["pos_tags"] = [(token.text, token.pos_) for token in doc
+                                    if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"] and not token.is_stop]
+
+             # Get lemmatized tokens (normalized words)
+             results["lemmatized_tokens"] = [token.lemma_ for token in doc
+                                             if not token.is_stop and not token.is_punct and token.text.strip()]
+
+             # Extract important nouns (potential topics)
+             results["important_nouns"] = [token.text for token in doc
+                                           if token.pos_ == "NOUN" and not token.is_stop]
+
+             # Try to extract key phrases using noun chunks
+             results["key_phrases"] = [chunk.text for chunk in doc.noun_chunks
+                                       if len(chunk.text.split()) > 1]
+
+             # If key phrases are empty, use RoBERTa to attempt extraction
+             if not results["key_phrases"] and len(text.split()) > 3:
+                 try:
+                     # Create a masked sentence from the text
+                     words = text.split()
+                     if len(words) > 5:
+                         # Get 3 random positions to mask
+                         import random
+                         positions = sorted(random.sample(range(len(words)), min(3, len(words))))
+
+                         # Create masked sentences
+                         key_terms = []
+                         for pos in positions:
+                             words_copy = words.copy()
+                             words_copy[pos] = tokenizer.mask_token
+                             masked_text = " ".join(words_copy)
+
+                             # Get predictions for the masked token
+                             predictions = nlp_pipeline(masked_text, top_k=2)
+                             for prediction in predictions:
+                                 key_terms.append(prediction["token_str"].strip())
+
+                         results["key_phrases"].extend(key_terms)
+                 except Exception as e:
+                     print(f"Error in key phrase extraction: {e}")
+
+         # Ensure all results are strings for CSV output
+         results["named_entities"] = ";".join([f"{ent[0]}:{ent[1]}" for ent in results["named_entities"]])
+         results["pos_tags"] = ";".join([f"{tag[0]}:{tag[1]}" for tag in results["pos_tags"]])
+         results["lemmatized_tokens"] = ";".join(results["lemmatized_tokens"])
+         results["key_phrases"] = ";".join(list(set(results["key_phrases"])))  # Remove duplicates
+         results["important_nouns"] = ";".join(list(set(results["important_nouns"])))  # Remove duplicates
+
+     except Exception as e:
+         print(f"Error in NLP processing: {e}")
+
+     # Clear GPU memory after processing
+     if (results["named_entities"].count(";") > 100) or (len(text) > 1000):
+         clear_gpu_memory()
+
+     return results
+
+ def process_tweet(text: str) -> tuple:
+     """
+     Process a tweet to extract mentions, hashtags, URLs, crypto entities, and perform NLP analysis.
+     Also performs sentiment analysis.
+
+     Args:
+         text (str): The tweet text to process
+
+     Returns:
+         tuple: A tuple containing mentions, hashtags, URLs, NLP results, and sentiment analysis
+     """
+     if not text or not isinstance(text, str):
+         return [], [], [], "", "", {}, {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
+
+     # Clean the text while preserving mentions and hashtags
+     cleaned_text = clean_text(text)
+
+     # Process text with NLP
+     processed_text = process_nlp_text(text)
+
+     # Extract mentions, hashtags, and URLs
+     mentions = extract_mentions(text)
+     hashtags = extract_hashtags(text)
+     urls = extract_urls(text)
+
+     # Identify crypto entities
+     crypto_entities = identify_crypto_entities(text)
+
+     # Process with NLP models
+     nlp_results = process_with_nlp(text)
+
+     # Ensure we have the sentiment analysis results
+     sentiment_results = nlp_results.pop("sentiment_analysis", {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0})
+
+     # Add crypto entities to the named entities
+     formatted_crypto_entities = [f"{entity}:{main_cat}.{sub_cat}" for entity, main_cat, sub_cat in crypto_entities]
+
+     # If named_entities is a string (joined with semicolons), we need to handle differently
+     if isinstance(nlp_results.get("named_entities", ""), str):
+         nlp_results["named_entities"] = nlp_results.get("named_entities", "")
+         if nlp_results["named_entities"] and formatted_crypto_entities:
+             nlp_results["named_entities"] += ";" + ";".join(formatted_crypto_entities)
+         elif formatted_crypto_entities:
+             nlp_results["named_entities"] = ";".join(formatted_crypto_entities)
+
+     return mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results
+
+ def process_batch(df_batch):
+     """Process a batch of tweets"""
+     processed_data = []
+
+     for idx, row in df_batch.iterrows():
+         text = row.get('text', '')
+
+         # Process the tweet
+         mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results = process_tweet(text)
+
+         # Create a dictionary with the results
+         result = {
+             'id': row.get('id', ''),
+             'original_text': text,  # Store the original text
+             'cleaned_text': cleaned_text,
+             'nlp_processed_text': processed_text,
+             'extracted_mentions': ';'.join(mentions),
+             'extracted_hashtags': ';'.join(hashtags),
+             'extracted_urls': ';'.join(urls),
+             'named_entities': nlp_results.get('named_entities', ''),
+             'pos_tags': nlp_results.get('pos_tags', ''),
+             'lemmatized_tokens': nlp_results.get('lemmatized_tokens', ''),
+             'key_phrases': nlp_results.get('key_phrases', ''),
+             'important_nouns': nlp_results.get('important_nouns', ''),
+             'sentiment': sentiment_results.get('sentiment', 'unknown'),
+             'sentiment_score': sentiment_results.get('sentiment_score', 0.0),
+             'sentiment_magnitude': sentiment_results.get('sentiment_magnitude', 0.0)
+         }
+
+         processed_data.append(result)
+
+     return pd.DataFrame(processed_data)
+
+ # ==============================================
+ # Main Processing Function
+ # ==============================================
+
+ def main(reset_checkpoint=False, input_file=None):
+     """
+     Main function to process tweets
+
+     Args:
+         reset_checkpoint (bool): Whether to reset the checkpoint and process all data
+         input_file (str): Optional specific input file to process, otherwise processes all CSV files
+     """
+     if reset_checkpoint and os.path.exists(CHECKPOINT_FILE):
+         os.remove(CHECKPOINT_FILE)
+         print("Checkpoint reset. Will process all data from the beginning.")
+
+     # Get list of CSV files to process
+     if input_file:
+         # Process a specific file
+         input_files = [input_file]
+     else:
+         # Find all CSV files in the OUTPUT_FOLDER
+         import glob
+         input_files = glob.glob(f"{OUTPUT_FOLDER}/*.csv")
+
+         # Exclude our output files
+         input_files = [f for f in input_files if not any(x in f for x in ["_processed.csv", "_analysis.csv"])]
+
+     if not input_files:
+         print(f"No input CSV files found in {OUTPUT_FOLDER}")
+         return
+
+     print(f"Found {len(input_files)} files to process: {[os.path.basename(f) for f in input_files]}")
+
+     # Process each file
+     for input_csv in input_files:
+         print(f"\nProcessing file: {os.path.basename(input_csv)}")
+
+         print("Loading dataset...")
+         # Check if input file exists
+         if not os.path.exists(input_csv):
+             print(f"Input file {input_csv} not found. Skipping.")
+             continue
+
+         # Load the dataset
+         try:
+             df = pd.read_csv(input_csv)
+             print(f"Loaded dataset with {len(df)} records and {len(df.columns)} columns.")
+         except Exception as e:
+             print(f"Error loading {input_csv}: {e}")
+             continue
+
+         # Load checkpoint if it exists
+         checkpoint = load_checkpoint()
+         start_idx = checkpoint['last_processed_index']
+
+         # For simplicity, reset checkpoints between files
+         start_idx = 0
+         save_checkpoint(0)
+
+         print("\nProcessing tweets...")
+         print(f"Starting from index {start_idx}")
+
+         # Filter to only unprocessed rows
+         df_to_process = df.iloc[start_idx:]
+
+         if len(df_to_process) == 0:
+             print("No new data to process in this file.")
+             continue
+
+         # Process in batches for memory efficiency
+         batch_size = BATCH_SIZE
+         num_batches = math.ceil(len(df_to_process) / batch_size)
+         print(f"Processing in {num_batches} batches of {batch_size} records each")
+
+         processed_batches = []
+
+         # Create progress bar
+         for i in tqdm(range(num_batches)):
+             batch_start = i * batch_size
+             batch_end = min((i + 1) * batch_size, len(df_to_process))
+
+             # Get current batch
+             df_batch = df_to_process.iloc[batch_start:batch_end]
+
+             # Process the batch
+             processed_batch = process_batch(df_batch)
+             processed_batches.append(processed_batch)
+
+             # Save checkpoint
+             save_checkpoint(start_idx + batch_end)
+
+             # Save intermediate results every 5 batches to prevent data loss in case of session timeout
+             if i % 5 == 0 and i > 0:
+                 file_basename = os.path.splitext(os.path.basename(input_csv))[0]
+                 interim_df = pd.concat(processed_batches, ignore_index=True)
+                 interim_file = f"{OUTPUT_FOLDER}/{file_basename}_interim_{i}.csv"
+                 interim_df.to_csv(interim_file, index=False)
+                 print(f"\nSaved interim results to {interim_file}")
+
+             # Clear memory
+             clear_gpu_memory()
+
+         # Combine all batches
+         if processed_batches:
+             file_basename = os.path.splitext(os.path.basename(input_csv))[0]
+
+             final_df = pd.concat(processed_batches, ignore_index=True)
+
+             # Calculate statistics columns
+             final_df["mention_count"] = final_df["extracted_mentions"].str.count(";") + (final_df["extracted_mentions"] != "").astype(int)
+             final_df["hashtag_count"] = final_df["extracted_hashtags"].str.count(";") + (final_df["extracted_hashtags"] != "").astype(int)
+             final_df["entity_count"] = final_df["named_entities"].str.count(";") + (final_df["named_entities"] != "").astype(int)
+
+             # Save the full processed dataset
+             output_file = f"{OUTPUT_FOLDER}/{file_basename}_processed.csv"
+             final_df.to_csv(output_file, index=False)
+             print(f"Processed data saved to {output_file}")
+
+             # Create a lighter version with just the analysis
+             analysis_columns = [
+                 "id", "original_text", "cleaned_text", "nlp_processed_text",
+                 "extracted_mentions", "extracted_hashtags", "extracted_urls",
+                 "named_entities", "key_phrases", "important_nouns",
+                 "sentiment", "sentiment_score", "sentiment_magnitude",
+                 "mention_count", "hashtag_count", "entity_count"
+             ]
+
+             # Ensure all columns exist before subsetting
+             available_columns = [col for col in analysis_columns if col in final_df.columns]
+             analysis_df = final_df[available_columns]
+             analysis_file = f"{OUTPUT_FOLDER}/{file_basename}_analysis.csv"
+             analysis_df.to_csv(analysis_file, index=False)
+             print(f"Analysis results saved to {analysis_file}")
+
+             # Print statistics
+             print(f"\nAnalysis completed successfully!")
+             print(f"Total records: {len(final_df)}")
+             print(f"Tweets with Mentions: {(final_df['extracted_mentions'] != '').sum()}")
+             print(f"Tweets with Hashtags: {(final_df['extracted_hashtags'] != '').sum()}")
+             print(f"Tweets with Named Entities: {(final_df['named_entities'] != '').sum()}")
+
+             # Print sentiment statistics
+             sentiment_counts = final_df['sentiment'].value_counts()
+             print("\nSentiment Distribution:")
+             for sentiment, count in sentiment_counts.items():
+                 percentage = (count / len(final_df)) * 100
+                 print(f" {sentiment}: {count} tweets ({percentage:.1f}%)")
+
+             # Get average sentiment scores
+             avg_score = final_df['sentiment_score'].mean()
+             avg_magnitude = final_df['sentiment_magnitude'].mean()
+             print(f"\nAverage sentiment score: {avg_score:.3f}")
+             print(f"Average sentiment magnitude: {avg_magnitude:.3f}")
+
+             # Get top entities by sentiment
+             positive_entities = []
+             for idx, row in final_df[final_df['sentiment'] == 'positive'].iterrows():
+                 entities = row['named_entities'].split(';') if isinstance(row['named_entities'], str) and row['named_entities'] else []
+                 for entity in entities:
+                     if entity and ':' in entity:
+                         entity_name = entity.split(':')[0]
+                         positive_entities.append(entity_name)
+
+             # Get the most common positive entities
+             from collections import Counter
+             top_positive = Counter(positive_entities).most_common(5)
+             if top_positive:
+                 print("\nTop entities with positive sentiment:")
+                 for entity, count in top_positive:
+                     print(f" {entity}: {count} mentions")
+
+             # Print sample results
+             print("\nSample of processing results:")
+             for i, row in analysis_df.head(3).iterrows():
+                 print(f"\nOriginal Text: {row['original_text']}")
+                 print(f"Cleaned Text: {row['cleaned_text']}")
+                 print(f"NLP Processed Text: {row['nlp_processed_text']}")
+                 print(f"Mentions: {row['extracted_mentions']}")
+                 print(f"Hashtags: {row['extracted_hashtags']}")
+                 print(f"Named Entities: {row['named_entities']}")
+                 print(f"Key Phrases: {row['key_phrases']}")
+                 print(f"Sentiment: {row['sentiment']} (Score: {row['sentiment_score']:.3f}, Magnitude: {row['sentiment_magnitude']:.3f})")
+                 print("-" * 80)
+
+             # Delete interim files
+             import glob
+             interim_files = glob.glob(f"{OUTPUT_FOLDER}/{file_basename}_interim_*.csv")
+             for f in interim_files:
+                 try:
+                     os.remove(f)
+                     print(f"Deleted interim file: {os.path.basename(f)}")
+                 except:
+                     pass
+
+             # Clear memory after processing each file
+             clear_gpu_memory()
+         else:
+             print("No data processed for this file.")
+
+     # Clean up checkpoint file after successful processing
+     if os.path.exists(CHECKPOINT_FILE):
+         os.remove(CHECKPOINT_FILE)
+     print("\nAll files processed successfully!")
+
+ # ==============================================
+ # Colab Usage Example
+ # ==============================================
+
+ """
+ # EXAMPLE USAGE IN COLAB:
+
+ # 1. Install packages and mount drive
+ from google.colab import drive
+ drive.mount('/content/drive')
+
+ # 2. Process one specific file
+ input_file = "/content/drive/MyDrive/projects_twitter_post/zilliqa.csv"
+ main(reset_checkpoint=True, input_file=input_file)
+
+ # 3. Process all files
+ main(reset_checkpoint=True)
+ """
+
+ if __name__ == "__main__":
+     import sys
+
+     # Check if --reset flag is provided
+     reset_checkpoint = "--reset" in sys.argv
+
+     # Check if --file flag is provided
+     input_file = None
+     if "--file" in sys.argv:
+         try:
+             input_file = sys.argv[sys.argv.index("--file") + 1]
+         except IndexError:
+             print("Error: --file flag requires a filename argument")
+             sys.exit(1)
+
+     # Run the main function
+     main(reset_checkpoint=reset_checkpoint, input_file=input_file)''')
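+
+ # The generated script can also be run on its own, e.g. (paths are illustrative):
+ #   python process_tweet_huggingface.py --reset
+ #   python process_tweet_huggingface.py --file projects_twitter_post/sample.csv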
+
+ demo.launch()