import gradio as gr
import pandas as pd
import os
import shutil
import subprocess
import sys

# Install the spaCy model needed by the processing script
os.system("python -m spacy download en_core_web_sm")

def process_tweets(files, reset_processing=False):
    # Save uploaded files
    file_paths = []
    for file in files:
        if file.name.endswith('.csv'):
            # Ensure directory exists
            os.makedirs("projects_twitter_post", exist_ok=True)
            # Save file to the directory
            dest_path = f"projects_twitter_post/{os.path.basename(file.name)}"
            shutil.copy(file.name, dest_path)
            file_paths.append(dest_path)

    if not file_paths:
        # Return values for both outputs (log textbox and files component)
        return "No CSV files uploaded. Please upload CSV files containing tweet data.", []

    # Run the processing script
    cmd = [sys.executable, "process_tweet_huggingface.py"]
    if reset_processing:
        cmd.append("--reset")
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Check if output files were created
    output_files = []
    for file_path in file_paths:
        base_name = os.path.basename(file_path).replace('.csv', '')
        processed_path = f"projects_twitter_post/{base_name}_processed.csv"
        analysis_path = f"projects_twitter_post/{base_name}_analysis.csv"
        if os.path.exists(processed_path):
            output_files.append(processed_path)
        if os.path.exists(analysis_path):
            output_files.append(analysis_path)

    return_files = [f for f in output_files if os.path.exists(f)]
    log_output = result.stdout + "\n" + result.stderr
    return log_output, return_files
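# Note (from how the processing script reads rows): each uploaded CSV is expected to
# contain at least a 'text' column with the tweet body; an 'id' column is used when present.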
with gr.Blocks() as demo:
    gr.Markdown("# Crypto Tweet Processor")
    gr.Markdown("Upload CSV files containing tweet data to process")

    with gr.Row():
        files_input = gr.File(file_count="multiple", label="Upload CSV Files")
        reset_checkbox = gr.Checkbox(label="Reset Processing", value=False)

    process_btn = gr.Button("Process Tweets")
    output_text = gr.Textbox(label="Processing Log")
    output_files = gr.File(label="Processed Files", file_count="multiple")

    process_btn.click(
        process_tweets,
        inputs=[files_input, reset_checkbox],
        outputs=[output_text, output_files]
    )
# The modified processing script is embedded below as a single string and written to
# disk so the subprocess call in process_tweets() can run it. It is wrapped in
# single-quoted triple quotes because the script itself contains """ docstrings.
PROCESSING_SCRIPT = r'''
import os
import re
import json
import numpy as np
import pandas as pd
import torch
import math
import gc
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import spacy
# ==============================================
# COLAB SETUP - Run these cells first in Colab
# ==============================================

# Uncomment and run this cell to mount your Google Drive
"""
from google.colab import drive
drive.mount('/content/drive')
"""

# Uncomment and run this cell to install required packages
"""
!pip install pandas tqdm transformers spacy
!python -m spacy download en_core_web_sm
"""

# Uncomment and run this cell to verify GPU availability
"""
import torch
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
"""
# ==============================================
# Constants - Update these paths for your setup
# ==============================================
# Update this to your Google Drive path
DRIVE_PATH = "./projects_twitter_post"
OUTPUT_FOLDER = f"{DRIVE_PATH}"
CHECKPOINT_FILE = f"{OUTPUT_FOLDER}/processing_checkpoint.json"
BATCH_SIZE = 500  # Reduced batch size for GPU memory management

# Create output folder if it doesn't exist
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
# ==============================================
# Model Initialization with GPU Acceleration
# ==============================================
print("Loading RoBERTa model...")
model_name = "roberta-base"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize with GPU acceleration
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
nlp_pipeline = pipeline("fill-mask", model=model_name, device=0 if torch.cuda.is_available() else -1)

# Initialize sentiment analysis pipeline
print("Loading sentiment analysis model...")
try:
    # Use a Twitter-specific sentiment model for better results on social media text
    sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment"
    sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, device=0 if torch.cuda.is_available() else -1)
    SENTIMENT_AVAILABLE = True
except Exception as e:
    print(f"Error loading sentiment model: {e}")
    # Fall back to the default sentiment model if the Twitter-specific one fails
    try:
        sentiment_pipeline = pipeline("sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
        SENTIMENT_AVAILABLE = True
    except Exception:
        print("Sentiment analysis not available. Continuing without sentiment analysis.")
        SENTIMENT_AVAILABLE = False

# Try to load spaCy for basic text preprocessing
try:
    spacy_nlp = spacy.load("en_core_web_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    print("spaCy not available. Using basic text processing instead.")
# Crypto-specific keywords with hierarchical categories
CRYPTO_TAXONOMY = {
    "COIN": {
        "MAJOR": [
            "bitcoin", "ethereum", "btc", "eth", "bnb", "xrp", "sol", "doge",
            "cardano", "polkadot", "dot", "avalanche", "avax", "solana", "polygon", "matic"
        ],
        "STABLECOIN": [
            "tether", "usdt", "usdc", "busd", "dai", "frax", "tusd", "usdd", "lusd", "gusd", "husd"
        ],
        "ALTCOIN": [
            "litecoin", "ltc", "chainlink", "link", "stellar", "xlm", "dogecoin", "shib",
            "tron", "trx", "cosmos", "atom", "near", "algo", "fantom", "ftm", "monero", "xmr"
        ],
        "DEFI": [
            "uniswap", "uni", "aave", "sushi", "cake", "comp", "maker", "mkr", "curve", "crv",
            "yearn", "yfi", "compound", "balancer", "bal", "synthetix", "snx"
        ],
        "UTILITY": [
            "filecoin", "fil", "the graph", "grt", "arweave", "ar", "chainlink", "link",
            "helium", "hnt", "theta", "icp"
        ],
        "NFT": [
            "enjin", "enj", "decentraland", "mana", "sandbox", "sand", "axie", "axs",
            "gala", "apecoin", "ape", "flow", "ens", "stepn", "gmt"
        ]
    },
    "TECH": {
        "CONCEPTS": [
            "blockchain", "defi", "nft", "dao", "smart contract", "web3", "dapp", "protocol",
            "consensus", "tokenomics", "tokenization"
        ],
        "CHAIN_TYPES": [
            "layer1", "layer2", "rollup", "sidechain", "mainnet", "testnet", "devnet",
            "pow", "pos", "poh", "pbft", "dpos"
        ],
        "PRIVACY": [
            "zk", "zk-rollups", "zero-knowledge", "zkp", "zksnark", "zkstark", "mpc",
            "privacy", "private", "anonymous", "confidential", "encrypted"
        ],
        "SECTORS": [
            "defi", "cefi", "gamefi", "metaverse", "socialfi", "fintech", "realfi",
            "play-to-earn", "move-to-earn", "learn-to-earn", "x-to-earn", "defai", "depin", "desci",
            "refi", "did", "dedata", "dedao", "deid", "deai", "degov", "decloud", "dehealth",
            "decex", "deinsurance", "deworkplace", "public goods", "zk", "ordinals", "soulbound",
            "onchain gaming", "ai agents", "infrastructure", "credentials", "restaking", "modular blockchain",
            "liquid staking", "real world assets", "rwa", "synthetic assets", "account abstraction"
        ]
    },
    "ACTION": {
        "TRADING": [
            "buy", "sell", "long", "short", "margin", "leverage", "trade", "swap",
            "arbitrage", "dca", "ape", "pump", "dump", "moon", "ath", "atl", "breakout",
            "correction", "consolidation", "accumulate", "distribute", "front run", "front runner",
            "front running", "mev", "sandwich attack"
        ],
        "DEFI": [
            "stake", "yield", "farm", "lend", "borrow", "supply", "withdraw", "claim",
            "harvest", "flash loan", "liquidate", "collateralize", "wrap", "unwrap", "bridge",
            "provide liquidity", "withdraw liquidity", "impermanent loss"
        ],
        "GOVERNANCE": [
            "delegate", "vote", "propose", "governance", "dao", "snapshot", "quorum",
            "execution", "timelock", "veto"
        ],
        "NFT": [
            "mint", "airdrop", "whitelist", "burn", "floor price", "rarity", "trait", "pfp",
            "collection", "secondary", "flip"
        ],
        "DEVELOPMENT": [
            "deploy", "audit", "fork", "bootstrap", "initiate", "merge", "split",
            "rebase", "optimize", "gas optimization", "implement", "compile"
        ]
    },
    "PLATFORM": {
        "EXCHANGE": [
            "coinbase", "binance", "kraken", "kucoin", "ftx", "okx", "bybit", "bitfinex",
            "huobi", "gate", "gemini", "bitstamp", "bittrex", "crypto.com", "cex", "dex"
        ],
        "WALLET": [
            "metamask", "phantom", "trust wallet", "ledger", "trezor", "argent", "rainbow",
            "wallet", "hot wallet", "cold storage", "hardware wallet", "seed phrase"
        ],
        "NFT_MARKET": [
            "opensea", "rarible", "foundation", "superrare", "looksrare", "blur", "magic eden",
            "nifty gateway", "zora", "x2y2", "element"
        ],
        "INFRA": [
            "alchemy", "infura", "moralis", "quicknode", "ceramic", "arweave", "ipfs",
            "node", "rpc", "api", "indexer", "subgraph"
        ]
    },
    "NETWORK": {
        "LAYER1": [
            "ethereum", "bitcoin", "solana", "avalanche", "polygon", "bnb chain", "bsc",
            "cardano", "polkadot", "cosmos", "algorand", "tezos", "flow", "near", "tron"
        ],
        "LAYER2": [
            "arbitrum", "optimism", "zksync", "starknet", "base", "polygon", "loopring",
            "immutablex", "metis", "boba", "aztec", "validium", "zkevm"
        ],
        "INTEROPERABILITY": [
            "cosmos", "polkadot", "kusama", "moonbeam", "moonriver", "parachains", "relay chain",
            "ibc", "cross-chain", "bridge"
        ]
    },
    "EVENTS": {
        "MARKET": [
            "bull market", "bear market", "bull run", "bear trap", "bull trap", "halving",
            "capitulation", "golden cross", "death cross", "breakout", "resistance", "support"
        ],
        "SECURITY": [
            "hack", "exploit", "vulnerability", "scam", "phishing", "rug pull", "honeypot",
            "flash crash", "attack", "51% attack", "front running", "sandwich attack", "mev extraction"
        ],
        "TOKEN_EVENTS": [
            "airdrop", "token unlock", "vesting", "ico", "ido", "ito", "ieo", "fair launch",
            "private sale", "seed round", "listing", "delisting"
        ]
    },
    "METRICS": {
        "FINANCIAL": [
            "apy", "apr", "roi", "tvl", "market cap", "mcap", "volume", "liquidity", "supply",
            "circulating supply", "total supply", "max supply", "inflation", "deflation",
            "volatility", "dominance"
        ],
        "TECHNICAL": [
            "gas fee", "gas price", "gas limit", "slippage", "impermanent loss", "yield",
            "hashrate", "difficulty", "tps", "latency", "finality", "block time", "block size",
            "block reward"
        ]
    },
    "COMMUNITY": {
        "ROLES": [
            "whale", "degen", "anon", "influencer", "kol", "thought leader", "ambassador",
            "advocate", "og", "contributor", "dev", "builder", "founder", "investor", "vc",
            "angel", "team", "core team", "front runner", "mev bot", "searcher", "validator",
            "miner", "node operator", "liquidity provider", "market maker", "arbitrageur"
        ],
        "SLANG": [
            "diamond hands", "paper hands", "wagmi", "ngmi", "gm", "gn", "ser", "based",
            "crypto twitter", "ct", "alpha", "dyor", "fomo", "fud", "hodl", "rekt"
        ]
    }
}
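# The taxonomy above is a two-level mapping: main category -> sub-category -> list of
# lower-cased terms. identify_crypto_entities() below walks it and reports each term
# found in a tweet as a (term, main_category, sub_category) tuple, e.g.
# ("eth", "COIN", "MAJOR").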
# ==============================================
# Helper Functions
# ==============================================
def clear_gpu_memory():
    """Clear GPU memory to prevent OOM errors"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

def load_checkpoint():
    """Load processing checkpoint if it exists"""
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    return {'last_processed_index': 0}

def save_checkpoint(index):
    """Save the current processing index to a checkpoint file"""
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump({'last_processed_index': index}, f)
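# The checkpoint file written above is a small JSON document of the form
# {"last_processed_index": <row index>}, so processing can resume after a crash
# or session timeout instead of starting over.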
def identify_crypto_entities(text: str) -> list:
    """
    Identify crypto-specific entities in text using the hierarchical taxonomy.

    Args:
        text (str): Text to analyze

    Returns:
        list: List of tuples (entity, main_category, sub_category)
    """
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    found_entities = []

    # Search for each entity in the taxonomy
    for main_cat, subcats in CRYPTO_TAXONOMY.items():
        for subcat, terms in subcats.items():
            for term in terms:
                # Avoid matching partial words (ensure word boundaries)
                pattern = r'\b' + re.escape(term) + r'\b'
                if re.search(pattern, text_lower):
                    found_entities.append((term, main_cat, subcat))

    return found_entities
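# Illustrative example (not executed): for the tweet
# "Bridging ETH to arbitrum before the next bull run", the matcher would return
# tuples such as ("eth", "COIN", "MAJOR"), ("arbitrum", "NETWORK", "LAYER2") and
# ("bull run", "EVENTS", "MARKET"). Matching lower-cases the text and requires
# \b word boundaries, so partial-word hits are ignored.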
def clean_text(text: str) -> str:
    """Clean text while preserving mentions and hashtags"""
    if not isinstance(text, str):
        return ""

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove non-alphanumeric characters (except mentions, hashtags, and spaces)
    text = re.sub(r'[^\w\s@#]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()
def process_nlp_text(text: str) -> str:
    """Process text with advanced NLP (lemmatization, stopword removal)"""
    if not isinstance(text, str):
        return ""

    # Basic cleaning
    text = clean_text(text)

    if SPACY_AVAILABLE:
        # Process with spaCy for advanced NLP
        doc = spacy_nlp(text)
        # Lemmatize and remove stopwords
        processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        return " ".join(processed_tokens)
    else:
        # Fall back to basic cleaning if spaCy is not available
        return text

def extract_mentions(text: str) -> list:
    """Extract @mentions from text"""
    if not isinstance(text, str):
        return []
    return re.findall(r'@(\w+)', text)

def extract_hashtags(text: str) -> list:
    """Extract #hashtags from text"""
    if not isinstance(text, str):
        return []
    return re.findall(r'#(\w+)', text)

def extract_urls(text: str) -> list:
    """Extract URLs from text"""
    if not isinstance(text, str):
        return []
    return re.findall(r'(https?://\S+)', text)
def analyze_sentiment(text: str) -> dict:
    """
    Analyze the sentiment of a text using the sentiment analysis pipeline.

    Args:
        text (str): The text to analyze

    Returns:
        dict: A dictionary containing sentiment label and score
    """
    if not SENTIMENT_AVAILABLE or not text.strip():
        return {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}

    try:
        # Pre-process the text to improve sentiment analysis accuracy
        # Limit text length to avoid errors with very long tweets
        truncated_text = text[:512] if len(text) > 512 else text

        # Get sentiment prediction
        sentiment_result = sentiment_pipeline(truncated_text)[0]
        label = sentiment_result['label']
        score = sentiment_result['score']

        # Map to standardized format (positive, negative, neutral)
        sentiment_mapping = {
            'LABEL_0': 'negative',
            'LABEL_1': 'neutral',
            'LABEL_2': 'positive',
            'NEGATIVE': 'negative',
            'NEUTRAL': 'neutral',
            'POSITIVE': 'positive'
        }
        standardized_sentiment = sentiment_mapping.get(label, label.lower())

        # Calculate magnitude (confidence) - useful for filtering high-confidence sentiments
        magnitude = abs(score - 0.5) * 2 if standardized_sentiment != 'neutral' else score

        return {
            "sentiment": standardized_sentiment,
            "sentiment_score": score,
            "sentiment_magnitude": magnitude
        }
    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return {"sentiment": "error", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
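# Note: cardiffnlp/twitter-roberta-base-sentiment emits LABEL_0/LABEL_1/LABEL_2
# (negative/neutral/positive), while the generic fallback pipeline emits
# NEGATIVE/POSITIVE, so both naming schemes are covered by the mapping above.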
def process_with_nlp(text: str) -> dict:
    """
    Process text with NLP to extract named entities, key phrases, etc.

    Args:
        text (str): The text to process

    Returns:
        dict: A dictionary containing NLP processing results
    """
    results = {
        "named_entities": [],
        "pos_tags": [],
        "lemmatized_tokens": [],
        "key_phrases": [],
        "important_nouns": [],
        "sentiment_analysis": {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}
    }

    if not text or text.isspace():
        return results

    # First, analyze sentiment
    results["sentiment_analysis"] = analyze_sentiment(text)

    try:
        # Use spaCy for advanced NLP if available
        if SPACY_AVAILABLE:
            doc = spacy_nlp(text)
            # Extract named entities (excluding crypto entities which are handled separately)
            results["named_entities"] = [(ent.text, ent.label_) for ent in doc.ents]
            # Extract POS tags for content words
            results["pos_tags"] = [(token.text, token.pos_) for token in doc
                                   if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"] and not token.is_stop]
            # Get lemmatized tokens (normalized words)
            results["lemmatized_tokens"] = [token.lemma_ for token in doc
                                            if not token.is_stop and not token.is_punct and token.text.strip()]
            # Extract important nouns (potential topics)
            results["important_nouns"] = [token.text for token in doc
                                          if token.pos_ == "NOUN" and not token.is_stop]
            # Try to extract key phrases using noun chunks
            results["key_phrases"] = [chunk.text for chunk in doc.noun_chunks
                                      if len(chunk.text.split()) > 1]

        # If key phrases are empty, use RoBERTa to attempt extraction
        if not results["key_phrases"] and len(text.split()) > 3:
            try:
                # Create a masked sentence from the text
                words = text.split()
                if len(words) > 5:
                    # Get 3 random positions to mask
                    import random
                    positions = sorted(random.sample(range(len(words)), min(3, len(words))))
                    key_terms = []
                    for pos in positions:
                        words_copy = words.copy()
                        words_copy[pos] = tokenizer.mask_token
                        masked_text = " ".join(words_copy)
                        # Get predictions for the masked token
                        predictions = nlp_pipeline(masked_text, top_k=2)
                        for prediction in predictions:
                            key_terms.append(prediction["token_str"].strip())
                    results["key_phrases"].extend(key_terms)
            except Exception as e:
                print(f"Error in key phrase extraction: {e}")

        # Ensure all results are strings for CSV output
        results["named_entities"] = ";".join([f"{ent[0]}:{ent[1]}" for ent in results["named_entities"]])
        results["pos_tags"] = ";".join([f"{tag[0]}:{tag[1]}" for tag in results["pos_tags"]])
        results["lemmatized_tokens"] = ";".join(results["lemmatized_tokens"])
        results["key_phrases"] = ";".join(list(set(results["key_phrases"])))  # Remove duplicates
        results["important_nouns"] = ";".join(list(set(results["important_nouns"])))  # Remove duplicates
    except Exception as e:
        print(f"Error in NLP processing: {e}")

    # Clear GPU memory after processing
    if (results["named_entities"].count(";") > 100) or (len(text) > 1000):
        clear_gpu_memory()

    return results
def process_tweet(text: str) -> tuple:
    """
    Process a tweet to extract mentions, hashtags, URLs, crypto entities, and perform NLP analysis.
    Also performs sentiment analysis.

    Args:
        text (str): The tweet text to process

    Returns:
        tuple: mentions, hashtags, URLs, cleaned text, NLP-processed text,
               NLP results, and sentiment analysis
    """
    if not text or not isinstance(text, str):
        return [], [], [], "", "", {}, {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0}

    # Clean the text while preserving mentions and hashtags
    cleaned_text = clean_text(text)

    # Process text with NLP
    processed_text = process_nlp_text(text)

    # Extract mentions, hashtags, and URLs
    mentions = extract_mentions(text)
    hashtags = extract_hashtags(text)
    urls = extract_urls(text)

    # Identify crypto entities
    crypto_entities = identify_crypto_entities(text)

    # Process with NLP models
    nlp_results = process_with_nlp(text)

    # Ensure we have the sentiment analysis results
    sentiment_results = nlp_results.pop("sentiment_analysis", {"sentiment": "unknown", "sentiment_score": 0.0, "sentiment_magnitude": 0.0})

    # Add crypto entities to the named entities
    formatted_crypto_entities = [f"{entity}:{main_cat}.{sub_cat}" for entity, main_cat, sub_cat in crypto_entities]

    # named_entities is a semicolon-joined string at this point, so append accordingly
    if isinstance(nlp_results.get("named_entities", ""), str):
        nlp_results["named_entities"] = nlp_results.get("named_entities", "")
        if nlp_results["named_entities"] and formatted_crypto_entities:
            nlp_results["named_entities"] += ";" + ";".join(formatted_crypto_entities)
        elif formatted_crypto_entities:
            nlp_results["named_entities"] = ";".join(formatted_crypto_entities)

    return mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results
def process_batch(df_batch):
    """Process a batch of tweets"""
    processed_data = []

    for idx, row in df_batch.iterrows():
        text = row.get('text', '')

        # Process the tweet
        mentions, hashtags, urls, cleaned_text, processed_text, nlp_results, sentiment_results = process_tweet(text)

        # Create a dictionary with the results
        result = {
            'id': row.get('id', ''),
            'original_text': text,  # Store the original text
            'cleaned_text': cleaned_text,
            'nlp_processed_text': processed_text,
            'extracted_mentions': ';'.join(mentions),
            'extracted_hashtags': ';'.join(hashtags),
            'extracted_urls': ';'.join(urls),
            'named_entities': nlp_results.get('named_entities', ''),
            'pos_tags': nlp_results.get('pos_tags', ''),
            'lemmatized_tokens': nlp_results.get('lemmatized_tokens', ''),
            'key_phrases': nlp_results.get('key_phrases', ''),
            'important_nouns': nlp_results.get('important_nouns', ''),
            'sentiment': sentiment_results.get('sentiment', 'unknown'),
            'sentiment_score': sentiment_results.get('sentiment_score', 0.0),
            'sentiment_magnitude': sentiment_results.get('sentiment_magnitude', 0.0)
        }
        processed_data.append(result)

    return pd.DataFrame(processed_data)
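# Each batch yields one row per tweet, with every list-valued field flattened to a
# ';'-joined string so the result serializes cleanly to CSV.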
# ==============================================
# Main Processing Function
# ==============================================
def main(reset_checkpoint=False, input_file=None):
    """
    Main function to process tweets

    Args:
        reset_checkpoint (bool): Whether to reset the checkpoint and process all data
        input_file (str): Optional specific input file to process, otherwise processes all CSV files
    """
    if reset_checkpoint and os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)
        print("Checkpoint reset. Will process all data from the beginning.")

    # Get list of CSV files to process
    if input_file:
        # Process a specific file
        input_files = [input_file]
    else:
        # Find all CSV files in the OUTPUT_FOLDER
        import glob
        input_files = glob.glob(f"{OUTPUT_FOLDER}/*.csv")
        # Exclude our output files
        input_files = [f for f in input_files if not any(x in f for x in ["_processed.csv", "_analysis.csv"])]

    if not input_files:
        print(f"No input CSV files found in {OUTPUT_FOLDER}")
        return
    print(f"Found {len(input_files)} files to process: {[os.path.basename(f) for f in input_files]}")

    # Process each file
    for input_csv in input_files:
        print(f"\nProcessing file: {os.path.basename(input_csv)}")
        print("Loading dataset...")

        # Check if input file exists
        if not os.path.exists(input_csv):
            print(f"Input file {input_csv} not found. Skipping.")
            continue

        # Load the dataset
        try:
            df = pd.read_csv(input_csv)
            print(f"Loaded dataset with {len(df)} records and {len(df.columns)} columns.")
        except Exception as e:
            print(f"Error loading {input_csv}: {e}")
            continue

        # Load checkpoint if it exists
        checkpoint = load_checkpoint()
        start_idx = checkpoint['last_processed_index']

        # For simplicity, reset checkpoints between files
        start_idx = 0
        save_checkpoint(0)

        print("\nProcessing tweets...")
        print(f"Starting from index {start_idx}")

        # Filter to only unprocessed rows
        df_to_process = df.iloc[start_idx:]
        if len(df_to_process) == 0:
            print("No new data to process in this file.")
            continue

        # Process in batches for memory efficiency
        batch_size = BATCH_SIZE
        num_batches = math.ceil(len(df_to_process) / batch_size)
        print(f"Processing in {num_batches} batches of {batch_size} records each")

        processed_batches = []

        # Create progress bar
        for i in tqdm(range(num_batches)):
            batch_start = i * batch_size
            batch_end = min((i + 1) * batch_size, len(df_to_process))

            # Get current batch
            df_batch = df_to_process.iloc[batch_start:batch_end]

            # Process the batch
            processed_batch = process_batch(df_batch)
            processed_batches.append(processed_batch)

            # Save checkpoint
            save_checkpoint(start_idx + batch_end)

            # Save intermediate results every 5 batches to prevent data loss in case of session timeout
            if i % 5 == 0 and i > 0:
                file_basename = os.path.splitext(os.path.basename(input_csv))[0]
                interim_df = pd.concat(processed_batches, ignore_index=True)
                interim_file = f"{OUTPUT_FOLDER}/{file_basename}_interim_{i}.csv"
                interim_df.to_csv(interim_file, index=False)
                print(f"\nSaved interim results to {interim_file}")

            # Clear memory
            clear_gpu_memory()
        # Combine all batches
        if processed_batches:
            file_basename = os.path.splitext(os.path.basename(input_csv))[0]
            final_df = pd.concat(processed_batches, ignore_index=True)

            # Calculate statistics columns
            final_df["mention_count"] = final_df["extracted_mentions"].str.count(";") + (final_df["extracted_mentions"] != "").astype(int)
            final_df["hashtag_count"] = final_df["extracted_hashtags"].str.count(";") + (final_df["extracted_hashtags"] != "").astype(int)
            final_df["entity_count"] = final_df["named_entities"].str.count(";") + (final_df["named_entities"] != "").astype(int)

            # Save the full processed dataset
            output_file = f"{OUTPUT_FOLDER}/{file_basename}_processed.csv"
            final_df.to_csv(output_file, index=False)
            print(f"Processed data saved to {output_file}")

            # Create a lighter version with just the analysis
            analysis_columns = [
                "id", "original_text", "cleaned_text", "nlp_processed_text",
                "extracted_mentions", "extracted_hashtags", "extracted_urls",
                "named_entities", "key_phrases", "important_nouns",
                "sentiment", "sentiment_score", "sentiment_magnitude",
                "mention_count", "hashtag_count", "entity_count"
            ]
            # Ensure all columns exist before subsetting
            available_columns = [col for col in analysis_columns if col in final_df.columns]
            analysis_df = final_df[available_columns]

            analysis_file = f"{OUTPUT_FOLDER}/{file_basename}_analysis.csv"
            analysis_df.to_csv(analysis_file, index=False)
            print(f"Analysis results saved to {analysis_file}")

            # Print statistics
            print("\nAnalysis completed successfully!")
            print(f"Total records: {len(final_df)}")
            print(f"Tweets with Mentions: {(final_df['extracted_mentions'] != '').sum()}")
            print(f"Tweets with Hashtags: {(final_df['extracted_hashtags'] != '').sum()}")
            print(f"Tweets with Named Entities: {(final_df['named_entities'] != '').sum()}")

            # Print sentiment statistics
            sentiment_counts = final_df['sentiment'].value_counts()
            print("\nSentiment Distribution:")
            for sentiment, count in sentiment_counts.items():
                percentage = (count / len(final_df)) * 100
                print(f" {sentiment}: {count} tweets ({percentage:.1f}%)")

            # Get average sentiment scores
            avg_score = final_df['sentiment_score'].mean()
            avg_magnitude = final_df['sentiment_magnitude'].mean()
            print(f"\nAverage sentiment score: {avg_score:.3f}")
            print(f"Average sentiment magnitude: {avg_magnitude:.3f}")

            # Get top entities by sentiment
            positive_entities = []
            for idx, row in final_df[final_df['sentiment'] == 'positive'].iterrows():
                entities = row['named_entities'].split(';') if isinstance(row['named_entities'], str) and row['named_entities'] else []
                for entity in entities:
                    if entity and ':' in entity:
                        entity_name = entity.split(':')[0]
                        positive_entities.append(entity_name)

            # Get the most common positive entities
            from collections import Counter
            top_positive = Counter(positive_entities).most_common(5)
            if top_positive:
                print("\nTop entities with positive sentiment:")
                for entity, count in top_positive:
                    print(f" {entity}: {count} mentions")

            # Print sample results
            print("\nSample of processing results:")
            for i, row in analysis_df.head(3).iterrows():
                print(f"\nOriginal Text: {row['original_text']}")
                print(f"Cleaned Text: {row['cleaned_text']}")
                print(f"NLP Processed Text: {row['nlp_processed_text']}")
                print(f"Mentions: {row['extracted_mentions']}")
                print(f"Hashtags: {row['extracted_hashtags']}")
                print(f"Named Entities: {row['named_entities']}")
                print(f"Key Phrases: {row['key_phrases']}")
                print(f"Sentiment: {row['sentiment']} (Score: {row['sentiment_score']:.3f}, Magnitude: {row['sentiment_magnitude']:.3f})")
                print("-" * 80)

            # Delete interim files
            import glob
            interim_files = glob.glob(f"{OUTPUT_FOLDER}/{file_basename}_interim_*.csv")
            for f in interim_files:
                try:
                    os.remove(f)
                    print(f"Deleted interim file: {os.path.basename(f)}")
                except OSError:
                    pass

            # Clear memory after processing each file
            clear_gpu_memory()
        else:
            print("No data processed for this file.")

    # Clean up checkpoint file after successful processing
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)

    print("\nAll files processed successfully!")
# ==============================================
# Colab Usage Example
# ==============================================
"""
# EXAMPLE USAGE IN COLAB:

# 1. Install packages and mount drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Process one specific file
input_file = "/content/drive/MyDrive/projects_twitter_post/zilliqa.csv"
main(reset_checkpoint=True, input_file=input_file)

# 3. Process all files
main(reset_checkpoint=True)
"""
if __name__ == "__main__":
    import sys

    # Check if --reset flag is provided
    reset_checkpoint = "--reset" in sys.argv

    # Check if --file flag is provided
    input_file = None
    if "--file" in sys.argv:
        try:
            input_file = sys.argv[sys.argv.index("--file") + 1]
        except IndexError:
            print("Error: --file flag requires a filename argument")
            sys.exit(1)

    # Run the main function
    main(reset_checkpoint=reset_checkpoint, input_file=input_file)
'''

# Write the embedded processing script to disk so the Gradio handler above can run it.
with open("process_tweet_huggingface.py", "w") as f:
    f.write(PROCESSING_SCRIPT)
demo.launch()