abhisheksan committed
Commit 1aa3253 · verified · 1 Parent(s): 9c81152

Update twitter_service.py

Files changed (1)
  1. twitter_service.py +441 -442
twitter_service.py CHANGED
@@ -1,443 +1,442 @@
- import asyncio
- import json
- import os
- import time
- from datetime import datetime, timedelta, timezone
- from typing import Dict, List, Optional
-
- import httpx
- from cachetools import TTLCache
- from loguru import logger
-
- from models import NewsSource, Tweet
-
-
- class RssTwitterService:
-     """Service for collecting tweets via RSS feeds."""
-
-     def __init__(self):
-         self.cache_expiry = int(os.getenv("CACHE_EXPIRY_MINUTES", 120))
-
-         # HTTP client for making requests
-         self.client = None
-
-         # Enhanced cache with TTL and persistence
-         self.tweet_cache_dir = os.path.join(os.path.dirname(__file__), ".tweet_cache")
-         os.makedirs(self.tweet_cache_dir, exist_ok=True)
-         self.in_memory_cache = TTLCache(maxsize=100, ttl=self.cache_expiry * 60)
-
-         # Statistics and monitoring
-         self.stats = {
-             "requests": 0,
-             "cache_hits": 0,
-             "errors": 0,
-             "success": 0
-         }
-         self.last_stats_reset = time.time()
-
-         # Map of Twitter handles to RSS feed URLs - directly added
-         self.rss_feed_urls = {
-             "sidhant": "https://rss.app/feeds/v1.1/e3jA7zWvdgakMgqE.json",
-             "ShivAroor": "https://rss.app/feeds/v1.1/1f4kMzGI07mYZ83m.json",
-             "IAF_MCC": "https://rss.app/feeds/v1.1/KRlErbCqqu4sKtNP.json",
-             "adgpi": "https://rss.app/feeds/v1.1/br2tH5o30zxi6LjL.json",
-             "SpokespersonMoD": "https://rss.app/feeds/v1.1/tXYyx8q4L9xDNssq.json",
-             "MIB_India": "https://rss.app/feeds/v1.1/0pABfGkIm2Y1ru5z.json",
-             "DrSJaishankar": "https://rss.app/feeds/v1.1/Dq7PRmVOagKt3Q5D.json"
-         }
-
-         # Default trusted news sources
-         self.news_sources = [
-             NewsSource(name="Shiv Aroor", twitter_handle="ShivAroor", country="India", reliability_score=0.85),
-             NewsSource(name="Sidhant Sibal", twitter_handle="sidhant", country="India", reliability_score=0.85),
-             NewsSource(name="Indian Air Force", twitter_handle="IAF_MCC", country="India", reliability_score=0.95),
-             NewsSource(name="Indian Army", twitter_handle="adgpi", country="India", reliability_score=0.95),
-             NewsSource(name="Indian Defence Ministry", twitter_handle="SpokespersonMoD", country="India", reliability_score=0.95),
-             NewsSource(name="MIB India", twitter_handle="MIB_India", country="India", reliability_score=0.95),
-             NewsSource(name="Indian External Affairs Minister", twitter_handle="DrSJaishankar", country="India", reliability_score=0.95),
-         ]
-
-     async def initialize(self) -> bool:
-         """Initialize the Twitter service with RSS feed capability."""
-         try:
-             logger.info("Initializing Twitter service with RSS feed capability")
-
-             # Initialize HTTP client
-             self.client = httpx.AsyncClient(
-                 timeout=30.0,
-                 follow_redirects=True,
-                 http2=True
-             )
-
-             # Log the pre-registered RSS feeds
-             logger.info(f"Pre-registered {len(self.rss_feed_urls)} RSS feeds for Twitter handles")
-
-             # Schedule background maintenance
-             asyncio.create_task(self._background_maintenance())
-
-             logger.info("RSS Twitter service initialized successfully")
-             return True
-
-         except Exception as e:
-             logger.error(f"Failed to initialize RSS Twitter service: {str(e)}")
-             return False
-
-     async def _background_maintenance(self):
-         """Run background maintenance tasks"""
-         while True:
-             try:
-                 # Wait between maintenance cycles
-                 await asyncio.sleep(900)  # 15 minutes
-
-                 # Log statistics
-                 self._log_statistics()
-
-                 # Clean up cache files
-                 self._cleanup_expired_cache()
-
-                 # Reset statistics periodically
-                 if time.time() - self.last_stats_reset > 3600:  # Reset every hour
-                     self.stats = {key: 0 for key in self.stats}
-                     self.last_stats_reset = time.time()
-
-             except Exception as e:
-                 logger.error(f"Error in background maintenance: {str(e)}")
-
-     def _log_statistics(self):
-         """Log service statistics"""
-         total_requests = max(1, self.stats["requests"])
-         cache_hit_rate = self.stats["cache_hits"] / total_requests * 100
-         error_rate = self.stats["errors"] / total_requests * 100
-
-         logger.info(f"RssTwitterService stats - Requests: {total_requests}, " +
-                     f"Cache hits: {self.stats['cache_hits']} ({cache_hit_rate:.1f}%), " +
-                     f"Errors: {self.stats['errors']} ({error_rate:.1f}%)")
-
-     def _cleanup_expired_cache(self):
-         """Clean up expired cache files"""
-         now = time.time()
-         expiry_time = self.cache_expiry * 60
-
-         try:
-             for filename in os.listdir(self.tweet_cache_dir):
-                 if not filename.endswith('.json'):
-                     continue
-
-                 file_path = os.path.join(self.tweet_cache_dir, filename)
-
-                 try:
-                     file_modified_time = os.path.getmtime(file_path)
-                     if now - file_modified_time > expiry_time:
-                         os.remove(file_path)
-                         logger.debug(f"Removed expired cache file: {filename}")
-                 except Exception as e:
-                     logger.error(f"Error cleaning up cache file {filename}: {e}")
-         except Exception as e:
-             logger.error(f"Error during cache cleanup: {e}")
-
-     def _get_cache_key(self, twitter_handle, limit):
-         """Generate a cache key for a specific Twitter source"""
-         return f"{twitter_handle}_{limit}"
-
-     def _get_cache_path(self, cache_key):
-         """Get filesystem path for a cache key"""
-         # Create a safe filename from the cache key
-         import re
-         safe_key = re.sub(r'[^a-zA-Z0-9_-]', '_', cache_key)
-         return os.path.join(self.tweet_cache_dir, f"{safe_key}.json")
-
-     def _get_from_cache(self, cache_key):
-         """Get tweets from cache (memory or disk)"""
-         # Check memory cache first
-         if cache_key in self.in_memory_cache:
-             self.stats["cache_hits"] += 1
-             return self.in_memory_cache[cache_key]
-
-         # Check disk cache
-         cache_path = self._get_cache_path(cache_key)
-         if os.path.exists(cache_path):
-             try:
-                 with open(cache_path, 'r') as f:
-                     cache_data = json.load(f)
-
-                 # Check if cache is still valid
-                 if time.time() - cache_data['timestamp'] < self.cache_expiry * 60:
-                     # Convert dictionaries back to Tweet objects
-                     tweets = []
-                     for tweet_dict in cache_data['tweets']:
-                         # Parse created_at back to datetime if it's stored as a string
-                         if 'created_at' in tweet_dict and isinstance(tweet_dict['created_at'], str):
-                             try:
-                                 # Strip timezone info to make naive datetime
-                                 dt = datetime.fromisoformat(tweet_dict['created_at'].replace('Z', '+00:00'))
-                                 tweet_dict['created_at'] = dt.replace(tzinfo=None)
-                             except ValueError:
-                                 tweet_dict['created_at'] = datetime.now()
-
-                         tweets.append(Tweet(**tweet_dict))
-
-                     # Restore to memory cache and return
-                     self.in_memory_cache[cache_key] = tweets
-                     self.stats["cache_hits"] += 1
-                     return tweets
-                 else:
-                     # Cache expired, remove file
-                     os.remove(cache_path)
-             except Exception as e:
-                 logger.error(f"Error reading cache file {cache_path}: {e}")
-
-         return None
-
-     def _save_to_cache(self, cache_key, tweets):
-         """Save tweets to cache (memory and disk)"""
-         # Save to memory cache
-         self.in_memory_cache[cache_key] = tweets
-
-         # Convert tweets to dictionaries for JSON serialization
-         tweet_dicts = []
-         for tweet in tweets:
-             # Make sure created_at is serializable
-             if hasattr(tweet.created_at, 'tzinfo') and tweet.created_at.tzinfo is not None:
-                 # Convert to UTC and make naive for consistent comparisons
-                 created_at = tweet.created_at.astimezone(timezone.utc).replace(tzinfo=None)
-             else:
-                 created_at = tweet.created_at
-
-             tweet_dicts.append({
-                 'id': tweet.id,
-                 'text': tweet.text,
-                 'author': tweet.author,
-                 'created_at': created_at.isoformat() if hasattr(created_at, 'isoformat') else str(created_at),
-                 'engagement': tweet.engagement,
-                 'url': tweet.url
-             })
-
-         # Save to disk cache
-         cache_path = self._get_cache_path(cache_key)
-         try:
-             with open(cache_path, 'w') as f:
-                 json.dump({
-                     'tweets': tweet_dicts,
-                     'timestamp': time.time()
-                 }, f)
-         except Exception as e:
-             logger.error(f"Error writing to cache file {cache_path}: {e}")
-
-     def register_rss_feed(self, twitter_handle: str, rss_url: str):
-         """Register an RSS feed URL for a specific Twitter handle"""
-         self.rss_feed_urls[twitter_handle] = rss_url
-         logger.info(f"Registered RSS feed for {twitter_handle}: {rss_url}")
-
-     def register_rss_feed_batch(self, feed_map: Dict[str, str]):
-         """Register multiple RSS feeds at once"""
-         self.rss_feed_urls.update(feed_map)
-         logger.info(f"Registered {len(feed_map)} RSS feeds")
-
-     async def get_tweets_from_source(self, source: NewsSource, limit: int = 20, retries: int = 3) -> List[Tweet]:
-         """Get tweets from a specific Twitter source using RSS feed."""
-         cache_key = self._get_cache_key(source.twitter_handle, limit)
-
-         # Check cache first
-         cached_tweets = self._get_from_cache(cache_key)
-         if cached_tweets:
-             logger.debug(f"Returning cached tweets for {source.twitter_handle}")
-             return cached_tweets
-
-         self.stats["requests"] += 1
-
-         # Check if we have a registered RSS feed for this Twitter handle
-         rss_url = self.rss_feed_urls.get(source.twitter_handle)
-         if not rss_url:
-             logger.warning(f"No RSS feed registered for {source.twitter_handle}")
-             return []
-
-         # Extract tweets with retry logic
-         tweets = []
-
-         for attempt in range(retries + 1):
-             try:
-                 logger.info(f"Fetching tweets from RSS feed for {source.twitter_handle} (attempt {attempt + 1}/{retries + 1})")
-
-                 # Add cache-busting parameter
-                 params = {"_": str(int(time.time()))}
-
-                 response = await self.client.get(rss_url, params=params)
-
-                 if response.status_code == 200:
-                     # Parse the RSS JSON feed
-                     try:
-                         self.stats["success"] += 1
-                         rss_data = response.json()
-
-                         # Parse items from the feed
-                         if "items" in rss_data:
-                             items = rss_data["items"][:limit]
-
-                             for item in items:
-                                 try:
-                                     # Extract tweet ID from the URL
-                                     tweet_id = item.get("id", "").split("/")[-1]
-                                     if not tweet_id:
-                                         continue
-
-                                     # Extract tweet text
-                                     tweet_text = item.get("content_text", item.get("title", ""))
-
-                                     # Extract timestamp
-                                     date_str = item.get("date_published", "")
-                                     try:
-                                         # Convert to datetime then strip timezone info to make naive
-                                         dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
-                                         created_at = dt.replace(tzinfo=None)
-                                     except (ValueError, TypeError):
-                                         created_at = datetime.now()
-
-                                     # Extract engagement metrics if available
-                                     engagement = {"likes": 0, "retweets": 0, "replies": 0, "views": 0}
-
-                                     # Try to extract engagement from extensions or additional fields
-                                     if "x_metadata" in item:
-                                         x_data = item["x_metadata"]
-                                         engagement["likes"] = x_data.get("likes", 0)
-                                         engagement["retweets"] = x_data.get("retweets", 0)
-                                         engagement["replies"] = x_data.get("replies", 0)
-                                         engagement["views"] = x_data.get("views", 0)
-
-                                     # Construct tweet URL
-                                     tweet_url = item.get("url", f"https://x.com/{source.twitter_handle}/status/{tweet_id}")
-
-                                     tweets.append(
-                                         Tweet(
-                                             id=tweet_id,
-                                             text=tweet_text,
-                                             author=source.twitter_handle,
-                                             created_at=created_at,
-                                             engagement=engagement,
-                                             url=tweet_url
-                                         )
-                                     )
-                                 except Exception as e:
-                                     logger.error(f"Error processing tweet from RSS for {source.twitter_handle}: {str(e)}")
-
-                         # Cache the results
-                         if tweets:
-                             self._save_to_cache(cache_key, tweets)
-                             logger.info(f"Fetched and cached {len(tweets)} tweets from RSS feed for {source.twitter_handle}")
-
-                         return tweets
-
-                     except json.JSONDecodeError:
-                         logger.error(f"Invalid JSON response from RSS feed for {source.twitter_handle}")
-                         self.stats["errors"] += 1
-
-                         if attempt < retries:
-                             await asyncio.sleep(2)
-                             continue
-                         else:
-                             return []
-                 else:
-                     # HTTP error
-                     self.stats["errors"] += 1
-                     logger.error(f"Failed to fetch RSS feed for {source.twitter_handle}: HTTP {response.status_code}")
-
-                     if attempt < retries:
-                         await asyncio.sleep(5)
-                         continue
-                     else:
-                         return []
-
-             except Exception as e:
-                 self.stats["errors"] += 1
-                 logger.error(f"Error fetching RSS feed for {source.twitter_handle}: {str(e)}")
-
-                 if attempt < retries:
-                     await asyncio.sleep(5)
-                     continue
-
-         return []  # Return empty list if all retries failed
-
-     async def get_related_tweets(self, keywords: List[str], days_back: int = 2) -> List[Tweet]:
-         """
-         Get tweets related to specific keywords from trusted news sources only.
-         """
-         all_tweets = []
-         # Create naive datetime for consistent comparisons
-         cutoff_date = datetime.now() - timedelta(days=days_back)
-
-         # Filter to active sources that have RSS feeds
-         active_sources = [source for source in self.news_sources
-                           if source.is_active and source.twitter_handle in self.rss_feed_urls]
-
-         # Sort sources by reliability score (prioritize higher scores)
-         active_sources.sort(key=lambda s: s.reliability_score, reverse=True)
-
-         logger.info(f"Collecting tweets from {len(active_sources)} trusted news sources with RSS feeds")
-
-         # Process sources in parallel
-         tasks = []
-         for source in active_sources:
-             tasks.append(self.get_tweets_from_source(source, limit=50))
-
-         source_tweets_list = await asyncio.gather(*tasks)
-
-         # Process results
-         for source_tweets in source_tweets_list:
-             # Filter tweets by keywords and date
-             for tweet in source_tweets:
-                 try:
-                     # Make sure we're comparing naive datetimes
-                     tweet_date = tweet.created_at
-                     if hasattr(tweet_date, 'tzinfo') and tweet_date.tzinfo is not None:
-                         tweet_date = tweet_date.replace(tzinfo=None)
-
-                     if (tweet_date >= cutoff_date and
-                             any(keyword.lower() in tweet.text.lower() for keyword in keywords)):
-                         all_tweets.append(tweet)
-                 except Exception as e:
-                     logger.error(f"Error processing tweet during filtering: {str(e)}")
-
-         # If we have very few results, try with more relaxed filtering
-         if len(all_tweets) < 5 and active_sources:
-             logger.info("Few relevant tweets found, trying more relaxed filtering")
-
-             # Process all tweets again with more relaxed keyword matching
-             for source_tweets in source_tweets_list:
-                 for tweet in source_tweets:
-                     try:
-                         # Make sure we're comparing naive datetimes
-                         tweet_date = tweet.created_at
-                         if hasattr(tweet_date, 'tzinfo') and tweet_date.tzinfo is not None:
-                             tweet_date = tweet_date.replace(tzinfo=None)
-
-                         if tweet_date >= cutoff_date:
-                             for keyword in keywords:
-                                 # Split keyword into parts and check if any part matches
-                                 keyword_parts = keyword.lower().split()
-                                 if any(part in tweet.text.lower() for part in keyword_parts if len(part) > 3):
-                                     if tweet.id not in [t.id for t in all_tweets]:
-                                         all_tweets.append(tweet)
-                                     break
-                     except Exception as e:
-                         logger.error(f"Error during relaxed filtering: {str(e)}")
-
-         # Sort by recency
-         all_tweets.sort(key=lambda x: x.created_at, reverse=True)
-
-         logger.info(f"Found {len(all_tweets)} tweets from trusted sources related to keywords: {keywords}")
-         return all_tweets
-
-     def update_sources(self, sources: List[NewsSource]) -> None:
-         """Update the list of trusted news sources."""
-         self.news_sources = sources
-         # Clear cache when sources are updated
-         self.in_memory_cache.clear()
-         logger.info(f"Updated trusted news sources. New count: {len(sources)}")
-
-     def get_sources(self) -> List[NewsSource]:
-         """Get the current list of trusted news sources."""
-         return self.news_sources
-
-     async def close(self):
-         """Clean up resources."""
-         if self.client:
+ import asyncio
+ import json
+ import os
+ import time
+ from datetime import datetime, timedelta, timezone
+ from typing import Dict, List, Optional
+
+ import httpx
+ from cachetools import TTLCache
+ from loguru import logger
+
+ from models import NewsSource, Tweet
+
+
+ class RssTwitterService:
+     """Service for collecting tweets via RSS feeds."""
+
+     def __init__(self):
+         self.cache_expiry = int(os.getenv("CACHE_EXPIRY_MINUTES", 120))
+
+         # HTTP client for making requests
+         self.client = None
+
+         # Enhanced cache with TTL and persistence
+         self.tweet_cache_dir = os.path.join(os.path.dirname(__file__), ".tweet_cache")
+         os.makedirs(self.tweet_cache_dir, exist_ok=True)
+         self.in_memory_cache = TTLCache(maxsize=100, ttl=self.cache_expiry * 60)
+
+         # Statistics and monitoring
+         self.stats = {
+             "requests": 0,
+             "cache_hits": 0,
+             "errors": 0,
+             "success": 0
+         }
+         self.last_stats_reset = time.time()
+
+         # Map of Twitter handles to RSS feed URLs - directly added
+         self.rss_feed_urls = {
+             "sidhant": "https://rsshub.app/twitter/user/sidhant",
+             "ShivAroor": "https://rsshub.app/twitter/user/ShivAroor",
+             "IAF_MCC": "https://rsshub.app/twitter/user/IAF_MCC",
+             "adgpi": "https://rsshub.app/twitter/user/adgpi",
+             "SpokespersonMoD": "https://rsshub.app/twitter/user/SpokespersonMoD",
+             "MIB_India": "https://rsshub.app/twitter/user/MIB_India",
+         }
+
+         # Default trusted news sources
+         self.news_sources = [
+             NewsSource(name="Shiv Aroor", twitter_handle="ShivAroor", country="India", reliability_score=0.85),
+             NewsSource(name="Sidhant Sibal", twitter_handle="sidhant", country="India", reliability_score=0.85),
+             NewsSource(name="Indian Air Force", twitter_handle="IAF_MCC", country="India", reliability_score=0.95),
+             NewsSource(name="Indian Army", twitter_handle="adgpi", country="India", reliability_score=0.95),
+             NewsSource(name="Indian Defence Ministry", twitter_handle="SpokespersonMoD", country="India", reliability_score=0.95),
+             NewsSource(name="MIB India", twitter_handle="MIB_India", country="India", reliability_score=0.95),
+             NewsSource(name="Indian External Affairs Minister", twitter_handle="DrSJaishankar", country="India", reliability_score=0.95),
+         ]
+
+     async def initialize(self) -> bool:
+         """Initialize the Twitter service with RSS feed capability."""
+         try:
+             logger.info("Initializing Twitter service with RSS feed capability")
+
+             # Initialize HTTP client
+             self.client = httpx.AsyncClient(
+                 timeout=30.0,
+                 follow_redirects=True,
+                 http2=True
+             )
+
+             # Log the pre-registered RSS feeds
+             logger.info(f"Pre-registered {len(self.rss_feed_urls)} RSS feeds for Twitter handles")
+
+             # Schedule background maintenance
+             asyncio.create_task(self._background_maintenance())
+
+             logger.info("RSS Twitter service initialized successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to initialize RSS Twitter service: {str(e)}")
+             return False
+
+     async def _background_maintenance(self):
+         """Run background maintenance tasks"""
+         while True:
+             try:
+                 # Wait between maintenance cycles
+                 await asyncio.sleep(900)  # 15 minutes
+
+                 # Log statistics
+                 self._log_statistics()
+
+                 # Clean up cache files
+                 self._cleanup_expired_cache()
+
+                 # Reset statistics periodically
+                 if time.time() - self.last_stats_reset > 3600:  # Reset every hour
+                     self.stats = {key: 0 for key in self.stats}
+                     self.last_stats_reset = time.time()
+
+             except Exception as e:
+                 logger.error(f"Error in background maintenance: {str(e)}")
+
+     def _log_statistics(self):
+         """Log service statistics"""
+         total_requests = max(1, self.stats["requests"])
+         cache_hit_rate = self.stats["cache_hits"] / total_requests * 100
+         error_rate = self.stats["errors"] / total_requests * 100
+
+         logger.info(f"RssTwitterService stats - Requests: {total_requests}, " +
+                     f"Cache hits: {self.stats['cache_hits']} ({cache_hit_rate:.1f}%), " +
+                     f"Errors: {self.stats['errors']} ({error_rate:.1f}%)")
+
+     def _cleanup_expired_cache(self):
+         """Clean up expired cache files"""
+         now = time.time()
+         expiry_time = self.cache_expiry * 60
+
+         try:
+             for filename in os.listdir(self.tweet_cache_dir):
+                 if not filename.endswith('.json'):
+                     continue
+
+                 file_path = os.path.join(self.tweet_cache_dir, filename)
+
+                 try:
+                     file_modified_time = os.path.getmtime(file_path)
+                     if now - file_modified_time > expiry_time:
+                         os.remove(file_path)
+                         logger.debug(f"Removed expired cache file: {filename}")
+                 except Exception as e:
+                     logger.error(f"Error cleaning up cache file {filename}: {e}")
+         except Exception as e:
+             logger.error(f"Error during cache cleanup: {e}")
+
+     def _get_cache_key(self, twitter_handle, limit):
+         """Generate a cache key for a specific Twitter source"""
+         return f"{twitter_handle}_{limit}"
+
+     def _get_cache_path(self, cache_key):
+         """Get filesystem path for a cache key"""
+         # Create a safe filename from the cache key
+         import re
+         safe_key = re.sub(r'[^a-zA-Z0-9_-]', '_', cache_key)
+         return os.path.join(self.tweet_cache_dir, f"{safe_key}.json")
+
+     def _get_from_cache(self, cache_key):
+         """Get tweets from cache (memory or disk)"""
+         # Check memory cache first
+         if cache_key in self.in_memory_cache:
+             self.stats["cache_hits"] += 1
+             return self.in_memory_cache[cache_key]
+
+         # Check disk cache
+         cache_path = self._get_cache_path(cache_key)
+         if os.path.exists(cache_path):
+             try:
+                 with open(cache_path, 'r') as f:
+                     cache_data = json.load(f)
+
+                 # Check if cache is still valid
+                 if time.time() - cache_data['timestamp'] < self.cache_expiry * 60:
+                     # Convert dictionaries back to Tweet objects
+                     tweets = []
+                     for tweet_dict in cache_data['tweets']:
+                         # Parse created_at back to datetime if it's stored as a string
+                         if 'created_at' in tweet_dict and isinstance(tweet_dict['created_at'], str):
+                             try:
+                                 # Strip timezone info to make naive datetime
+                                 dt = datetime.fromisoformat(tweet_dict['created_at'].replace('Z', '+00:00'))
+                                 tweet_dict['created_at'] = dt.replace(tzinfo=None)
+                             except ValueError:
+                                 tweet_dict['created_at'] = datetime.now()
+
+                         tweets.append(Tweet(**tweet_dict))
+
+                     # Restore to memory cache and return
+                     self.in_memory_cache[cache_key] = tweets
+                     self.stats["cache_hits"] += 1
+                     return tweets
+                 else:
+                     # Cache expired, remove file
+                     os.remove(cache_path)
+             except Exception as e:
+                 logger.error(f"Error reading cache file {cache_path}: {e}")
+
+         return None
+
+     def _save_to_cache(self, cache_key, tweets):
+         """Save tweets to cache (memory and disk)"""
+         # Save to memory cache
+         self.in_memory_cache[cache_key] = tweets
+
+         # Convert tweets to dictionaries for JSON serialization
+         tweet_dicts = []
+         for tweet in tweets:
+             # Make sure created_at is serializable
+             if hasattr(tweet.created_at, 'tzinfo') and tweet.created_at.tzinfo is not None:
+                 # Convert to UTC and make naive for consistent comparisons
+                 created_at = tweet.created_at.astimezone(timezone.utc).replace(tzinfo=None)
+             else:
+                 created_at = tweet.created_at
+
+             tweet_dicts.append({
+                 'id': tweet.id,
+                 'text': tweet.text,
+                 'author': tweet.author,
+                 'created_at': created_at.isoformat() if hasattr(created_at, 'isoformat') else str(created_at),
+                 'engagement': tweet.engagement,
+                 'url': tweet.url
+             })
+
+         # Save to disk cache
+         cache_path = self._get_cache_path(cache_key)
+         try:
+             with open(cache_path, 'w') as f:
+                 json.dump({
+                     'tweets': tweet_dicts,
+                     'timestamp': time.time()
+                 }, f)
+         except Exception as e:
+             logger.error(f"Error writing to cache file {cache_path}: {e}")
+
+     def register_rss_feed(self, twitter_handle: str, rss_url: str):
+         """Register an RSS feed URL for a specific Twitter handle"""
+         self.rss_feed_urls[twitter_handle] = rss_url
+         logger.info(f"Registered RSS feed for {twitter_handle}: {rss_url}")
+
+     def register_rss_feed_batch(self, feed_map: Dict[str, str]):
+         """Register multiple RSS feeds at once"""
+         self.rss_feed_urls.update(feed_map)
+         logger.info(f"Registered {len(feed_map)} RSS feeds")
+
+     async def get_tweets_from_source(self, source: NewsSource, limit: int = 20, retries: int = 3) -> List[Tweet]:
+         """Get tweets from a specific Twitter source using RSS feed."""
+         cache_key = self._get_cache_key(source.twitter_handle, limit)
+
+         # Check cache first
+         cached_tweets = self._get_from_cache(cache_key)
+         if cached_tweets:
+             logger.debug(f"Returning cached tweets for {source.twitter_handle}")
+             return cached_tweets
+
+         self.stats["requests"] += 1
+
+         # Check if we have a registered RSS feed for this Twitter handle
+         rss_url = self.rss_feed_urls.get(source.twitter_handle)
+         if not rss_url:
+             logger.warning(f"No RSS feed registered for {source.twitter_handle}")
+             return []
+
+         # Extract tweets with retry logic
+         tweets = []
+
+         for attempt in range(retries + 1):
+             try:
+                 logger.info(f"Fetching tweets from RSS feed for {source.twitter_handle} (attempt {attempt + 1}/{retries + 1})")
+
+                 # Add cache-busting parameter
+                 params = {"_": str(int(time.time()))}
+
+                 response = await self.client.get(rss_url, params=params)
+
+                 if response.status_code == 200:
+                     # Parse the RSS JSON feed
+                     try:
+                         self.stats["success"] += 1
+                         rss_data = response.json()
+
+                         # Parse items from the feed
+                         if "items" in rss_data:
+                             items = rss_data["items"][:limit]
+
+                             for item in items:
+                                 try:
+                                     # Extract tweet ID from the URL
+                                     tweet_id = item.get("id", "").split("/")[-1]
+                                     if not tweet_id:
+                                         continue
+
+                                     # Extract tweet text
+                                     tweet_text = item.get("content_text", item.get("title", ""))
+
+                                     # Extract timestamp
+                                     date_str = item.get("date_published", "")
+                                     try:
+                                         # Convert to datetime then strip timezone info to make naive
+                                         dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+                                         created_at = dt.replace(tzinfo=None)
+                                     except (ValueError, TypeError):
+                                         created_at = datetime.now()
+
+                                     # Extract engagement metrics if available
+                                     engagement = {"likes": 0, "retweets": 0, "replies": 0, "views": 0}
+
+                                     # Try to extract engagement from extensions or additional fields
+                                     if "x_metadata" in item:
+                                         x_data = item["x_metadata"]
+                                         engagement["likes"] = x_data.get("likes", 0)
+                                         engagement["retweets"] = x_data.get("retweets", 0)
+                                         engagement["replies"] = x_data.get("replies", 0)
+                                         engagement["views"] = x_data.get("views", 0)
+
+                                     # Construct tweet URL
+                                     tweet_url = item.get("url", f"https://x.com/{source.twitter_handle}/status/{tweet_id}")
+
+                                     tweets.append(
+                                         Tweet(
+                                             id=tweet_id,
+                                             text=tweet_text,
+                                             author=source.twitter_handle,
+                                             created_at=created_at,
+                                             engagement=engagement,
+                                             url=tweet_url
+                                         )
+                                     )
+                                 except Exception as e:
+                                     logger.error(f"Error processing tweet from RSS for {source.twitter_handle}: {str(e)}")
+
+                         # Cache the results
+                         if tweets:
+                             self._save_to_cache(cache_key, tweets)
+                             logger.info(f"Fetched and cached {len(tweets)} tweets from RSS feed for {source.twitter_handle}")
+
+                         return tweets
+
+                     except json.JSONDecodeError:
+                         logger.error(f"Invalid JSON response from RSS feed for {source.twitter_handle}")
+                         self.stats["errors"] += 1
+
+                         if attempt < retries:
+                             await asyncio.sleep(2)
+                             continue
+                         else:
+                             return []
+                 else:
+                     # HTTP error
+                     self.stats["errors"] += 1
+                     logger.error(f"Failed to fetch RSS feed for {source.twitter_handle}: HTTP {response.status_code}")
+
+                     if attempt < retries:
+                         await asyncio.sleep(5)
+                         continue
+                     else:
+                         return []
+
+             except Exception as e:
+                 self.stats["errors"] += 1
+                 logger.error(f"Error fetching RSS feed for {source.twitter_handle}: {str(e)}")
+
+                 if attempt < retries:
+                     await asyncio.sleep(5)
+                     continue
+
+         return []  # Return empty list if all retries failed
+
+     async def get_related_tweets(self, keywords: List[str], days_back: int = 2) -> List[Tweet]:
+         """
+         Get tweets related to specific keywords from trusted news sources only.
+         """
+         all_tweets = []
+         # Create naive datetime for consistent comparisons
+         cutoff_date = datetime.now() - timedelta(days=days_back)
+
+         # Filter to active sources that have RSS feeds
+         active_sources = [source for source in self.news_sources
+                           if source.is_active and source.twitter_handle in self.rss_feed_urls]
+
+         # Sort sources by reliability score (prioritize higher scores)
+         active_sources.sort(key=lambda s: s.reliability_score, reverse=True)
+
+         logger.info(f"Collecting tweets from {len(active_sources)} trusted news sources with RSS feeds")
+
+         # Process sources in parallel
+         tasks = []
+         for source in active_sources:
+             tasks.append(self.get_tweets_from_source(source, limit=50))
+
+         source_tweets_list = await asyncio.gather(*tasks)
+
+         # Process results
+         for source_tweets in source_tweets_list:
+             # Filter tweets by keywords and date
+             for tweet in source_tweets:
+                 try:
+                     # Make sure we're comparing naive datetimes
+                     tweet_date = tweet.created_at
+                     if hasattr(tweet_date, 'tzinfo') and tweet_date.tzinfo is not None:
+                         tweet_date = tweet_date.replace(tzinfo=None)
+
+                     if (tweet_date >= cutoff_date and
+                             any(keyword.lower() in tweet.text.lower() for keyword in keywords)):
+                         all_tweets.append(tweet)
+                 except Exception as e:
+                     logger.error(f"Error processing tweet during filtering: {str(e)}")
+
+         # If we have very few results, try with more relaxed filtering
+         if len(all_tweets) < 5 and active_sources:
+             logger.info("Few relevant tweets found, trying more relaxed filtering")
+
+             # Process all tweets again with more relaxed keyword matching
+             for source_tweets in source_tweets_list:
+                 for tweet in source_tweets:
+                     try:
+                         # Make sure we're comparing naive datetimes
+                         tweet_date = tweet.created_at
+                         if hasattr(tweet_date, 'tzinfo') and tweet_date.tzinfo is not None:
+                             tweet_date = tweet_date.replace(tzinfo=None)
+
+                         if tweet_date >= cutoff_date:
+                             for keyword in keywords:
+                                 # Split keyword into parts and check if any part matches
+                                 keyword_parts = keyword.lower().split()
+                                 if any(part in tweet.text.lower() for part in keyword_parts if len(part) > 3):
+                                     if tweet.id not in [t.id for t in all_tweets]:
+                                         all_tweets.append(tweet)
+                                     break
+                     except Exception as e:
+                         logger.error(f"Error during relaxed filtering: {str(e)}")
+
+         # Sort by recency
+         all_tweets.sort(key=lambda x: x.created_at, reverse=True)
+
+         logger.info(f"Found {len(all_tweets)} tweets from trusted sources related to keywords: {keywords}")
+         return all_tweets
+
+     def update_sources(self, sources: List[NewsSource]) -> None:
+         """Update the list of trusted news sources."""
+         self.news_sources = sources
+         # Clear cache when sources are updated
+         self.in_memory_cache.clear()
+         logger.info(f"Updated trusted news sources. New count: {len(sources)}")
+
+     def get_sources(self) -> List[NewsSource]:
+         """Get the current list of trusted news sources."""
+         return self.news_sources
+
+     async def close(self):
+         """Clean up resources."""
+         if self.client:
              await self.client.aclose()