abhisheksan committed · Commit e40e84a · verified · 1 Parent(s): 1aa3253

Update twitter_service.py

Files changed (1)
  1. twitter_service.py +107 -86
twitter_service.py CHANGED
@@ -1,7 +1,9 @@
 import asyncio
 import json
 import os
+import re
 import time
+import xml.etree.ElementTree as ET
 from datetime import datetime, timedelta, timezone
 from typing import Dict, List, Optional

@@ -13,7 +15,7 @@ from models import NewsSource, Tweet


 class RssTwitterService:
-    """Service for collecting tweets via RSS feeds."""
+    """Service for collecting tweets via RSSHub's XML feeds."""

     def __init__(self):
         self.cache_expiry = int(os.getenv("CACHE_EXPIRY_MINUTES", 120))
@@ -35,14 +37,16 @@ class RssTwitterService:
         }
         self.last_stats_reset = time.time()

-        # Map of Twitter handles to RSS feed URLs - directly added
+        # Map of Twitter handles to RSSHub URLs - directly added
+        self.rsshub_base_url = "https://rsshub.app/twitter/user/"
         self.rss_feed_urls = {
-            "sidhant": "https://rsshub.app/twitter/user/sidhant",
-            "ShivAroor": "https://rsshub.app/twitter/user/ShivAroor",
-            "IAF_MCC": "https://rsshub.app/twitter/user/IAF_MCC",
-            "adgpi": "https://rsshub.app/twitter/user/adgpi",
-            "SpokespersonMoD": "https://rsshub.app/twitter/user/SpokespersonMoD",
-            "MIB_India": "https://rsshub.app/twitter/user/MIB_India",
+            "sidhant": f"{self.rsshub_base_url}sidhant",
+            "ShivAroor": f"{self.rsshub_base_url}ShivAroor",
+            "IAF_MCC": f"{self.rsshub_base_url}IAF_MCC",
+            "adgpi": f"{self.rsshub_base_url}adgpi",
+            "SpokespersonMoD": f"{self.rsshub_base_url}SpokespersonMoD",
+            "MIB_India": f"{self.rsshub_base_url}MIB_India",
+            "DrSJaishankar": f"{self.rsshub_base_url}DrSJaishankar"
         }

         # Default trusted news sources
@@ -59,7 +63,7 @@ class RssTwitterService:
     async def initialize(self) -> bool:
         """Initialize the Twitter service with RSS feed capability."""
         try:
-            logger.info("Initializing Twitter service with RSS feed capability")
+            logger.info("Initializing Twitter service with RSSHub XML feed capability")

             # Initialize HTTP client
             self.client = httpx.AsyncClient(
@@ -74,11 +78,11 @@ class RssTwitterService:
             # Schedule background maintenance
             asyncio.create_task(self._background_maintenance())

-            logger.info("RSS Twitter service initialized successfully")
+            logger.info("RSSHub Twitter service initialized successfully")
             return True

         except Exception as e:
-            logger.error(f"Failed to initialize RSS Twitter service: {str(e)}")
+            logger.error(f"Failed to initialize RSSHub Twitter service: {str(e)}")
             return False

     async def _background_maintenance(self):
@@ -108,7 +112,7 @@ class RssTwitterService:
             cache_hit_rate = self.stats["cache_hits"] / total_requests * 100
             error_rate = self.stats["errors"] / total_requests * 100

-            logger.info(f"RssTwitterService stats - Requests: {total_requests}, " +
+            logger.info(f"RssHubTwitterService stats - Requests: {total_requests}, " +
                         f"Cache hits: {self.stats['cache_hits']} ({cache_hit_rate:.1f}%), " +
                         f"Errors: {self.stats['errors']} ({error_rate:.1f}%)")

@@ -136,12 +140,11 @@ class RssTwitterService:

     def _get_cache_key(self, twitter_handle, limit):
         """Generate a cache key for a specific Twitter source"""
-        return f"{twitter_handle}_{limit}"
+        return f"{twitter_handle}_{limit}_rsshub"

     def _get_cache_path(self, cache_key):
         """Get filesystem path for a cache key"""
         # Create a safe filename from the cache key
-        import re
         safe_key = re.sub(r'[^a-zA-Z0-9_-]', '_', cache_key)
         return os.path.join(self.tweet_cache_dir, f"{safe_key}.json")

@@ -208,7 +211,8 @@ class RssTwitterService:
                     'author': tweet.author,
                     'created_at': created_at.isoformat() if hasattr(created_at, 'isoformat') else str(created_at),
                     'engagement': tweet.engagement,
-                    'url': tweet.url
+                    'url': tweet.url,
+                    'image_url': getattr(tweet, 'image_url', None)  # Handle optional image_url
                 })

             # Save to disk cache
@@ -222,18 +226,19 @@ class RssTwitterService:
         except Exception as e:
             logger.error(f"Error writing to cache file {cache_path}: {e}")

-    def register_rss_feed(self, twitter_handle: str, rss_url: str):
-        """Register an RSS feed URL for a specific Twitter handle"""
-        self.rss_feed_urls[twitter_handle] = rss_url
-        logger.info(f"Registered RSS feed for {twitter_handle}: {rss_url}")
+    def register_rss_feed(self, twitter_handle: str):
+        """Register an RSSHub feed URL for a specific Twitter handle"""
+        self.rss_feed_urls[twitter_handle] = f"{self.rsshub_base_url}{twitter_handle}"
+        logger.info(f"Registered RSSHub feed for {twitter_handle}")

-    def register_rss_feed_batch(self, feed_map: Dict[str, str]):
-        """Register multiple RSS feeds at once"""
-        self.rss_feed_urls.update(feed_map)
-        logger.info(f"Registered {len(feed_map)} RSS feeds")
+    def register_rss_feed_batch(self, twitter_handles: List[str]):
+        """Register multiple RSSHub feeds at once"""
+        for handle in twitter_handles:
+            self.rss_feed_urls[handle] = f"{self.rsshub_base_url}{handle}"
+        logger.info(f"Registered {len(twitter_handles)} RSSHub feeds")

     async def get_tweets_from_source(self, source: NewsSource, limit: int = 20, retries: int = 3) -> List[Tweet]:
-        """Get tweets from a specific Twitter source using RSS feed."""
+        """Get tweets from a specific Twitter source using RSSHub XML feed."""
         cache_key = self._get_cache_key(source.twitter_handle, limit)

         # Check cache first
@@ -247,86 +252,103 @@ class RssTwitterService:
         # Check if we have a registered RSS feed for this Twitter handle
         rss_url = self.rss_feed_urls.get(source.twitter_handle)
         if not rss_url:
-            logger.warning(f"No RSS feed registered for {source.twitter_handle}")
-            return []
+            # Auto-register the feed if not already registered
+            rss_url = f"{self.rsshub_base_url}{source.twitter_handle}"
+            self.rss_feed_urls[source.twitter_handle] = rss_url
+            logger.info(f"Auto-registered RSSHub feed for {source.twitter_handle}")

         # Extract tweets with retry logic
         tweets = []

         for attempt in range(retries + 1):
             try:
-                logger.info(f"Fetching tweets from RSS feed for {source.twitter_handle} (attempt {attempt + 1}/{retries + 1})")
+                logger.info(f"Fetching tweets from RSSHub for {source.twitter_handle} (attempt {attempt + 1}/{retries + 1})")

-                # Add cache-busting parameter
+                # Add cache-busting parameter to avoid RSSHub's cache
                 params = {"_": str(int(time.time()))}

                 response = await self.client.get(rss_url, params=params)

                 if response.status_code == 200:
-                    # Parse the RSS JSON feed
+                    # Parse the RSS XML feed
                     try:
                         self.stats["success"] += 1
-                        rss_data = response.json()
-
-                        # Parse items from the feed
-                        if "items" in rss_data:
-                            items = rss_data["items"][:limit]
-
-                            for item in items:
-                                try:
-                                    # Extract tweet ID from the URL
-                                    tweet_id = item.get("id", "").split("/")[-1]
-                                    if not tweet_id:
-                                        continue
-
-                                    # Extract tweet text
-                                    tweet_text = item.get("content_text", item.get("title", ""))
-
-                                    # Extract timestamp
-                                    date_str = item.get("date_published", "")
-                                    try:
-                                        # Convert to datetime then strip timezone info to make naive
-                                        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
-                                        created_at = dt.replace(tzinfo=None)
-                                    except (ValueError, TypeError):
-                                        created_at = datetime.now()
-
-                                    # Extract engagement metrics if available
-                                    engagement = {"likes": 0, "retweets": 0, "replies": 0, "views": 0}
-
-                                    # Try to extract engagement from extensions or additional fields
-                                    if "x_metadata" in item:
-                                        x_data = item["x_metadata"]
-                                        engagement["likes"] = x_data.get("likes", 0)
-                                        engagement["retweets"] = x_data.get("retweets", 0)
-                                        engagement["replies"] = x_data.get("replies", 0)
-                                        engagement["views"] = x_data.get("views", 0)
-
-                                    # Construct tweet URL
-                                    tweet_url = item.get("url", f"https://x.com/{source.twitter_handle}/status/{tweet_id}")
-
-                                    tweets.append(
-                                        Tweet(
-                                            id=tweet_id,
-                                            text=tweet_text,
-                                            author=source.twitter_handle,
-                                            created_at=created_at,
-                                            engagement=engagement,
-                                            url=tweet_url
-                                        )
-                                    )
-                                except Exception as e:
-                                    logger.error(f"Error processing tweet from RSS for {source.twitter_handle}: {str(e)}")
+
+                        # Parse XML response
+                        root = ET.fromstring(response.text)
+
+                        # Find all item elements (tweets)
+                        ns = {'atom': 'http://www.w3.org/2005/Atom'}  # Define namespace if needed
+                        items = root.findall('.//item')[:limit]  # Limit to specified number
+
+                        for item in items:
+                            try:
+                                # Extract tweet details from XML
+                                title_elem = item.find('title')
+                                title = title_elem.text if title_elem is not None else ""
+
+                                description_elem = item.find('description')
+                                description = description_elem.text if description_elem is not None else ""
+
+                                link_elem = item.find('link')
+                                url = link_elem.text if link_elem is not None else ""
+
+                                # Extract tweet ID from URL
+                                tweet_id = url.split("/status/")[-1] if "/status/" in url else ""
+
+                                # Get the timestamp
+                                pub_date_elem = item.find('pubDate')
+                                date_str = pub_date_elem.text if pub_date_elem is not None else ""
+
+                                try:
+                                    # Parse RSS date format
+                                    created_at = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z")
+                                except (ValueError, TypeError):
+                                    logger.warning(f"Date parsing error for {source.twitter_handle}: {date_str}")
+                                    created_at = datetime.now()
+
+                                # Get author
+                                author_elem = item.find('author')
+                                author = author_elem.text if author_elem is not None else source.twitter_handle
+
+                                # Extract image URL from description
+                                image_url = None
+                                if description:
+                                    # Try to find image in description
+                                    img_match = re.search(r'src="([^"]+)"', description)
+                                    if img_match:
+                                        image_url = img_match.group(1)
+
+                                # Extract text content
+                                # For the text, use title as it's cleaner than description
+                                tweet_text = title
+
+                                # Mock engagement metrics (not provided in RSS)
+                                engagement = {"likes": 0, "retweets": 0, "replies": 0, "views": 0}
+
+                                tweets.append(
+                                    Tweet(
+                                        id=tweet_id,
+                                        text=tweet_text,
+                                        author=source.twitter_handle,
+                                        created_at=created_at,
+                                        engagement=engagement,
+                                        url=url,
+                                        image_url=image_url
+                                    )
+                                )
+                            except Exception as e:
+                                logger.error(f"Error processing tweet from RSSHub for {source.twitter_handle}: {str(e)}")

                         # Cache the results
                         if tweets:
                             self._save_to_cache(cache_key, tweets)
-                            logger.info(f"Fetched and cached {len(tweets)} tweets from RSS feed for {source.twitter_handle}")
+                            logger.info(f"Fetched and cached {len(tweets)} tweets from RSSHub for {source.twitter_handle}")

                         return tweets

-                    except json.JSONDecodeError:
-                        logger.error(f"Invalid JSON response from RSS feed for {source.twitter_handle}")
+                    except Exception as e:
+                        logger.error(f"Error parsing XML from RSSHub for {source.twitter_handle}: {str(e)}")
                         self.stats["errors"] += 1

                         if attempt < retries:
@@ -337,7 +359,7 @@ class RssTwitterService:
                 else:
                     # HTTP error
                     self.stats["errors"] += 1
-                    logger.error(f"Failed to fetch RSS feed for {source.twitter_handle}: HTTP {response.status_code}")
+                    logger.error(f"Failed to fetch RSSHub feed for {source.twitter_handle}: HTTP {response.status_code}")

                     if attempt < retries:
                         await asyncio.sleep(5)
@@ -347,7 +369,7 @@ class RssTwitterService:

             except Exception as e:
                 self.stats["errors"] += 1
-                logger.error(f"Error fetching RSS feed for {source.twitter_handle}: {str(e)}")
+                logger.error(f"Error fetching RSSHub feed for {source.twitter_handle}: {str(e)}")

                 if attempt < retries:
                     await asyncio.sleep(5)
@@ -363,14 +385,13 @@ class RssTwitterService:
         # Create naive datetime for consistent comparisons
         cutoff_date = datetime.now() - timedelta(days=days_back)

-        # Filter to active sources that have RSS feeds
-        active_sources = [source for source in self.news_sources
-                          if source.is_active and source.twitter_handle in self.rss_feed_urls]
+        # Filter to active sources
+        active_sources = [source for source in self.news_sources if source.is_active]

         # Sort sources by reliability score (prioritize higher scores)
         active_sources.sort(key=lambda s: s.reliability_score, reverse=True)

-        logger.info(f"Collecting tweets from {len(active_sources)} trusted news sources with RSS feeds")
+        logger.info(f"Collecting tweets from {len(active_sources)} trusted news sources with RSSHub")

         # Process sources in parallel
         tasks = []
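
For context, the substance of this commit is a switch from RSSHub's JSON-feed output to its RSS 2.0 XML output, parsed with xml.etree.ElementTree. Below is a minimal, self-contained sketch of the new parsing path applied to a hypothetical RSSHub item; the SAMPLE_FEED markup and its field values are illustrative assumptions, not data from the commit.

import re
import xml.etree.ElementTree as ET
from datetime import datetime

# Hypothetical RSS 2.0 snippet in the shape RSSHub serves for a Twitter user.
SAMPLE_FEED = """<rss version="2.0"><channel>
<item>
  <title>Tweet text goes here</title>
  <description>Tweet text &lt;img src="https://pbs.twimg.com/media/abc.jpg"&gt;</description>
  <link>https://twitter.com/adgpi/status/1234567890</link>
  <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
</item>
</channel></rss>"""

root = ET.fromstring(SAMPLE_FEED)
for item in root.findall('.//item'):
    url = item.findtext('link', default="")
    # Tweet ID is the path segment after /status/, as in the service code
    tweet_id = url.split("/status/")[-1] if "/status/" in url else ""
    tweet_text = item.findtext('title', default="")
    try:
        # RFC 822-style pubDate, same format string the service uses.
        # Note that %Z only matches a few zone names (GMT/UTC and the
        # local zone); numeric offsets like "+0000" would need %z and
        # would fall into the except branch below.
        created_at = datetime.strptime(item.findtext('pubDate', default=""),
                                       "%a, %d %b %Y %H:%M:%S %Z")
    except (ValueError, TypeError):
        created_at = datetime.now()
    # Same regex the service uses to pull an image URL from the description HTML
    img_match = re.search(r'src="([^"]+)"', item.findtext('description', default=""))
    image_url = img_match.group(1) if img_match else None
    print(tweet_id, tweet_text, created_at, image_url)

Note also the registration API change: register_rss_feed now derives the feed URL from self.rsshub_base_url, so callers pass only a handle, e.g. service.register_rss_feed("DrSJaishankar") or service.register_rss_feed_batch(["sidhant", "adgpi"]), where service is an assumed RssTwitterService instance.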