Spaces:
Running
Running
Update twitter_service.py
Browse files- twitter_service.py +107 -86
twitter_service.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
import os
|
|
|
4 |
import time
|
|
|
5 |
from datetime import datetime, timedelta, timezone
|
6 |
from typing import Dict, List, Optional
|
7 |
|
@@ -13,7 +15,7 @@ from models import NewsSource, Tweet
|
|
13 |
|
14 |
|
15 |
class RssTwitterService:
|
16 |
-
"""Service for collecting tweets via
|
17 |
|
18 |
def __init__(self):
|
19 |
self.cache_expiry = int(os.getenv("CACHE_EXPIRY_MINUTES", 120))
|
@@ -35,14 +37,16 @@ class RssTwitterService:
|
|
35 |
}
|
36 |
self.last_stats_reset = time.time()
|
37 |
|
38 |
-
# Map of Twitter handles to
|
|
|
39 |
self.rss_feed_urls = {
|
40 |
-
"sidhant": "
|
41 |
-
"ShivAroor": "
|
42 |
-
"IAF_MCC": "
|
43 |
-
"adgpi": "
|
44 |
-
"SpokespersonMoD": "
|
45 |
-
"MIB_India": "
|
|
|
46 |
}
|
47 |
|
48 |
# Default trusted news sources
|
@@ -59,7 +63,7 @@ class RssTwitterService:
|
|
59 |
async def initialize(self) -> bool:
|
60 |
"""Initialize the Twitter service with RSS feed capability."""
|
61 |
try:
|
62 |
-
logger.info("Initializing Twitter service with
|
63 |
|
64 |
# Initialize HTTP client
|
65 |
self.client = httpx.AsyncClient(
|
@@ -74,11 +78,11 @@ class RssTwitterService:
|
|
74 |
# Schedule background maintenance
|
75 |
asyncio.create_task(self._background_maintenance())
|
76 |
|
77 |
-
logger.info("
|
78 |
return True
|
79 |
|
80 |
except Exception as e:
|
81 |
-
logger.error(f"Failed to initialize
|
82 |
return False
|
83 |
|
84 |
async def _background_maintenance(self):
|
@@ -108,7 +112,7 @@ class RssTwitterService:
|
|
108 |
cache_hit_rate = self.stats["cache_hits"] / total_requests * 100
|
109 |
error_rate = self.stats["errors"] / total_requests * 100
|
110 |
|
111 |
-
logger.info(f"
|
112 |
f"Cache hits: {self.stats['cache_hits']} ({cache_hit_rate:.1f}%), " +
|
113 |
f"Errors: {self.stats['errors']} ({error_rate:.1f}%)")
|
114 |
|
@@ -136,12 +140,11 @@ class RssTwitterService:
|
|
136 |
|
137 |
def _get_cache_key(self, twitter_handle, limit):
|
138 |
"""Generate a cache key for a specific Twitter source"""
|
139 |
-
return f"{twitter_handle}_{limit}"
|
140 |
|
141 |
def _get_cache_path(self, cache_key):
|
142 |
"""Get filesystem path for a cache key"""
|
143 |
# Create a safe filename from the cache key
|
144 |
-
import re
|
145 |
safe_key = re.sub(r'[^a-zA-Z0-9_-]', '_', cache_key)
|
146 |
return os.path.join(self.tweet_cache_dir, f"{safe_key}.json")
|
147 |
|
@@ -208,7 +211,8 @@ class RssTwitterService:
|
|
208 |
'author': tweet.author,
|
209 |
'created_at': created_at.isoformat() if hasattr(created_at, 'isoformat') else str(created_at),
|
210 |
'engagement': tweet.engagement,
|
211 |
-
'url': tweet.url
|
|
|
212 |
})
|
213 |
|
214 |
# Save to disk cache
|
@@ -222,18 +226,19 @@ class RssTwitterService:
|
|
222 |
except Exception as e:
|
223 |
logger.error(f"Error writing to cache file {cache_path}: {e}")
|
224 |
|
225 |
-
def register_rss_feed(self, twitter_handle: str
|
226 |
-
"""Register an
|
227 |
-
self.rss_feed_urls[twitter_handle] =
|
228 |
-
logger.info(f"Registered
|
229 |
|
230 |
-
def register_rss_feed_batch(self,
|
231 |
-
"""Register multiple
|
232 |
-
|
233 |
-
|
|
|
234 |
|
235 |
async def get_tweets_from_source(self, source: NewsSource, limit: int = 20, retries: int = 3) -> List[Tweet]:
|
236 |
-
"""Get tweets from a specific Twitter source using
|
237 |
cache_key = self._get_cache_key(source.twitter_handle, limit)
|
238 |
|
239 |
# Check cache first
|
@@ -247,86 +252,103 @@ class RssTwitterService:
|
|
247 |
# Check if we have a registered RSS feed for this Twitter handle
|
248 |
rss_url = self.rss_feed_urls.get(source.twitter_handle)
|
249 |
if not rss_url:
|
250 |
-
|
251 |
-
|
|
|
|
|
252 |
|
253 |
# Extract tweets with retry logic
|
254 |
tweets = []
|
255 |
|
256 |
for attempt in range(retries + 1):
|
257 |
try:
|
258 |
-
logger.info(f"Fetching tweets from
|
259 |
|
260 |
-
# Add cache-busting parameter
|
261 |
params = {"_": str(int(time.time()))}
|
262 |
|
263 |
response = await self.client.get(rss_url, params=params)
|
264 |
|
265 |
if response.status_code == 200:
|
266 |
-
# Parse the RSS
|
267 |
try:
|
268 |
self.stats["success"] += 1
|
269 |
-
rss_data = response.json()
|
270 |
|
271 |
-
# Parse
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
try:
|
277 |
-
#
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
text=tweet_text,
|
312 |
-
author=source.twitter_handle,
|
313 |
-
created_at=created_at,
|
314 |
-
engagement=engagement,
|
315 |
-
url=tweet_url
|
316 |
-
)
|
317 |
)
|
318 |
-
|
319 |
-
|
|
|
320 |
|
321 |
# Cache the results
|
322 |
if tweets:
|
323 |
self._save_to_cache(cache_key, tweets)
|
324 |
-
logger.info(f"Fetched and cached {len(tweets)} tweets from
|
325 |
|
326 |
return tweets
|
327 |
|
328 |
-
except
|
329 |
-
logger.error(f"
|
330 |
self.stats["errors"] += 1
|
331 |
|
332 |
if attempt < retries:
|
@@ -337,7 +359,7 @@ class RssTwitterService:
|
|
337 |
else:
|
338 |
# HTTP error
|
339 |
self.stats["errors"] += 1
|
340 |
-
logger.error(f"Failed to fetch
|
341 |
|
342 |
if attempt < retries:
|
343 |
await asyncio.sleep(5)
|
@@ -347,7 +369,7 @@ class RssTwitterService:
|
|
347 |
|
348 |
except Exception as e:
|
349 |
self.stats["errors"] += 1
|
350 |
-
logger.error(f"Error fetching
|
351 |
|
352 |
if attempt < retries:
|
353 |
await asyncio.sleep(5)
|
@@ -363,14 +385,13 @@ class RssTwitterService:
|
|
363 |
# Create naive datetime for consistent comparisons
|
364 |
cutoff_date = datetime.now() - timedelta(days=days_back)
|
365 |
|
366 |
-
# Filter to active sources
|
367 |
-
active_sources = [source for source in self.news_sources
|
368 |
-
if source.is_active and source.twitter_handle in self.rss_feed_urls]
|
369 |
|
370 |
# Sort sources by reliability score (prioritize higher scores)
|
371 |
active_sources.sort(key=lambda s: s.reliability_score, reverse=True)
|
372 |
|
373 |
-
logger.info(f"Collecting tweets from {len(active_sources)} trusted news sources with
|
374 |
|
375 |
# Process sources in parallel
|
376 |
tasks = []
|
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
import os
|
4 |
+
import re
|
5 |
import time
|
6 |
+
import xml.etree.ElementTree as ET
|
7 |
from datetime import datetime, timedelta, timezone
|
8 |
from typing import Dict, List, Optional
|
9 |
|
|
|
15 |
|
16 |
|
17 |
class RssTwitterService:
|
18 |
+
"""Service for collecting tweets via RSSHub's XML feeds."""
|
19 |
|
20 |
def __init__(self):
|
21 |
self.cache_expiry = int(os.getenv("CACHE_EXPIRY_MINUTES", 120))
|
|
|
37 |
}
|
38 |
self.last_stats_reset = time.time()
|
39 |
|
40 |
+
# Map of Twitter handles to RSSHub URLs - directly added
|
41 |
+
self.rsshub_base_url = "https://rsshub.app/twitter/user/"
|
42 |
self.rss_feed_urls = {
|
43 |
+
"sidhant": f"{self.rsshub_base_url}sidhant",
|
44 |
+
"ShivAroor": f"{self.rsshub_base_url}ShivAroor",
|
45 |
+
"IAF_MCC": f"{self.rsshub_base_url}IAF_MCC",
|
46 |
+
"adgpi": f"{self.rsshub_base_url}adgpi",
|
47 |
+
"SpokespersonMoD": f"{self.rsshub_base_url}SpokespersonMoD",
|
48 |
+
"MIB_India": f"{self.rsshub_base_url}MIB_India",
|
49 |
+
"DrSJaishankar": f"{self.rsshub_base_url}DrSJaishankar"
|
50 |
}
|
51 |
|
52 |
# Default trusted news sources
|
|
|
63 |
async def initialize(self) -> bool:
|
64 |
"""Initialize the Twitter service with RSS feed capability."""
|
65 |
try:
|
66 |
+
logger.info("Initializing Twitter service with RSSHub XML feed capability")
|
67 |
|
68 |
# Initialize HTTP client
|
69 |
self.client = httpx.AsyncClient(
|
|
|
78 |
# Schedule background maintenance
|
79 |
asyncio.create_task(self._background_maintenance())
|
80 |
|
81 |
+
logger.info("RSSHub Twitter service initialized successfully")
|
82 |
return True
|
83 |
|
84 |
except Exception as e:
|
85 |
+
logger.error(f"Failed to initialize RSSHub Twitter service: {str(e)}")
|
86 |
return False
|
87 |
|
88 |
async def _background_maintenance(self):
|
|
|
112 |
cache_hit_rate = self.stats["cache_hits"] / total_requests * 100
|
113 |
error_rate = self.stats["errors"] / total_requests * 100
|
114 |
|
115 |
+
logger.info(f"RssHubTwitterService stats - Requests: {total_requests}, " +
|
116 |
f"Cache hits: {self.stats['cache_hits']} ({cache_hit_rate:.1f}%), " +
|
117 |
f"Errors: {self.stats['errors']} ({error_rate:.1f}%)")
|
118 |
|
|
|
140 |
|
141 |
def _get_cache_key(self, twitter_handle, limit):
|
142 |
"""Generate a cache key for a specific Twitter source"""
|
143 |
+
return f"{twitter_handle}_{limit}_rsshub"
|
144 |
|
145 |
def _get_cache_path(self, cache_key):
|
146 |
"""Get filesystem path for a cache key"""
|
147 |
# Create a safe filename from the cache key
|
|
|
148 |
safe_key = re.sub(r'[^a-zA-Z0-9_-]', '_', cache_key)
|
149 |
return os.path.join(self.tweet_cache_dir, f"{safe_key}.json")
|
150 |
|
|
|
211 |
'author': tweet.author,
|
212 |
'created_at': created_at.isoformat() if hasattr(created_at, 'isoformat') else str(created_at),
|
213 |
'engagement': tweet.engagement,
|
214 |
+
'url': tweet.url,
|
215 |
+
'image_url': getattr(tweet, 'image_url', None) # Handle optional image_url
|
216 |
})
|
217 |
|
218 |
# Save to disk cache
|
|
|
226 |
except Exception as e:
|
227 |
logger.error(f"Error writing to cache file {cache_path}: {e}")
|
228 |
|
229 |
+
def register_rss_feed(self, twitter_handle: str):
|
230 |
+
"""Register an RSSHub feed URL for a specific Twitter handle"""
|
231 |
+
self.rss_feed_urls[twitter_handle] = f"{self.rsshub_base_url}{twitter_handle}"
|
232 |
+
logger.info(f"Registered RSSHub feed for {twitter_handle}")
|
233 |
|
234 |
+
def register_rss_feed_batch(self, twitter_handles: List[str]):
|
235 |
+
"""Register multiple RSSHub feeds at once"""
|
236 |
+
for handle in twitter_handles:
|
237 |
+
self.rss_feed_urls[handle] = f"{self.rsshub_base_url}{handle}"
|
238 |
+
logger.info(f"Registered {len(twitter_handles)} RSSHub feeds")
|
239 |
|
240 |
async def get_tweets_from_source(self, source: NewsSource, limit: int = 20, retries: int = 3) -> List[Tweet]:
|
241 |
+
"""Get tweets from a specific Twitter source using RSSHub XML feed."""
|
242 |
cache_key = self._get_cache_key(source.twitter_handle, limit)
|
243 |
|
244 |
# Check cache first
|
|
|
252 |
# Check if we have a registered RSS feed for this Twitter handle
|
253 |
rss_url = self.rss_feed_urls.get(source.twitter_handle)
|
254 |
if not rss_url:
|
255 |
+
# Auto-register the feed if not already registered
|
256 |
+
rss_url = f"{self.rsshub_base_url}{source.twitter_handle}"
|
257 |
+
self.rss_feed_urls[source.twitter_handle] = rss_url
|
258 |
+
logger.info(f"Auto-registered RSSHub feed for {source.twitter_handle}")
|
259 |
|
260 |
# Extract tweets with retry logic
|
261 |
tweets = []
|
262 |
|
263 |
for attempt in range(retries + 1):
|
264 |
try:
|
265 |
+
logger.info(f"Fetching tweets from RSSHub for {source.twitter_handle} (attempt {attempt + 1}/{retries + 1})")
|
266 |
|
267 |
+
# Add cache-busting parameter to avoid RSSHub's cache
|
268 |
params = {"_": str(int(time.time()))}
|
269 |
|
270 |
response = await self.client.get(rss_url, params=params)
|
271 |
|
272 |
if response.status_code == 200:
|
273 |
+
# Parse the RSS XML feed
|
274 |
try:
|
275 |
self.stats["success"] += 1
|
|
|
276 |
|
277 |
+
# Parse XML response
|
278 |
+
root = ET.fromstring(response.text)
|
279 |
+
|
280 |
+
# Find all item elements (tweets)
|
281 |
+
ns = {'atom': 'http://www.w3.org/2005/Atom'} # Define namespace if needed
|
282 |
+
items = root.findall('.//item')[:limit] # Limit to specified number
|
283 |
+
|
284 |
+
for item in items:
|
285 |
+
try:
|
286 |
+
# Extract tweet details from XML
|
287 |
+
title_elem = item.find('title')
|
288 |
+
title = title_elem.text if title_elem is not None else ""
|
289 |
+
|
290 |
+
description_elem = item.find('description')
|
291 |
+
description = description_elem.text if description_elem is not None else ""
|
292 |
+
|
293 |
+
link_elem = item.find('link')
|
294 |
+
url = link_elem.text if link_elem is not None else ""
|
295 |
+
|
296 |
+
# Extract tweet ID from URL
|
297 |
+
tweet_id = url.split("/status/")[-1] if "/status/" in url else ""
|
298 |
+
|
299 |
+
# Get the timestamp
|
300 |
+
pub_date_elem = item.find('pubDate')
|
301 |
+
date_str = pub_date_elem.text if pub_date_elem is not None else ""
|
302 |
+
|
303 |
try:
|
304 |
+
# Parse RSS date format
|
305 |
+
created_at = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %Z")
|
306 |
+
except (ValueError, TypeError):
|
307 |
+
logger.warning(f"Date parsing error for {source.twitter_handle}: {date_str}")
|
308 |
+
created_at = datetime.now()
|
309 |
+
|
310 |
+
# Get author
|
311 |
+
author_elem = item.find('author')
|
312 |
+
author = author_elem.text if author_elem is not None else source.twitter_handle
|
313 |
+
|
314 |
+
# Extract image URL from description
|
315 |
+
image_url = None
|
316 |
+
if description:
|
317 |
+
# Try to find image in description
|
318 |
+
img_match = re.search(r'src="([^"]+)"', description)
|
319 |
+
if img_match:
|
320 |
+
image_url = img_match.group(1)
|
321 |
+
|
322 |
+
# Extract text content
|
323 |
+
# For the text, use title as it's cleaner than description
|
324 |
+
tweet_text = title
|
325 |
+
|
326 |
+
# Mock engagement metrics (not provided in RSS)
|
327 |
+
engagement = {"likes": 0, "retweets": 0, "replies": 0, "views": 0}
|
328 |
+
|
329 |
+
tweets.append(
|
330 |
+
Tweet(
|
331 |
+
id=tweet_id,
|
332 |
+
text=tweet_text,
|
333 |
+
author=source.twitter_handle,
|
334 |
+
created_at=created_at,
|
335 |
+
engagement=engagement,
|
336 |
+
url=url,
|
337 |
+
image_url=image_url
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
)
|
339 |
+
)
|
340 |
+
except Exception as e:
|
341 |
+
logger.error(f"Error processing tweet from RSSHub for {source.twitter_handle}: {str(e)}")
|
342 |
|
343 |
# Cache the results
|
344 |
if tweets:
|
345 |
self._save_to_cache(cache_key, tweets)
|
346 |
+
logger.info(f"Fetched and cached {len(tweets)} tweets from RSSHub for {source.twitter_handle}")
|
347 |
|
348 |
return tweets
|
349 |
|
350 |
+
except Exception as e:
|
351 |
+
logger.error(f"Error parsing XML from RSSHub for {source.twitter_handle}: {str(e)}")
|
352 |
self.stats["errors"] += 1
|
353 |
|
354 |
if attempt < retries:
|
|
|
359 |
else:
|
360 |
# HTTP error
|
361 |
self.stats["errors"] += 1
|
362 |
+
logger.error(f"Failed to fetch RSSHub feed for {source.twitter_handle}: HTTP {response.status_code}")
|
363 |
|
364 |
if attempt < retries:
|
365 |
await asyncio.sleep(5)
|
|
|
369 |
|
370 |
except Exception as e:
|
371 |
self.stats["errors"] += 1
|
372 |
+
logger.error(f"Error fetching RSSHub feed for {source.twitter_handle}: {str(e)}")
|
373 |
|
374 |
if attempt < retries:
|
375 |
await asyncio.sleep(5)
|
|
|
385 |
# Create naive datetime for consistent comparisons
|
386 |
cutoff_date = datetime.now() - timedelta(days=days_back)
|
387 |
|
388 |
+
# Filter to active sources
|
389 |
+
active_sources = [source for source in self.news_sources if source.is_active]
|
|
|
390 |
|
391 |
# Sort sources by reliability score (prioritize higher scores)
|
392 |
active_sources.sort(key=lambda s: s.reliability_score, reverse=True)
|
393 |
|
394 |
+
logger.info(f"Collecting tweets from {len(active_sources)} trusted news sources with RSSHub")
|
395 |
|
396 |
# Process sources in parallel
|
397 |
tasks = []
|