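"""Google News source for obsei.

Wraps the GoogleNews search client as an obsei source: it splits the
configured date range into one-day windows, queries Google News for each
window, optionally crawls the full article text, and emits the results
as TextPayload objects.
"""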
from datetime import date, datetime, time, timedelta, timezone
from typing import Any, Dict, List, Optional

from GoogleNews import GoogleNews
from pydantic import PrivateAttr

from obsei.misc.utils import (
    DATETIME_STRING_PATTERN,
    DEFAULT_LOOKUP_PERIOD,
    convert_utc_time,
)
from obsei.payload import TextPayload
from obsei.source.base_source import BaseSource, BaseSourceConfig
from obsei.source.website_crawler_source import (
    BaseCrawlerConfig,
    TrafilaturaCrawlerConfig,
)

GOOGLE_DATE_TIME_QUERY_PATTERN = "%Y-%m-%d"

class GoogleNewsConfig(BaseSourceConfig):
    _google_news_client: GoogleNews = PrivateAttr()
    TYPE: str = "GoogleNews"
    query: str
    country: Optional[str] = "US"
    language: Optional[str] = "en"
    max_results: Optional[int] = 100
    lookup_period: Optional[str] = None
    after_date: Optional[str] = None  # start of the search window (earliest time)
    before_date: Optional[str] = None  # end of the search window (latest time)
    fetch_article: Optional[bool] = False
    crawler_config: Optional[BaseCrawlerConfig] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        if self.lookup_period and self.after_date:
            raise AttributeError("Can't use `lookup_period` and `after_date` together")
        elif not self.after_date and self.before_date:
            raise AttributeError(
                "Can't use `before_date` without `after_date` or `lookup_period`"
            )

        # A lookup period is converted into an explicit `after_date` start bound
        if self.lookup_period:
            after_time = convert_utc_time(self.lookup_period)
            self.after_date = after_time.strftime(GOOGLE_DATE_TIME_QUERY_PATTERN)

        # Default the end bound to the end of the current UTC day
        if not self.before_date:
            before_time = datetime.combine(
                date.today(), time(tzinfo=timezone.utc)
            ) + timedelta(days=1)
            self.before_date = before_time.strftime(GOOGLE_DATE_TIME_QUERY_PATTERN)

        self._google_news_client = GoogleNews(
            lang=self.language,
            region=self.country,
        )

        if not self.crawler_config:
            self.crawler_config = TrafilaturaCrawlerConfig(urls=[])

    def get_client(self) -> GoogleNews:
        return self._google_news_client

class GoogleNewsSource(BaseSource):
    NAME: Optional[str] = "GoogleNews"

    def lookup(self, config: GoogleNewsConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        source_responses: List[TextPayload] = []

        # Restore the checkpoint for this workflow from the state store, if any
        id: Optional[str] = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if id is None or self.store is None
            else self.store.get_source_state(id)
        )
        update_state: bool = True if id else False
        state = state or dict()

        lookup_period: str = state.get("since_time", None) or DEFAULT_LOOKUP_PERIOD
        since_time: datetime = convert_utc_time(lookup_period)
        last_since_time = since_time

        today_start_of_day: datetime = datetime.combine(
            date.today(), time(tzinfo=timezone.utc)
        )
        today_end_of_day: datetime = today_start_of_day + timedelta(days=1)

        # Start of the search window: the configured `after_date`, or the
        # start of the current UTC day when none is given
        last_after_time: datetime
        if config.after_date:
            last_after_time = convert_utc_time(config.after_date)
        else:
            last_after_time = today_start_of_day
        if state.get("since_time", None) is not None:
            # Resume from the stored checkpoint when it is more recent
            last_after_time = max(last_after_time, since_time)

        # End of the search window, capped at the end of the current UTC day
        before_time: datetime
        if config.before_date and config.after_date:
            before_time = convert_utc_time(config.before_date)
        else:
            before_time = today_end_of_day
        if before_time > today_start_of_day:
            before_time = today_end_of_day

        google_news_client = config.get_client()

        # Walk backwards from `before_time` in one-day windows, querying
        # Google News for each window until the start of the range is reached
        more_data_exist = True
        while more_data_exist and before_time > last_after_time:
            after_time = before_time - timedelta(days=1)
            after_date = after_time.strftime(GOOGLE_DATE_TIME_QUERY_PATTERN)
            before_date = before_time.strftime(GOOGLE_DATE_TIME_QUERY_PATTERN)
            # Restrict the query to the current window via Google search operators
            new_query = f"{config.query}+after:{after_date}+before:{before_date}"
            before_time = after_time

            google_news_client.get_news(new_query)
            articles = google_news_client.results(sort=True)

            for article in articles:
                published_date = (
                    None
                    if article["datetime"] is None
                    else article["datetime"].replace(tzinfo=timezone.utc)
                )

                # Optionally crawl the linked page for the full article text
                article_text: str = ""
                if config.fetch_article and config.crawler_config:
                    extracted_data = config.crawler_config.extract_url(url=article["link"])
                    if extracted_data.get("text", None) is not None:
                        article_text = extracted_data["text"]
                        del extracted_data["text"]
                    article["extracted_data"] = extracted_data

                source_responses.append(
                    TextPayload(
                        processed_text=f"{article['title']}.\n\n {article_text}",
                        meta=vars(article) if hasattr(article, "__dict__") else article,
                        source_name=self.NAME,
                    )
                )

                if config.max_results is not None and len(source_responses) >= config.max_results:
                    source_responses = source_responses[: config.max_results]
                    more_data_exist = False
                    break

                # Results are sorted newest first; stop once articles fall
                # behind the checkpoint
                if published_date and since_time and published_date < since_time:
                    more_data_exist = False
                    break

                if last_since_time is None or (
                    published_date and last_since_time < published_date
                ):
                    last_since_time = published_date

        if update_state and last_since_time and self.store is not None:
            state["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
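

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library). Assumes obsei and its
# dependencies are installed; the query string, lookup period, and result
# limit below are illustrative values, not defaults.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Fetch up to 5 recent articles matching a hypothetical query, crawling
    # each article body with the default Trafilatura crawler.
    config = GoogleNewsConfig(
        query="openai",       # illustrative query
        max_results=5,
        lookup_period="1d",   # look back one day; format as accepted by convert_utc_time
        fetch_article=True,
    )
    source = GoogleNewsSource()
    for payload in source.lookup(config):
        print(payload.processed_text[:120])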