import logging
from datetime import datetime
from typing import Optional, List, Any, Dict

import requests
from pydantic import SecretStr, Field

from obsei.misc.utils import convert_utc_time, DATETIME_STRING_PATTERN
from obsei.payload import TextPayload
from obsei.source.base_source import BaseSourceConfig, BaseSource

logger = logging.getLogger(__name__)

OUTSCRAPPER_API_URL = 'https://api.app.outscraper.com'


class OSGoogleMapsReviewsConfig(BaseSourceConfig):
    """Settings for fetching Google Maps reviews through the Outscraper API.

    Raises:
        ValueError: if no API key is available after initialisation.
    """

    NAME: str = "Maps Reviews Scrapper"
    # Search queries; sent to the API as the ``query`` parameter.
    queries: List[str]
    sort: str = "newest"
    ignore_empty_reviews: bool = True
    language: str = "en"
    # Sent as ``cutoff``: oldest timestamp down to which reviews are needed.
    since_timestamp: Optional[int] = None
    # Sent as ``start``: newest timestamp from which reviews are needed.
    until_timestamp: Optional[int] = None
    # Used only when no ``since_timestamp`` is known. Values of at most five
    # characters go through ``convert_utc_time``; longer values are parsed
    # with ``DATETIME_STRING_PATTERN``.
    lookup_period: Optional[str] = None
    number_of_reviews: int = 10
    number_of_places_per_query: int = 1
    country: Optional[str] = None
    filtered_fields: List[str] = Field(['reviews_data'])
    # parameter defines the coordinates of the location where you want your query to be applied.
    # It has to be constructed in the next sequence: "@" + "latitude" + "," + "longitude" + "," + "zoom"
    # (e.g. "@41.3954381,2.1628662,15.1z").
    central_coordinates: Optional[str] = None
    # Get API key from https://outscraper.com/
    api_key: Optional[SecretStr] = Field(None, env="outscrapper_api_key")

    def __init__(self, **values: Any):
        super().__init__(**values)

        if self.api_key is None:
            raise ValueError("OutScrapper API key require to fetch reviews data")


class OSGoogleMapsReviewsSource(BaseSource):
    """Source that turns Outscraper Google Maps reviews into ``TextPayload`` objects."""

    NAME: str = "Maps Reviews Scrapper"

    def lookup(self, config: OSGoogleMapsReviewsConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch reviews for ``config.queries`` and return one payload per review.

        Args:
            config: Query, filtering and credential settings.
            **kwargs: ``id`` (optional) is the workflow identifier used to load
                and persist the ``since_timestamp`` cursor in ``self.store``.

        Returns:
            One ``TextPayload`` per review. The list is empty when the API
            answers with a non-200 status; in that case a warning is logged.
        """
        source_responses: List[TextPayload] = []

        # Get data from state
        identifier: Optional[str] = kwargs.get("id", None)
        # The original tested the builtin ``id`` here, which is never None, so
        # the store was queried even when no identifier had been supplied.
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        update_state: bool = bool(identifier)
        state = state or dict()

        # Precedence: persisted state, then explicit config, then lookup_period.
        since_timestamp: Optional[int] = (
            state.get("since_timestamp", None) or config.since_timestamp
        )
        if since_timestamp is None and config.lookup_period is not None:
            if len(config.lookup_period) <= 5:
                since_time = convert_utc_time(config.lookup_period)
            else:
                since_time = datetime.strptime(config.lookup_period, DATETIME_STRING_PATTERN)
            since_timestamp = int(since_time.timestamp())

        last_reviews_since_time = since_timestamp

        params: Dict[str, Any] = {
            'query': config.queries,
            'reviewsLimit': config.number_of_reviews,
            'limit': config.number_of_places_per_query,
            'sort': config.sort,
            # Reviews are sorted from latest to oldest in case cutoff or start is passed
            # cutoff is oldest timestamp till reviews are needed
            'cutoff': since_timestamp,
            # start is newest timestamp from reviews are needed
            'start': config.until_timestamp,
            'ignoreEmpty': config.ignore_empty_reviews,
            'coordinates': config.central_coordinates,
            'language': config.language,
            'region': config.country,
            'fields': ",".join(config.filtered_fields),
            'async': False,
        }

        # For API doc refer https://app.outscraper.com/api-docs#tag/Google-Reviews
        response = requests.get(
            f'{OUTSCRAPPER_API_URL}/maps/reviews-v3',
            params=params,
            headers={
                'X-API-KEY': "" if config.api_key is None else config.api_key.get_secret_value(),
            },
        )

        queries_data = []
        if response.status_code == 200:
            queries_data = response.json().get('data', [])
        else:
            # Error bodies are not guaranteed to be JSON; fall back to the raw
            # text so the failure is logged instead of raising while logging.
            try:
                error_detail: Any = response.json()
            except ValueError:
                error_detail = response.text
            logger.warning("API call failed with error: %s", error_detail)

        for query_data in queries_data:
            # Popping leaves only place-level fields in query_data, which are
            # merged into every review's meta below.
            reviews = query_data.pop("reviews_data") if "reviews_data" in query_data else []

            for review in reviews:
                source_responses.append(
                    TextPayload(
                        processed_text=review["review_text"],
                        meta={**review, **query_data},
                        source_name=self.NAME,
                    )
                )

                # Track the newest review timestamp seen as the next cursor.
                review_time = review["review_timestamp"]
                if last_reviews_since_time is None or last_reviews_since_time < review_time:
                    last_reviews_since_time = review_time

        state["since_timestamp"] = last_reviews_since_time
        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses