import logging import re from datetime import datetime, timezone from typing import Any, Dict, List, Optional, Tuple from urllib import parse from google_play_scraper import Sort, reviews from obsei.misc.web_search import perform_search from obsei.source.base_source import BaseSource, BaseSourceConfig from obsei.payload import TextPayload from obsei.misc.utils import ( DATETIME_STRING_PATTERN, DEFAULT_LOOKUP_PERIOD, convert_utc_time, ) logger = logging.getLogger(__name__) class PlayStoreScrapperConfig(BaseSourceConfig): TYPE: str = "PlayStoreScrapper" app_url: Optional[str] = None countries: Optional[List[str]] = None package_name: Optional[str] = None app_name: Optional[str] = None language: Optional[str] = None filter_score_with: Optional[int] = None lookup_period: Optional[str] = None max_count: Optional[int] = 200 def __init__(self, **data: Any): super().__init__(**data) if self.app_url is not None: self.package_name, self.countries, self.language = PlayStoreScrapperConfig.parse_app_url(self.app_url) else: if not self.package_name and self.app_name: self.package_name = PlayStoreScrapperConfig.search_package_name( self.app_name ) if not self.package_name: raise ValueError("Valid `package_name`, `app_name` or `app_url` is mandatory") self.language = self.language or "en" self.countries = self.countries or ["us"] self.app_name = self.app_name or self.package_name @classmethod def parse_app_url(cls, app_url: str) -> Tuple[Optional[str], Optional[List[str]], Optional[str]]: parsed_url = parse.urlparse(app_url) query_dict = parse.parse_qs(parsed_url.query) countries = query_dict.get('gl', None) language = None languages = query_dict.get('hl', None) if languages is not None: language = languages[0] package_name = None package_ids = query_dict.get('id', None) if package_ids is not None: package_name = package_ids[0] return package_name, countries, language @classmethod def search_package_name(cls, app_name: str) -> str: base_request_url = f"https://play.google.com" search_response = perform_search( request_url=base_request_url, query=f"play store {app_name}" ) pattern = r"play.google.com/store/apps/details.+?id=([0-9a-z.]+)" match_object = re.search(pattern, search_response.text) if match_object: app_id = str(match_object.group(1)) else: raise RuntimeError("Pattern matching is not found") return app_id class PlayStoreScrapperSource(BaseSource): NAME: Optional[str] = "PlayStoreScrapper" def lookup( # type: ignore[override] self, config: PlayStoreScrapperConfig, **kwargs: Any ) -> List[TextPayload]: source_responses: List[TextPayload] = [] # Get data from state id: str = kwargs.get("id", None) state: Optional[Dict[str, Any]] = ( None if id is None or self.store is None else self.store.get_source_state(id) ) update_state: bool = True if id else False state = state or dict() if config.countries is None or len(config.countries) == 0: logger.warning("`countries` in config should not be empty or None") return source_responses for country in config.countries: country_stat: Dict[str, Any] = state.get(country, dict()) lookup_period: str = country_stat.get("since_time", config.lookup_period) lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD if len(lookup_period) <= 5: since_time = convert_utc_time(lookup_period) else: since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN) last_since_time: datetime = since_time # since_id: Optional[str] = country_stat.get("since_id", None) # last_index = since_id # state[scrapper.country] = country_stat continuation_token = None while True: store_reviews, continuation_token = reviews( app_id=config.package_name, lang=config.language, country=country, sort=Sort.NEWEST, filter_score_with=config.filter_score_with, continuation_token=continuation_token, count=config.max_count, ) store_reviews = store_reviews or [] for review in store_reviews: source_responses.append( TextPayload( processed_text=review["content"], meta=review, source_name=self.NAME, ) ) review_time = review["at"].replace(tzinfo=timezone.utc) if since_time > review_time: break if last_since_time is None or last_since_time < review_time: last_since_time = review_time # if last_index is None or last_index < review.id: # last_index = review.id if ( continuation_token is None or continuation_token.token is None or continuation_token.count <= len(source_responses) ): break country_stat["since_time"] = last_since_time.strftime( DATETIME_STRING_PATTERN ) # country_stat["since_id"] = last_index if update_state and self.store is not None: self.store.update_source_state(workflow_id=id, state=state) return source_responses