# obsei/source/appstore_scrapper.py
import logging
import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
from urllib import parse
from app_store.app_store_reviews_reader import AppStoreReviewsReader
from obsei.misc.web_search import perform_search
from obsei.source.base_source import BaseSource, BaseSourceConfig
from obsei.payload import TextPayload
from obsei.misc.utils import (
DATETIME_STRING_PATTERN,
DEFAULT_LOOKUP_PERIOD,
convert_utc_time,
)
logger = logging.getLogger(__name__)
class AppStoreScrapperConfig(BaseSourceConfig):
    """Configuration for the Apple App Store review scrapper.

    The target app can be identified in three ways (checked in this order):

    * ``app_url``  -- full store URL; id, country and name are parsed from it.
    * ``app_id``   -- the numeric store identifier, used as-is.
    * ``app_name`` -- resolved to an id via a web search (see ``search_id``).

    Raises:
        ValueError: if no usable ``app_id`` can be determined.
    """

    TYPE: str = "AppStoreScrapper"
    app_url: Optional[str] = None
    countries: Optional[List[str]] = None  # defaults to ["us"] when unset
    app_id: Optional[str] = None
    app_name: Optional[str] = None
    lookup_period: Optional[str] = None  # relative period or absolute datetime string
    max_count: Optional[int] = None  # cap on reviews fetched per country

    def __init__(self, **data: Any):
        super().__init__(**data)

        if self.app_url is not None:
            # A URL overrides any explicitly supplied id/countries/name.
            self.app_id, self.countries, self.app_name = AppStoreScrapperConfig.parse_app_url(
                self.app_url
            )
        elif not self.app_id and self.app_name:
            self.app_id = AppStoreScrapperConfig.search_id(self.app_name)

        # Validate after *both* branches so that a malformed `app_url`
        # (which previously slipped through unchecked) is rejected too.
        # Message fixed: this config has `app_id`, not `package_name`.
        if not self.app_id:
            raise ValueError("Valid `app_id`, `app_name` or `app_url` is mandatory")

        self.countries = self.countries or ["us"]
        self.app_name = self.app_name or self.app_id

    @classmethod
    def parse_app_url(cls, app_url: str) -> Tuple[Optional[str], Optional[List[str]], Optional[str]]:
        """Extract ``(app_id, countries, app_name)`` from a store URL.

        Expects the canonical path shape ``/<country>/app/<name>/id<digits>``
        (five ``/``-separated segments); any element that cannot be recovered
        is returned as ``None``.
        """
        parsed_url = parse.urlparse(app_url)
        url_paths = parsed_url.path.split("/")

        countries = app_name = app_id = None
        if len(url_paths) == 5:
            countries = [url_paths[1]]
            app_name = url_paths[3]
            app_ids = url_paths[4].split("id")
            app_id = None if len(app_ids) != 2 else app_ids[1]
        return app_id, countries, app_name

    # Code is influenced from https://github.com/cowboy-bebug/app-store-scraper
    @classmethod
    def search_id(cls, app_name: str, store: str = "app") -> str:
        """Resolve an app (or podcast) name to its numeric store id via web search.

        :param app_name: human-readable name to search for.
        :param store: ``"app"`` for the App Store, anything else for Podcasts.
        :raises RuntimeError: if no ``.../id<digits>`` link appears in the results.
        """
        if store == "app":
            landing_url = "apps.apple.com"
            request_host = "amp-api.apps.apple.com"
        else:
            landing_url = "podcasts.apple.com"
            request_host = "amp-api.podcasts.apple.com"
        base_request_url = f"https://{request_host}"

        search_response = perform_search(
            request_url=base_request_url, query=f"app store {app_name}"
        )
        pattern = fr"{landing_url}/[a-z]{{2}}/.+?/id([0-9]+)"
        match_object = re.search(pattern, search_response.text)
        if match_object:
            return str(match_object.group(1))
        raise RuntimeError("Pattern matching is not found")
class AppStoreScrapperSource(BaseSource):
    """Fetches App Store reviews per configured country with incremental state."""

    NAME: Optional[str] = "AppStoreScrapper"

    def lookup(self, config: AppStoreScrapperConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch reviews newer than the per-country checkpoint.

        :param config: scrapper configuration (app id, countries, lookup period).
        :param kwargs: may carry ``id`` — a workflow identifier used to load and
            persist per-country state (``since_time``/``since_id``) via ``self.store``.
        :return: one ``TextPayload`` per review, ``"<title>. <content>"`` as text.
        """
        source_responses: List[TextPayload] = []

        # Restore persisted state when both a workflow id and a store exist.
        identifier: Optional[str] = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        update_state: bool = bool(identifier)
        state = state or dict()

        if not config.countries:
            logger.warning("`countries` in config should not be empty or None")
            return source_responses

        for country in config.countries:
            country_stat: Dict[str, Any] = state.get(country, dict())

            # Checkpoint resolution: per-country state wins over config, then the
            # default. Short values (<= 5 chars) are relative periods handled by
            # convert_utc_time; longer ones are absolute datetime strings.
            lookup_period: str = country_stat.get("since_time", config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)
            since_time = since_time.replace(tzinfo=timezone.utc)

            last_since_time: datetime = since_time
            since_id: Optional[int] = country_stat.get("since_id", None)
            last_index = since_id
            state[country] = country_stat

            scrapper = AppStoreReviewsReader(country=country, app_id=config.app_id)
            reviews = scrapper.fetch_reviews(after=since_time, since_id=since_id) or []
            if config.max_count is not None and config.max_count < len(reviews):
                reviews = reviews[: config.max_count]

            for review in reviews:
                review_time = review.date.replace(tzinfo=timezone.utc)
                # Fix: test the checkpoint *before* emitting the review; the
                # original appended first, so one stale review per country
                # leaked into the results before the loop broke.
                if review_time < since_time:
                    break

                source_responses.append(
                    TextPayload(
                        processed_text=f"{review.title}. {review.content}",
                        meta=vars(review) if hasattr(review, "__dict__") else review,
                        source_name=self.NAME,
                    )
                )

                # Advance the high-water marks used as the next checkpoint.
                if last_since_time is None or last_since_time < review_time:
                    last_since_time = review_time
                if last_index is None or last_index < review.id:
                    last_index = review.id

            country_stat["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
            country_stat["since_id"] = last_index

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses