File size: 4,914 Bytes
dbaa71b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import logging
from datetime import datetime
from typing import Optional, List, Any, Dict

import requests
from pydantic import SecretStr, Field

from obsei.misc.utils import convert_utc_time, DATETIME_STRING_PATTERN
from obsei.payload import TextPayload
from obsei.source.base_source import BaseSourceConfig, BaseSource

logger = logging.getLogger(__name__)
OUTSCRAPPER_API_URL = 'https://api.app.outscraper.com'


class OSGoogleMapsReviewsConfig(BaseSourceConfig):
    NAME: str = "Maps Reviews Scrapper"
    queries: List[str]
    sort: str = "newest"
    ignore_empty_reviews: bool = True
    language: str = "en"
    since_timestamp: Optional[int] = None
    until_timestamp: Optional[int] = None
    lookup_period: Optional[str] = None
    number_of_reviews: int = 10
    number_of_places_per_query: int = 1
    country: Optional[str] = None
    filtered_fields: List[str] = Field(['reviews_data'])
    # parameter defines the coordinates of the location where you want your query to be applied.
    # It has to be constructed in the next sequence: "@" + "latitude" + "," + "longitude" + "," + "zoom"
    # (e.g. "@41.3954381,2.1628662,15.1z").
    central_coordinates: Optional[str] = None
    # Get API key from https://outscraper.com/
    api_key: Optional[SecretStr] = Field(None, env="outscrapper_api_key")

    def __init__(self, **values: Any):
        super().__init__(**values)

        if self.api_key is None:
            raise ValueError("OutScrapper API key require to fetch reviews data")


class OSGoogleMapsReviewsSource(BaseSource):
    NAME: str = "Maps Reviews Scrapper"

    def lookup(self, config: OSGoogleMapsReviewsConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        source_responses: List[TextPayload] = []

        # Get data from state
        identifier: str = kwargs.get("id", None)

        state: Optional[Dict[str, Any]] = (
            None
            if id is None or self.store is None
            else self.store.get_source_state(identifier)
        )

        update_state: bool = True if identifier else False
        state = state or dict()

        since_timestamp: Optional[int] = (
             None if state is None else state.get("since_timestamp", None)
        )
        since_timestamp = since_timestamp or config.since_timestamp
        if since_timestamp is None and config.lookup_period is not None:
            if len(config.lookup_period) <= 5:
                since_time = convert_utc_time(config.lookup_period)
            else:
                since_time = datetime.strptime(config.lookup_period, DATETIME_STRING_PATTERN)

            since_timestamp = int(since_time.timestamp())

        last_reviews_since_time = since_timestamp

        params: Dict[str, Any] = {
            'query': config.queries,
            'reviewsLimit': config.number_of_reviews,
            'limit': config.number_of_places_per_query,
            'sort': config.sort,
            # Reviews are sorted from latest to oldest in case cutoff or start is passed
            # cutoff is oldest timestamp till reviews are needed
            'cutoff': since_timestamp,
            # start is newest timestamp from reviews are needed
            'start': config.until_timestamp,
            'ignoreEmpty': config.ignore_empty_reviews,
            'coordinates': config.central_coordinates,
            'language': config.language,
            'region': config.country,
            'fields': ",".join(config.filtered_fields),
            'async': False,
        }

        # For API doc refer https://app.outscraper.com/api-docs#tag/Google-Reviews
        response = requests.get(f'{OUTSCRAPPER_API_URL}/maps/reviews-v3', params=params, headers={
            'X-API-KEY': "" if config.api_key is None else config.api_key.get_secret_value(),
        })

        queries_data = []
        if response.status_code == 200:
            queries_data = response.json().get('data', [])
        else:
            logger.warning(f"API call failed with error: {response.json()}")

        for query_data in queries_data:
            reviews = [] if "reviews_data" not in query_data else query_data.pop("reviews_data")

            for review in reviews:
                source_responses.append(
                    TextPayload(
                        processed_text=review["review_text"],
                        meta={**review, **query_data},
                        source_name=self.NAME,
                    )
                )
                review_time = review["review_timestamp"]

                if last_reviews_since_time is None or last_reviews_since_time < review_time:
                    last_reviews_since_time = review_time

        state["since_timestamp"] = last_reviews_since_time
        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses