File size: 1,851 Bytes
21e639d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from scrapy import Spider
from typing import Optional
from ..utils.parsers.html import (
    IndexParser,
    LatestIndexParser,
    YearBackwardIndexParser,
    get_title_tags,
)
from abc import ABC, abstractmethod
from ..utils.requests import fetch_ptt_boards
from scrapy.http.response.html import HtmlResponse


class BaseSpider(Spider, ABC):
    """
    The BaseSpider object defines the shared behaviour for crawling ptt boards.

    It stores the crawl options, issues the initial board requests and routes
    every index page to one of the index parsers. Subclasses must implement
    ``parse`` to handle the responses those parsers request.

    Scrapy hands command-line spider arguments (``-a name=value``) over as
    plain strings, so the constructor coerces the flag and integer options to
    their declared types instead of trusting the caller.
    """

    allowed_domains = ["ptt.cc"]

    # Lower-cased spellings treated as "off" when a flag arrives as text.
    # Without this, ``-a scrap_all=False`` would be the truthy string "False".
    _FALSE_STRINGS = frozenset({"", "0", "false", "no", "n", "off", "none"})

    def __init__(
        self,
        boards: str,
        data_dir: str = "./data",
        ip_cache: bool = False,
        scrap_all: Optional[bool] = None,
        index_from: Optional[int] = None,
        index_to: Optional[int] = None,
        since: Optional[int] = None,
        *args,
        **kwargs
    ) -> None:
        """
        Store the crawl options on the spider instance.

        Args:
            boards: Comma-separated board names; surrounding whitespace and
                empty entries are discarded.
            data_dir: Stored as ``self.data_dir``; not used by this class.
            ip_cache: Stored as ``self.ip_cache``; not used by this class.
                Textual values such as ``"false"`` or ``"0"`` count as off.
            scrap_all: When truthy, index pages are handled by
                ``LatestIndexParser``. Accepts the same textual values as
                ``ip_cache``; ``None`` is kept as ``None``.
            index_from: Forwarded to ``fetch_ptt_boards``. Accepts an int, a
                numeric string, or ``None``.
            index_to: Forwarded to ``fetch_ptt_boards``. Accepts an int, a
                numeric string, or ``None``.
            since: When truthy (and ``scrap_all`` is not), index pages are
                handled by ``YearBackwardIndexParser`` built with this value.
                Accepts an int, a numeric string, or ``None``.
            *args: Forwarded to ``Spider.__init__``.
            **kwargs: Forwarded to ``Spider.__init__``.

        Raises:
            ValueError: If ``index_from``, ``index_to`` or ``since`` is a
                non-empty string that is not a valid integer.
        """
        super().__init__(*args, **kwargs)
        # Tolerate "a, b" style input and stray commas.
        self.boards = [name.strip() for name in boards.split(",") if name.strip()]
        self.data_dir = data_dir
        self.ip_cache = self._to_flag(ip_cache)
        self.scrap_all = self._to_flag(scrap_all)
        self.index_from = self._to_optional_int(index_from, "index_from")
        self.index_to = self._to_optional_int(index_to, "index_to")
        self.since = self._to_optional_int(since, "since")

    @classmethod
    def _to_flag(cls, value):
        """Interpret a textual flag; non-string values are returned unchanged."""
        if isinstance(value, str):
            return value.strip().lower() not in cls._FALSE_STRINGS
        return value

    @staticmethod
    def _to_optional_int(value, name: str):
        """Convert a numeric string to int; blank strings become ``None``.

        Non-string values (including ``None`` and ints) are returned unchanged.
        """
        if not isinstance(value, str):
            return value
        text = value.strip()
        if not text:
            return None
        try:
            return int(text)
        except ValueError as err:
            raise ValueError(
                f"Spider argument {name!r} must be an integer, got {value!r}"
            ) from err

    def start_requests(self):
        """
        Return the initial requests by delegating to ``fetch_ptt_boards``.

        The helper receives the board list, ``parse_index`` as the callback
        and the optional index bounds.
        """
        return fetch_ptt_boards(
            self.boards, self.parse_index, self.index_from, self.index_to
        )

    def parse_index(self, response: HtmlResponse):
        """
        Route an index page response to the parser matching the crawl options.

        ``scrap_all`` takes precedence over ``since``; when neither is set the
        plain ``IndexParser`` is used. The return value is whatever the chosen
        parser's ``parse`` returns.
        """
        title_tags = get_title_tags(response)

        if self.scrap_all:
            # This parser is handed parse_index itself as its callback.
            return LatestIndexParser(self.logger).parse(response, self.parse_index)
        elif self.since:
            return YearBackwardIndexParser(
                self.since,
                title_tags,
                self.logger,
            ).parse(response, callback=self.parse, self_callback=self.parse_index)
        else:
            return IndexParser(title_tags).parse(self.parse)

    @abstractmethod
    def parse(self, response: HtmlResponse, **kwargs):
        """Handle a response requested by the index parsers; subclasses must override."""
        pass