jeffeux's picture
Add application file
21e639d
from scrapy import Spider
from typing import Optional
from ..utils.parsers.html import (
IndexParser,
LatestIndexParser,
YearBackwardIndexParser,
get_title_tags,
)
from abc import ABC, abstractmethod
from ..utils.requests import fetch_ptt_boards
from scrapy.http.response.html import HtmlResponse
class BaseSpider(Spider, ABC):
"""
The BasePostSpider object defines the behaviour for crawling and parsing ptt posts.
"""
allowed_domains = ["ptt.cc"]
def __init__(
self,
boards: str,
data_dir: str = "./data",
ip_cache: bool = False,
scrap_all: Optional[bool] = None,
index_from: Optional[int] = None,
index_to: Optional[int] = None,
since: Optional[int] = None,
*args,
**kwargs
) -> None:
super().__init__(*args, **kwargs)
self.boards = boards.split(",")
self.data_dir = data_dir
self.ip_cache = ip_cache
self.scrap_all = scrap_all
self.index_from = index_from
self.index_to = index_to
self.since = since
def start_requests(self):
return fetch_ptt_boards(
self.boards, self.parse_index, self.index_from, self.index_to
)
def parse_index(self, response: HtmlResponse):
title_tags = get_title_tags(response)
if self.scrap_all:
return LatestIndexParser(self.logger).parse(response, self.parse_index)
elif self.since:
return YearBackwardIndexParser(
self.since,
title_tags,
self.logger,
).parse(response, callback=self.parse, self_callback=self.parse_index)
else:
return IndexParser(title_tags).parse(self.parse)
@abstractmethod
def parse(self, response: HtmlResponse, **kwargs):
pass