Spaces:
Runtime error
Runtime error
File size: 1,598 Bytes
21e639d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
from pathlib import Path
from io import BufferedWriter
from datetime import datetime
from ..spiders import PttSpider
from typing import Any, Dict, Tuple
from scrapy.exporters import BaseItemExporter, CsvItemExporter
class CsvPipeline:
    """
    Item pipeline that writes every scraped item to a CSV file.

    Items are split across files by board and by the year/month parsed from
    the item's ``"date"`` field.  The resulting layout is::

        <spider.data_dir>/<board>/<year>/<board>_<year>_<month>.csv

    ``<month>`` is the plain integer month, so it is not zero-padded.  One
    exporter and one open file handle are kept per target file for the whole
    crawl; they are all released in ``close_spider``.
    """

    def open_spider(self, spider: PttSpider) -> None:
        """Start the crawl with an empty exporter registry."""
        # Maps the extension-less target path to its (exporter, file) pair.
        self.exporters_list: Dict[str, Tuple[BaseItemExporter, BufferedWriter]] = {}

    def _exporter_for_item(
        self, item: Dict[str, Any], spider: PttSpider
    ) -> CsvItemExporter:
        """
        Return the exporter for the CSV file that ``item`` belongs to.

        The exporter (and its directory and file) is created on first use and
        reused afterwards.

        Note that ``"board"`` is removed from ``item`` here, so it is absent
        both from the exported row and from the item that ``process_item``
        returns.

        Raises:
            KeyError: if ``item`` has no ``"board"`` or ``"date"`` key.
            ValueError: if ``item["date"]`` is not ``"%Y-%m-%d %H:%M:%S"``.
            OSError: if the directory or the file cannot be created.
        """
        board = item.pop("board")
        date = datetime.strptime(item["date"], "%Y-%m-%d %H:%M:%S")
        dir_path = f"{spider.data_dir}/{board}/{date.year}"
        file_path = f"{dir_path}/{board}_{date.year}_{date.month}"

        try:
            return self.exporters_list[file_path][0]
        except KeyError:
            pass

        # Only touch the filesystem when a new target file is needed, not
        # once per item.
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        # "wb" truncates a file of the same name left behind by an earlier
        # run; the exporter expects a binary stream.
        file = open(f"{file_path}.csv", "wb")
        try:
            exporter = CsvItemExporter(file)
            exporter.start_exporting()
        except Exception:
            # Do not leak the handle when the exporter cannot be set up.
            file.close()
            raise
        self.exporters_list[file_path] = (exporter, file)
        return exporter

    def process_item(self, item: Dict[str, Any], spider: PttSpider) -> Dict[str, Any]:
        """Export ``item`` to its CSV file and hand it on to later pipelines."""
        exporter = self._exporter_for_item(item, spider)
        exporter.export_item(item)
        return item

    def close_spider(self, spider: PttSpider) -> None:
        """
        Finish every exporter and close every file opened during the crawl.

        Every file is closed even if finishing or closing another one raises;
        in that case the exception is re-raised after the clean-up completed.
        """
        from contextlib import ExitStack

        try:
            with ExitStack() as stack:
                for exporter, csv_file in self.exporters_list.values():
                    # Callbacks run in reverse registration order, so each
                    # exporter is finished before its own file is closed.
                    stack.callback(csv_file.close)
                    stack.callback(exporter.finish_exporting)
        finally:
            # Drop the references so closed handles are never reused.
            self.exporters_list.clear()
|