Spaces:

jeffeux
/

assignment-1-jeffeuxMartin

Runtime error

App Files Files Community

assignment-1-jeffeuxMartin / ptt-crawler /scraptt /settings.py

jeffeux

Add application file

21e639d almost 3 years ago

raw

history blame contribute delete

3.37 kB

	# Scrapy settings for scraptt project
	#
	# For simplicity, this file contains only settings considered important or
	# commonly used. You can find more settings consulting the documentation:
	#
	# https://docs.scrapy.org/en/latest/topics/settings.html
	# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
	# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

	# Project identity: the name of this Scrapy project's bot.
	BOT_NAME = "scraptt"

	# One package serves both purposes: `scrapy genspider` creates new spiders in
	# NEWSPIDER_MODULE, and Scrapy looks for spiders in every SPIDER_MODULES entry.
	NEWSPIDER_MODULE = "scraptt.spiders"
	SPIDER_MODULES = [NEWSPIDER_MODULE]


	# Crawl responsibly by identifying yourself (and your website) on the user-agent
	# USER_AGENT = 'scraptt (+http://www.yourdomain.com)'

	# robots.txt rules are NOT obeyed: Scrapy will fetch URLs a site disallows.
	# (False is also Scrapy's built-in default; the project template sets True.)
	ROBOTSTXT_OBEY = False

	# Maximum number of concurrent requests performed by Scrapy overall
	# (Scrapy default: 16)
	CONCURRENT_REQUESTS = 16

	# Delay, in seconds, between consecutive requests to the same download slot
	# (Scrapy default: 0)
	# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
	# See also autothrottle settings and docs
	DOWNLOAD_DELAY = 0.4
	# Per-slot concurrency limits. Per the Scrapy settings docs, a non-zero
	# CONCURRENT_REQUESTS_PER_IP takes precedence: CONCURRENT_REQUESTS_PER_DOMAIN
	# is then ignored and DOWNLOAD_DELAY is enforced per IP instead of per domain.
	# NOTE(review): newer Scrapy releases appear to deprecate
	# CONCURRENT_REQUESTS_PER_IP -- confirm against the Scrapy version in use.
	CONCURRENT_REQUESTS_PER_DOMAIN = 16
	CONCURRENT_REQUESTS_PER_IP = 16

	# Cookies are kept enabled (this is also Scrapy's default)
	COOKIES_ENABLED = True

	# Disable Telnet Console (enabled by default)
	TELNETCONSOLE_ENABLED = False

	# Override the default request headers:
	# DEFAULT_REQUEST_HEADERS = {
	# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	# 'Accept-Language': 'en',
	# }

	# Enable or disable spider middlewares
	# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
	# SPIDER_MIDDLEWARES = {
	# 'scraptt.middlewares.ScrapttSpiderMiddleware': 543,
	# }

	# Downloader middleware overrides, merged by Scrapy with its built-in defaults.
	# A number is the middleware's order value; None switches a middleware off.
	# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
	DOWNLOADER_MIDDLEWARES = {
	    # "scraptt.middlewares.ScrapttDownloaderMiddleware": 543,
	    # Project middleware, defined in the scraptt.middlewares module.
	    "scraptt.middlewares.PyqueryMiddleware": 543,
	    # Scrapy's built-in User-Agent middleware is turned off and the
	    # third-party scrapy_user_agents middleware is registered instead.
	    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
	    "scrapy_user_agents.middlewares.RandomUserAgentMiddleware": 400,
	}

	# Extension overrides; mapping an extension to None disables it.
	# The Telnet console is also switched off via TELNETCONSOLE_ENABLED above.
	# See https://docs.scrapy.org/en/latest/topics/extensions.html
	EXTENSIONS = {"scrapy.extensions.telnet.TelnetConsole": None}

	# Item pipelines applied to scraped items; lower order values run first.
	# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
	ITEM_PIPELINES = {"scraptt.pipelines.CsvPipeline": 300}

	# Enable and configure the AutoThrottle extension (disabled by default)
	# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
	# AUTOTHROTTLE_ENABLED = True
	# The initial download delay
	# AUTOTHROTTLE_START_DELAY = 5
	# The maximum download delay to be set in case of high latencies
	# AUTOTHROTTLE_MAX_DELAY = 60
	# The average number of requests Scrapy should be sending in parallel to
	# each remote server
	# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
	# Enable showing throttling stats for every response received:
	# AUTOTHROTTLE_DEBUG = False

	# Enable and configure HTTP caching (disabled by default)
	# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
	# HTTPCACHE_ENABLED = True
	# HTTPCACHE_EXPIRATION_SECS = 0
	# HTTPCACHE_DIR = 'httpcache'
	# HTTPCACHE_IGNORE_HTTP_CODES = []
	# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


	# Retry policy for failed downloads (applied by Scrapy's RetryMiddleware).
	RETRY_ENABLED = True
	# Extra attempts per request on top of the first download (Scrapy default: 2).
	RETRY_TIMES = 5
	# Response statuses that trigger a retry.
	RETRY_HTTP_CODES = [
	    500, 502, 503, 504,       # generic server-side failures
	    520, 521, 522, 524, 525,  # 52x statuses, as emitted by Cloudflare-style proxies
	    408, 429,                 # request timeout / too many requests
	]