# Scrapy settings for the "webscraper" project
# (provenance: ParitKansal, commit f96e5ac "Add all files")
import os

# Feed export: write all scraped items to scraped.json as JSON.
FEEDS = {'scraped.json': {'format': 'json'}}

# --- ScrapeOps (fake browser headers) ---
# SECURITY: never commit a real API key to source control. The key is read
# from the environment; the previously committed value is kept only as a
# fallback so existing deployments keep working. Rotate that key and set
# SCRAPEOPS_API_KEY externally.
SCRAPEOPS_API_KEY = os.environ.get(
    'SCRAPEOPS_API_KEY', '8857a1e3-3e44-428f-8809-d6028ba24f0f'
)
#SCRAPEOPS_FAKE_USER_AGENT_ENDPOINT = "https://headers.scrapeops.io/v1/user-agents"
SCRAPEOPS_FAKE_BROWSER_HEADER_ENDPOINT = "https://headers.scrapeops.io/v1/browser-headers"
#SCRAPEOPS_FAKE_USER_AGENT_ENABLED = True
SCRAPEOPS_FAKE_BROWSER_HEADER_ENABLED = True
# Number of header sets to fetch from the ScrapeOps endpoint per request.
SCRAPEOPS_NUM_RESULTS = 50

# --- Proxy settings (placeholders; supply real values before enabling
# the proxy middlewares below) ---
PROXY_USER = 'your_proxy_user_here'
PROXY_PASSWORD = 'your_proxy_password_here'
PROXY_ENDPOINT = 'your_proxy_endpoint_here'
PROXY_PORT = 'your_proxy_port_here'

# A User-Agent is only one part of the request headers.
# To rotate just the User-Agent, enable ScrapeOpsFakeUserAgentMiddleware;
# to rotate the complete browser header set, enable
# ScrapeOpsFakeBrowserHeaderAgentMiddleware.
# NOTE(review): the two ScrapeOps middlewares below share priority 400 —
# give them distinct priorities if both are ever enabled at the same time.
#DOWNLOADER_MIDDLEWARES = {
#    "webscraper.middlewares.MyProxyMiddleware": 300,
#    "webscraper.middlewares.HttpProxyMiddleware": 350,
#    "webscraper.middlewares.ScrapeOpsFakeUserAgentMiddleware": 400,
#    "webscraper.middlewares.ScrapeOpsFakeBrowserHeaderAgentMiddleware": 400,
#}

# Obey robots.txt rules:
#   True  - respect the target site's crawling rules
#   False - ignore them
ROBOTSTXT_OBEY = True

# Item pipelines (lower number = runs earlier in the pipeline chain).
ITEM_PIPELINES = {
    "webscraper.pipelines.HtmlToMarkdownPipeline": 300,
}

BOT_NAME = "webscraper"
SPIDER_MODULES = ["webscraper.spiders"]
NEWSPIDER_MODULE = "webscraper.spiders"

# Settings whose default value is deprecated, pinned to future-proof values.
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"