# Scrapy settings for the "webscraper" project
# (provenance: ParitKansal, commit f96e5ac "Add all files")
import os

# Feed export: write all scraped items to scraped.json as JSON.
FEEDS = {'scraped.json': {'format': 'json'}}

# --- ScrapeOps (fake browser headers) ---
# SECURITY: never commit a real API key to source control. The key is read
# from the environment; the previously committed value is kept only as a
# fallback so existing deployments keep working. Rotate that key and set
# SCRAPEOPS_API_KEY externally.
SCRAPEOPS_API_KEY = os.environ.get(
    'SCRAPEOPS_API_KEY', '8857a1e3-3e44-428f-8809-d6028ba24f0f'
)
#SCRAPEOPS_FAKE_USER_AGENT_ENDPOINT = "https://headers.scrapeops.io/v1/user-agents"
SCRAPEOPS_FAKE_BROWSER_HEADER_ENDPOINT = "https://headers.scrapeops.io/v1/browser-headers"
#SCRAPEOPS_FAKE_USER_AGENT_ENABLED = True
SCRAPEOPS_FAKE_BROWSER_HEADER_ENABLED = True
# Number of header sets to fetch from the ScrapeOps endpoint per request.
SCRAPEOPS_NUM_RESULTS = 50

# --- Proxy settings (placeholders; supply real values before enabling
# the proxy middlewares below) ---
PROXY_USER = 'your_proxy_user_here'
PROXY_PASSWORD = 'your_proxy_password_here'
PROXY_ENDPOINT = 'your_proxy_endpoint_here'
PROXY_PORT = 'your_proxy_port_here'

# A User-Agent is only one part of the request headers.
# To rotate just the User-Agent, enable ScrapeOpsFakeUserAgentMiddleware;
# to rotate the complete browser header set, enable
# ScrapeOpsFakeBrowserHeaderAgentMiddleware.
# NOTE(review): the two ScrapeOps middlewares below share priority 400 —
# give them distinct priorities if both are ever enabled at the same time.
#DOWNLOADER_MIDDLEWARES = {
#    "webscraper.middlewares.MyProxyMiddleware": 300,
#    "webscraper.middlewares.HttpProxyMiddleware": 350,
#    "webscraper.middlewares.ScrapeOpsFakeUserAgentMiddleware": 400,
#    "webscraper.middlewares.ScrapeOpsFakeBrowserHeaderAgentMiddleware": 400,
#}

# Obey robots.txt rules:
#   True  - respect the target site's crawling rules
#   False - ignore them
ROBOTSTXT_OBEY = True

# Item pipelines (lower number = runs earlier in the pipeline chain).
ITEM_PIPELINES = {
    "webscraper.pipelines.HtmlToMarkdownPipeline": 300,
}

BOT_NAME = "webscraper"
SPIDER_MODULES = ["webscraper.spiders"]
NEWSPIDER_MODULE = "webscraper.spiders"

# Settings whose default value is deprecated, pinned to future-proof values.
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"