# Export every scraped item to scraped.json as JSON
FEEDS = {"scraped.json": {"format": "json"}}
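# The output feed can also be chosen per run from the command line, e.g.
# (spider name is a placeholder):
#   scrapy crawl <spider_name> -O scraped.json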


# ScrapeOps header service (https://scrapeops.io/) -- rotates realistic browser headers
SCRAPEOPS_API_KEY = 'your_scrapeops_api_key_here'  # keep real API keys out of version control
# To rotate only the User-Agent string instead, enable these two settings:
#SCRAPEOPS_FAKE_USER_AGENT_ENDPOINT = "https://headers.scrapeops.io/v1/user-agents"
#SCRAPEOPS_FAKE_USER_AGENT_ENABLED = True
SCRAPEOPS_FAKE_BROWSER_HEADER_ENDPOINT = "https://headers.scrapeops.io/v1/browser-headers"
SCRAPEOPS_FAKE_BROWSER_HEADER_ENABLED = True
SCRAPEOPS_NUM_RESULTS = 50  # how many fake header sets to fetch per call to the header API
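# A minimal sketch of what webscraper.middlewares.ScrapeOpsFakeBrowserHeaderAgentMiddleware
# could look like (the real class lives in webscraper/middlewares.py, not here). It uses
# only the standard library, and the {"result": [...]} response shape is an assumption
# about the ScrapeOps endpoint.
import json
import random
import urllib.parse
import urllib.request

class ScrapeOpsFakeBrowserHeaderAgentMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.enabled = settings.getbool("SCRAPEOPS_FAKE_BROWSER_HEADER_ENABLED")
        endpoint = settings.get("SCRAPEOPS_FAKE_BROWSER_HEADER_ENDPOINT")
        query = urllib.parse.urlencode({
            "api_key": settings.get("SCRAPEOPS_API_KEY"),
            "num_results": settings.getint("SCRAPEOPS_NUM_RESULTS"),
        })
        # Fetch a pool of realistic browser-header sets once, at crawler start-up
        with urllib.request.urlopen(f"{endpoint}?{query}") as response:
            self.headers_list = json.loads(response.read()).get("result", [])

    def process_request(self, request, spider):
        # Attach one randomly chosen header set to every outgoing request
        if self.enabled and self.headers_list:
            request.headers.update(random.choice(self.headers_list))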


# Proxy credentials (consumed by the MyProxyMiddleware sketched below)
PROXY_USER = 'your_proxy_user_here'
PROXY_PASSWORD = 'your_proxy_password_here'
PROXY_ENDPOINT = 'your_proxy_endpoint_here'
PROXY_PORT = 'your_proxy_port_here'
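# A minimal sketch of what webscraper.middlewares.MyProxyMiddleware could look like
# (the real class lives in webscraper/middlewares.py, not here). It routes every request
# through the proxy configured above; Scrapy's built-in HttpProxyMiddleware then reads
# the credentials embedded in request.meta["proxy"] and sets the Proxy-Authorization header.
class MyProxyMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.user = settings.get("PROXY_USER")
        self.password = settings.get("PROXY_PASSWORD")
        self.endpoint = settings.get("PROXY_ENDPOINT")
        self.port = settings.get("PROXY_PORT")

    def process_request(self, request, spider):
        # user:password@host:port is the standard proxy-URL form
        request.meta["proxy"] = f"http://{self.user}:{self.password}@{self.endpoint}:{self.port}"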


# The User-Agent string is only one header among many.
# To rotate just the User-Agent, enable ScrapeOpsFakeUserAgentMiddleware;
# to rotate a complete, consistent set of browser headers, enable
# ScrapeOpsFakeBrowserHeaderAgentMiddleware (sketched above). Enable only one of the two.
# Uncomment to activate; lower priority numbers run first for outgoing requests.
#DOWNLOADER_MIDDLEWARES = {
#    "webscraper.middlewares.MyProxyMiddleware": 300,
#    "webscraper.middlewares.HttpProxyMiddleware": 350,
#    "webscraper.middlewares.ScrapeOpsFakeUserAgentMiddleware": 400,
#    "webscraper.middlewares.ScrapeOpsFakeBrowserHeaderAgentMiddleware": 400,
#}

# Obey robots.txt rules:
# True  - respect each site's robots.txt and skip disallowed pages
# False - ignore robots.txt restrictions
ROBOTSTXT_OBEY = True


# Convert scraped HTML to Markdown before items reach the feed exporter
ITEM_PIPELINES = {
    "webscraper.pipelines.HtmlToMarkdownPipeline": 300,
}
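# A minimal sketch of what webscraper.pipelines.HtmlToMarkdownPipeline could look like
# (the real class lives in webscraper/pipelines.py, not here). The "html"/"markdown"
# field names and the third-party markdownify package are assumptions for illustration.
class HtmlToMarkdownPipeline:
    def process_item(self, item, spider):
        # Deferred import so loading settings.py never hard-requires the dependency
        from markdownify import markdownify

        if item.get("html"):
            item["markdown"] = markdownify(item["html"])
        return item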


BOT_NAME = "webscraper"

SPIDER_MODULES = ["webscraper.spiders"]
NEWSPIDER_MODULE = "webscraper.spiders"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"