spoof-detect / crawler / vendor.py
Niv Sardi
reorganize vendor.py and log
7c115c7
raw
history blame
2.36 kB
#!/usr/bin/env python3
import concurrent.futures
import csv
import pathlib
import shutil
import ssl
import urllib.parse

import requests
from bs4 import BeautifulSoup
from progress.bar import ChargingBar

import screenshot
from common import selectors
from entity import Entity
def write_cert(e: Entity):
    """Fetch the site's TLS certificate and save it under the entity's data dir.

    Writes the PEM certificate to ``{e.DATA_PATH}/cert``.  On any failure
    (DNS, TLS handshake, connection refused, ...) a best-effort
    ``error.log`` is written instead so a batch crawl keeps going.
    """
    # Extract just the hostname.  The old e.url.split("/")[2] kept any
    # ":port" suffix, which get_server_certificate would fail to resolve;
    # urlsplit().hostname strips port and credentials correctly.
    host = urllib.parse.urlsplit(e.url).hostname
    try:
        cert = ssl.get_server_certificate((host, 443), ca_certs=None)
        with open(f"{e.DATA_PATH}/cert", 'w') as f:
            f.write(cert)
    except Exception as err:
        # Deliberate best-effort: record the failure, never abort the crawl.
        with open(f"{e.DATA_PATH}/error.log", 'w+') as f:
            f.write(str(err))
def get_logos(e: Entity, page):
    """Download every logo image found on *page* into ``e.DATA_PATH``.

    Images are selected with ``selectors.logo`` and saved as
    ``{e.DATA_PATH}/{index}.{ext}``.

    Returns the list of local filenames written.
    """
    soup = BeautifulSoup(page.content, "html.parser")
    logos = soup.select(selectors.logo)
    lfn = []
    for i, logo in enumerate(logos):
        src = logo.attrs.get('src')
        if not src:
            # Matched element without a usable src attribute — the old
            # attrs['src'] lookup raised KeyError here; skip instead.
            continue
        ext = src.split('.')[-1].split('/')[-1]
        try:
            res = requests.get(src, stream=True)
        except Exception:
            # src was likely a relative URL; retry against the site root.
            # stream=True is required here too: without it the body is
            # already consumed and copyfileobj(res.raw, ...) writes nothing.
            res = requests.get(f"{e.url}/{src}", stream=True)
        fn = f"{e.DATA_PATH}/{i}.{ext}"
        with open(fn, "wb") as f:
            shutil.copyfileobj(res.raw, f)
        lfn.append(fn)
    return lfn
def query_vendor_site(e: Entity):
    """Crawl one vendor: fetch its page, then store cert, logos and screenshot.

    Returns a tuple ``(cert_path, logos)`` — the path the TLS certificate
    was written to and the result of ``get_logos``.

    Raises whatever ``requests`` raises if both the plain and the HTTPS
    fetch fail; ``from_csv`` catches and reports per-entity failures.
    """
    pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
    try:
        page = requests.get(e.url)
    except Exception:
        # First fetch failed; retry once over HTTPS.  Only rewrite the
        # scheme prefix — the old blanket replace('http', 'https') also
        # corrupted URLs that were already https:// into "httpss://".
        if e.url.startswith('http://'):
            e.url = 'https://' + e.url[len('http://'):]
        page = requests.get(e.url)
    write_cert(e)
    logos = get_logos(e, page)
    screenshot.sc_entity(e)
    # The original `return (fn, lfn)` referenced names local to get_logos
    # and always raised NameError; return the actual artifacts instead.
    return (f"{e.DATA_PATH}/cert", logos)
def from_csv(fn):
    """Read entities from the CSV file *fn* and crawl each one concurrently.

    Progress is shown with a ChargingBar; per-entity failures are printed
    and do not stop the remaining work.
    """
    with open(fn, newline='') as csvfile:
        rows = csv.DictReader(csvfile)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
            entities = [Entity.from_dict(row) for row in rows]
            pending = {pool.submit(query_vendor_site, ent): ent for ent in entities}
            bar = ChargingBar('Processing', max=len(pending))
            for done in concurrent.futures.as_completed(pending):
                entity = pending[done]
                try:
                    cert, logos = done.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (entity, exc))
                else:
                    print(cert, logos)
                bar.next()
            bar.finish()
# Entry point: crawl every entity listed in the bundled CSV.
# (Stale commented-out debug call removed — it used an old two-argument
# signature that query_vendor_site no longer has.)
if __name__ == '__main__':
    from_csv('entidades.csv')