Spaces:
Runtime error
Runtime error
File size: 2,359 Bytes
485f76b 7c115c7 485f76b 7c115c7 485f76b 7c115c7 485f76b 7c115c7 485f76b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
#!/usr/bin/env python3
import pathlib
import ssl
import shutil
import csv
import concurrent.futures
import requests
from bs4 import BeautifulSoup
from progress.bar import ChargingBar
from entity import Entity
from common import selectors
import screenshot
def write_cert(e: "Entity"):
    """Fetch the TLS certificate served by the entity's site and save it.

    The host is parsed out of ``e.url``. On success the certificate text
    returned by ``ssl.get_server_certificate`` is written to
    ``{e.DATA_PATH}/cert``. Every failure (URL without a host, connection
    or TLS error, unwritable file) is recorded in ``{e.DATA_PATH}/error.log``
    instead of being raised, so one bad site does not abort the caller.

    Args:
        e: Entity-like object exposing ``url`` and ``DATA_PATH``.
    """
    # Local import: this function is the only user of urlsplit.
    from urllib.parse import urlsplit

    try:
        # Parsing happens inside the try block so that a malformed URL is
        # logged like any other failure rather than raising IndexError.
        parts = urlsplit(e.url)
        host = parts.hostname  # strips any "user@" prefix and ":port" suffix
        if not host:
            raise ValueError(f"cannot determine host from URL {e.url!r}")
        # Honour an explicit port only on https URLs; a port given on a
        # plain-http URL does not identify a TLS endpoint, so fall back to 443.
        port = parts.port if parts.scheme == "https" and parts.port else 443
        cert = ssl.get_server_certificate((host, port), ca_certs=None)
        with open(f"{e.DATA_PATH}/cert", "w") as f:
            f.write(cert)
    except Exception as err:
        # Deliberately broad: this step is best-effort and the error text is
        # kept next to the rest of the entity's data for later inspection.
        with open(f"{e.DATA_PATH}/error.log", "w") as f:
            f.write(str(err))
def get_logos(e: Entity, page):
    """Download the logo images referenced by an already-fetched page.

    The page body is parsed with BeautifulSoup and every element matching
    ``selectors.logo`` is inspected for a ``src`` attribute. Each image is
    saved as ``{e.DATA_PATH}/{index}{suffix}``, where ``index`` is the
    element's position among the matches and ``suffix`` is the extension of
    the URL path (empty when the path has none).

    Args:
        e: Entity providing ``url`` and ``DATA_PATH``.
        page: Response-like object exposing ``content``; its ``url`` is used
            as the base for relative references when present.

    Returns:
        List of the file paths that were written, in document order.

    Raises:
        requests.RequestException: if downloading an image fails.
    """
    # Local import: only this function needs URL joining/splitting.
    from urllib.parse import urljoin, urlsplit

    soup = BeautifulSoup(page.content, "html.parser")
    # Resolve relative references against the URL that was actually fetched
    # when the response exposes it; otherwise fall back to the entity URL.
    base_url = getattr(page, "url", None) or e.url

    saved = []
    for index, tag in enumerate(soup.select(selectors.logo)):
        src = tag.attrs.get("src")
        if not src:
            # Matching element without a usable source: nothing to download.
            continue

        target = urljoin(base_url, src)
        if urlsplit(target).scheme not in ("http", "https"):
            # e.g. inline "data:" URIs, which cannot be fetched over HTTP.
            continue

        # Take the extension from the URL path only, so a query string such
        # as "logo.png?v=3" does not leak into the file name.
        suffix = pathlib.PurePosixPath(urlsplit(src).path).suffix
        fn = f"{e.DATA_PATH}/{index}{suffix}"

        # A single streamed request covers absolute and relative sources;
        # the context manager releases the connection when done.
        with requests.get(target, stream=True) as res:
            # Have urllib3 undo gzip/deflate transfer encoding; otherwise the
            # raw stream would be written to disk still compressed.
            res.raw.decode_content = True
            with open(fn, "wb") as f:
                shutil.copyfileobj(res.raw, f)
        saved.append(fn)
    return saved
def query_vendor_site(e: Entity):
    """Collect certificate, logos and a screenshot for one entity.

    Creates the entity's data directory, fetches ``e.url`` (retrying over
    https when a plain-http request fails), then runs ``write_cert``,
    ``get_logos`` and ``screenshot.sc_entity`` in that order. When the
    https retry is used, ``e.url`` is updated in place so the later steps
    work with the URL that was actually requested.

    Args:
        e: Entity providing ``bco``, ``url`` and ``DATA_PATH``.

    Returns:
        Tuple ``(cert, logos)``: ``cert`` is the path of the saved
        certificate file, or ``None`` when no certificate file exists;
        ``logos`` is the list of logo file paths reported by ``get_logos``
        (empty when it reports none).

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    # NOTE(review): presumably this is the same directory as e.DATA_PATH —
    # confirm against the Entity definition.
    pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)

    try:
        page = requests.get(e.url)
    except requests.RequestException:
        # Only a plain-http URL can be upgraded. Rewriting just the scheme
        # prefix avoids turning "https://" into "httpss://" and leaves any
        # "http" occurring later in the URL untouched.
        if not e.url.startswith("http://"):
            raise
        e.url = "https://" + e.url[len("http://"):]
        page = requests.get(e.url)

    write_cert(e)
    # get_logos may report nothing; normalise that to an empty list.
    logos = get_logos(e, page) or []
    screenshot.sc_entity(e)

    # write_cert stores the certificate here on success and logs instead on
    # failure, so report the path only when the file is really there.
    cert_fn = f"{e.DATA_PATH}/cert"
    cert = cert_fn if pathlib.Path(cert_fn).is_file() else None
    return (cert, logos)
def from_csv(fn):
    """Process every entity listed in a CSV file using a small thread pool.

    Each CSV row is converted with ``Entity.from_dict`` and handed to
    ``query_vendor_site`` on a pool of five worker threads. As each job
    completes, its ``(cert, logos)`` result is printed, or the exception it
    raised is reported, and the progress bar advances by one step.

    Args:
        fn: Path of the CSV file; the first row supplies the column names.
    """
    # Load all rows up front; the file is not needed once they are parsed.
    with open(fn, newline='') as csvfile:
        entities = [Entity.from_dict(row) for row in csv.DictReader(csvfile)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        # Map each future back to its entity for error reporting.
        pending = {}
        for entity in entities:
            pending[pool.submit(query_vendor_site, entity)] = entity

        bar = ChargingBar('Processing', max=len(pending))
        for done in concurrent.futures.as_completed(pending):
            entity = pending[done]
            try:
                # Unpacking stays inside the try so that a malformed result
                # is reported the same way as a failed job.
                cert, logos = done.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (entity, exc))
            else:
                print(cert, logos)
            bar.next()
        bar.finish()
# Ad-hoc debugging hook, kept disabled.
# NOTE(review): the call below passes two strings, while query_vendor_site
# takes a single Entity argument — update it before re-enabling.
# query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
# exit()

# Script entry point: process the default entity list.
if __name__ == "__main__":
    from_csv("entidades.csv")
|