File size: 2,359 Bytes
485f76b
 
 
 
 
 
 
 
 
 
 
 
 
 
7c115c7
485f76b
 
7c115c7
 
 
 
 
 
485f76b
7c115c7
 
485f76b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c115c7
 
 
 
 
 
 
 
 
 
 
 
485f76b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
import pathlib
import ssl
import shutil
import csv
import concurrent.futures
import requests
from bs4 import BeautifulSoup
from progress.bar import ChargingBar

from entity import Entity
from common import selectors
import screenshot

def write_cert(e: Entity):
    """Fetch the TLS certificate served by the entity's site and save it.

    The certificate text returned by ``ssl.get_server_certificate`` is written
    to ``{e.DATA_PATH}/cert``. This is best-effort: any failure (unparsable
    URL, connection error, unwritable file) is recorded in
    ``{e.DATA_PATH}/error.log`` instead of being raised.

    Args:
        e: Entity providing ``url`` (site address) and ``DATA_PATH`` (output
            directory, which must already exist).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    try:
        # A bare "host/path" has no netloc for urlsplit unless it is prefixed
        # with "//", so scheme-less URLs are normalised first.
        target = urlsplit(e.url if "://" in e.url else f"//{e.url}")
        host = target.hostname  # strips any "user@" prefix and ":port" suffix
        if not host:
            raise ValueError(f"cannot determine host from URL {e.url!r}")
        # Only trust an explicit port when the URL is already HTTPS; a plain
        # HTTP port (e.g. 8080) does not speak TLS, so fall back to 443.
        port = target.port if (target.scheme == "https" and target.port) else 443

        cert = ssl.get_server_certificate((host, port))
        with open(f"{e.DATA_PATH}/cert", 'w') as f:
            f.write(cert)
    except Exception as err:
        # Deliberate catch-all: a missing certificate must not abort the rest
        # of the processing for this entity.
        with open(f"{e.DATA_PATH}/error.log", 'w') as f:
            f.write(str(err))

def get_logos(e: Entity, page, timeout: float = 30):
    """Download the logo images referenced by ``page`` into the data directory.

    The page HTML is parsed with BeautifulSoup and candidate elements are
    picked with ``selectors.logo``. Each downloadable image is stored as
    ``{e.DATA_PATH}/{index}.{ext}``, where ``index`` is the element's position
    among the selector matches and ``ext`` comes from the URL path.

    Args:
        e: Entity providing ``url`` (base for relative ``src`` values) and
            ``DATA_PATH`` (output directory).
        page: Fetched page; only ``page.content`` is read.
        timeout: Seconds to wait on each image request before giving up.

    Returns:
        List of the file paths written, in page order.

    Raises:
        requests.RequestException: if an image request fails.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin, urlsplit

    soup = BeautifulSoup(page.content, "html.parser")

    saved = []
    for index, tag in enumerate(soup.select(selectors.logo)):
        src = tag.get("src")
        if not src:
            # Matched element carries no image source; nothing to download.
            continue

        # urljoin resolves relative, root-relative and protocol-relative
        # sources against the site URL and leaves absolute URLs untouched.
        logo_url = urljoin(e.url, src)
        parts = urlsplit(logo_url)
        if parts.scheme not in ("http", "https"):
            # e.g. inline "data:" URIs, which requests cannot fetch.
            continue

        # Derive the extension from the path only so a query string such as
        # "?v=2" does not leak into the file name.
        ext = parts.path.split('.')[-1].split('/')[-1]
        fn = f"{e.DATA_PATH}/{index}.{ext}"

        with requests.get(logo_url, stream=True, timeout=timeout) as res:
            with open(fn, "wb") as f:
                # iter_content applies content decoding (gzip/deflate),
                # which reading res.raw directly would skip.
                for chunk in res.iter_content(chunk_size=8192):
                    f.write(chunk)
        saved.append(fn)

    return saved

def query_vendor_site(e: Entity, timeout: float = 30):
    """Collect certificate, logos and a screenshot for one entity's site.

    Creates ``./data/{e.bco}``, fetches ``e.url`` (retrying over HTTPS when a
    plain-HTTP request fails, in which case ``e.url`` is updated in place),
    then runs ``write_cert``, ``get_logos`` and ``screenshot.sc_entity``.

    Args:
        e: Entity to process; ``url``, ``bco`` and ``DATA_PATH`` are read.
        timeout: Seconds to wait on each page request before giving up.

    Returns:
        A ``(cert, logos)`` tuple. ``cert`` is the path ``{e.DATA_PATH}/cert``
        if that file exists after ``write_cert`` ran (it may be left over from
        an earlier run), otherwise ``None``. ``logos`` is the list returned by
        ``get_logos``, or an empty list if it returned nothing.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)

    try:
        page = requests.get(e.url, timeout=timeout)
    except requests.RequestException:
        # Only a plain-HTTP URL can be upgraded; blindly replacing "http"
        # would turn "https://" into "httpss://" and mangle later matches.
        if not e.url.lower().startswith("http://"):
            raise
        e.url = "https://" + e.url[len("http://"):]
        page = requests.get(e.url, timeout=timeout)

    write_cert(e)
    # Tolerate a get_logos that returns None.
    logos = get_logos(e, page) or []
    screenshot.sc_entity(e)

    # write_cert is best-effort, so only report the certificate if it exists.
    cert_file = f"{e.DATA_PATH}/cert"
    cert = cert_file if pathlib.Path(cert_file).is_file() else None
    return (cert, logos)

def from_csv(fn):
    """Run ``query_vendor_site`` for every row of a CSV file.

    Each row is turned into an ``Entity`` via ``Entity.from_dict`` and handed
    to a pool of five worker threads. As each job finishes, its
    ``(cert, logos)`` result is printed, or the exception it raised is
    reported, and the progress bar advances.

    Args:
        fn: Path of the CSV file; its header row supplies the dict keys.
    """
    with open(fn, newline='') as source:
        # Materialise all entities before any work is submitted.
        entities = [Entity.from_dict(row) for row in csv.DictReader(source)]

        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
            # Map each future back to its entity for error reporting.
            pending = {}
            for entity in entities:
                pending[pool.submit(query_vendor_site, entity)] = entity

            progress = ChargingBar('Processing', max=len(pending))
            for done in concurrent.futures.as_completed(pending):
                entity = pending[done]
                try:
                    cert, logos = done.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (entity, exc))
                else:
                    print(cert, logos)
                progress.next()
            progress.finish()

# Disabled debugging aid: process a single site and stop.
# NOTE(review): this call passes a URL string plus a tag, but
# query_vendor_site as defined in this file takes an Entity; update the call
# before re-enabling it.
#query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
#exit()

# Script entry point: process every entity listed in entidades.csv.
if __name__ == '__main__':
    from_csv('entidades.csv')