Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
·
dc89ab8
1
Parent(s):
68ee7bd
deno port
Browse files — Signed-off-by: Niv Sardi <[email protected]>
- Dockerfile.deno +8 -0
- docker-compose.yaml +31 -0
- src/csv.test.ts +16 -0
- src/csv.ts +28 -0
- src/img.ts +1 -0
- src/index.ts +71 -0
- src/puppet.test.ts +9 -0
- src/puppet.ts +108 -0
- src/selectors.ts +6 -0
Dockerfile.deno
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Image for the Deno port of the scraper.
FROM docker.io/denoland/deno
# MAINTAINER is deprecated; the `maintainer` label is the documented replacement.
LABEL maintainer="Niv Sardi <[email protected]>"
WORKDIR /app

# NOTE(review): this commit adds the sources under src/, while this copies
# src-deno — confirm which directory is the intended build context path.
COPY src-deno ./src
# Download and compile remote dependencies at build time.
RUN deno cache ./src/index.ts

# Explicit `run` subcommand plus the permissions the script needs (network,
# environment, and file access under ./data); without them Deno denies access
# in a non-interactive container.
CMD ["deno", "run", "--allow-net", "--allow-env", "--allow-read", "--allow-write", "./src/index.ts"]
|
docker-compose.yaml
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: "3.9"  # optional since v1.27.0
services:
  # Scraper container, built from Dockerfile.deno.
  puppet:
    build:
      dockerfile: Dockerfile.deno
      context: .
    links:
      - browserless
    environment:
      BROWSERLESS_HOST: browserless
      BROWSERLESS_PORT: 3000
      DEBUG: "puppet"
    depends_on:
      - "browserless"
    # Placeholder loop that keeps the container alive; the real entry command
    # is preserved in the trailing comment.
    command: "sh -c 'while echo deno; do sleep 3h; done'" #"deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
    volumes:
      - "./src-deno:/app/src:z"
      - "./data:/app/data:z"
    #restart: unless-stopped:600
    deploy:
      restart_policy:
        condition: any
        delay: 600s
        window: 300s

  # Headless Chromium exposing the DevTools protocol on port 3000; the shell
  # loop restarts the browser whenever it exits.
  browserless:
    image: docker.io/zenika/alpine-chrome
    entrypoint: ["sh", "-c", "while true; do chromium-browser --headless --use-gl=swiftshader --disable-software-rasterizer --disable-dev-shm-usage --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=3000; sleep 2; done"]
    # The service key is `ports` (plural); `port` is not a valid Compose key.
    ports:
      - "3000:3000"
|
31 |
+
|
src/csv.test.ts
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import {
|
2 |
+
assertEquals,
|
3 |
+
assertObjectMatch
|
4 |
+
} from "https://deno.land/[email protected]/testing/asserts.ts";
|
5 |
+
import * as CSV from './csv.ts';
|
6 |
+
|
7 |
+
// parseLine: quoted fields may contain commas; surrounding quotes and leading
// whitespace are stripped.
Deno.test("ParseLine", () => {
  assertEquals(
    CSV.parseLine('"test", "test, with", without'),
    ['test', 'test, with', 'without'],
  );
});

// parse: the first line is the header; each following line becomes one object
// keyed by the header names.
Deno.test("ParseCSV", () => {
  const res: object[] = [];
  const expected = { test: 'hello', case: 'world' };
  CSV.parse('test,case\nhello,world', e => res.push(e));
  assertObjectMatch(res[0], expected);
  assertEquals(res, [expected]);
});
|
src/csv.ts
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Splits one CSV line into its fields.
 *
 * - Fields are separated by commas.
 * - Leading whitespace before a field is dropped.
 * - A field wrapped in double quotes may contain commas; a doubled quote
 *   (`""`) inside it stands for a literal quote.
 * - Empty fields are kept (as `''`) so columns stay aligned with the header.
 * - Trailing `\r` / `\n` characters are ignored.
 *
 * @param l the raw line
 * @returns the field values in order; an empty array for an empty line
 */
export function parseLine(l: string): string[] {
  const line = l.replace(/[\r\n]+$/, '');
  const fields: string[] = [];
  if (line.length === 0) {
    return fields;
  }

  const n = line.length;
  let i = 0;
  // `<=` so that a trailing comma yields a final empty field.
  while (i <= n) {
    while (i < n && /\s/.test(line.charAt(i))) {
      i++;
    }

    let field = '';
    if (line.charAt(i) === '"') {
      i++; // opening quote
      while (i < n) {
        const c = line.charAt(i);
        if (c === '"') {
          if (line.charAt(i + 1) === '"') {
            field += '"';
            i += 2;
            continue;
          }
          i++; // closing quote
          break;
        }
        field += c;
        i++;
      }
      // Ignore anything between the closing quote and the next separator.
      while (i < n && line.charAt(i) !== ',') {
        i++;
      }
    } else {
      const start = i;
      while (i < n && line.charAt(i) !== ',') {
        i++;
      }
      field = line.slice(start, i);
    }

    fields.push(field);
    i++; // step over the separator
  }
  return fields;
}
|
8 |
+
|
9 |
+
/**
 * Parses CSV text and invokes `cb` once per data row.
 *
 * The first line is the header. Every following non-blank line is split with
 * `parseLine` and turned into an object mapping each header name to the value
 * in the same column. A row with fewer fields than the header is reported on
 * stderr and skipped; the remaining rows are still processed.
 *
 * @param t the whole CSV document
 * @param cb called with one object per valid row
 */
export function parse(t: string, cb: (o: Record<string, string>) => void): void {
  // Accept both LF and CRLF line endings.
  const lines = t.split(/\r?\n/);
  const header = parseLine(lines[0]);
  if (!header.length) {
    return;
  }

  for (const line of lines.slice(1)) {
    if (!line.trim().length) {
      continue;
    }
    const cells = parseLine(line);

    if (cells.length < header.length) {
      console.error(`couldn't parse '${line}' yielded '${cells}' of length ${cells.length} expected ${header.length}: ${header}`);
      continue;
    }

    const entry: Record<string, string> = {};
    header.forEach((key, j) => {
      entry[key] = cells[j];
    });
    cb(entry);
  }
}
|
src/img.ts
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
import * as opencv from "https://deno.land/x/[email protected]/mod.ts";
|
src/index.ts
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PQueue from "https://deno.land/x/[email protected]/mod.ts"
|
2 |
+
|
3 |
+
import * as CSV from './csv.ts';
|
4 |
+
import Puppet from './puppet.ts';
|
5 |
+
import selectors from './selectors.ts';
|
6 |
+
|
7 |
+
// Shared browser runner used by every queued task.
const puppet = new Puppet();

// Work queue: at most 10 tasks at once, each with a 60 s timeout.
const queue = new PQueue({
  concurrency: 10,
  timeout: 60000,
});

// Progress logging.
let count = 0;
queue.addEventListener("active", () =>
  console.log(`Working on item #${++count}. Size: ${queue.size} Pending: ${queue.pending}`));
queue.addEventListener("next", () =>
  console.log(`task finished, Size: ${queue.size} Pending: ${queue.pending}`));

// Periodic queue statistics, once per second, until the queue goes idle.
const statInterval = setInterval(() =>
  console.log(`Size: ${queue.size} Pending: ${queue.pending}`), 1000);

// When the queue drains: stop the statistics timer and close the browser.
queue.addEventListener("idle", async () => {
  clearInterval(statInterval);
  await puppet.close();
  console.log("all done");
});
|
25 |
+
|
26 |
+
/**
 * Visits one entity's site and captures its logos.
 *
 * The URL is first tried over https, then as given. Once the page is loaded,
 * every element matching `selectors.logo` that has a bounding box is
 * screenshotted to `./data/logos/<bco>.logo<i>.png`, one annotation line per
 * captured logo (`<bco> <center-x> <center-y> <width> <height>`) is written
 * to `./data/<bco>.chrome.full.txt`, and a full-page screenshot goes to
 * `./data/<bco>.chrome.full.png`.
 *
 * Failures are logged, never thrown. If neither URL can be loaded the task
 * ends right away instead of waiting for a `load` event that will not come.
 *
 * @param o entity record: `url` to visit, `bco` used in file names and
 *          annotations, `name` used in log output
 */
function process(o: { url: string, bco: string, name: string }): Promise<void> {
  return puppet.run(async page => {
    const url = o.url.replace('http:', 'https:');

    // `page.goto` resolves after the page's load event (its default
    // `waitUntil`), so the page is ready to capture once it returns.
    try {
      try {
        await page.goto(url);
      } catch {
        await page.goto(o.url);
      }
    } catch (e) {
      console.error(`got error: ${e}`);
      return;
    }

    try {
      const logos = await page.$$(selectors.logo);
      let annotations = '';
      for (const [i, logo] of logos.entries()) {
        const bb = await logo.boundingBox();
        if (!bb) continue;

        try {
          await logo.screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` });
          annotations +=
            `${o.bco} ${bb.x + bb.width / 2} ${bb.y + bb.height / 2} ${bb.width} ${bb.height}\n`;
        } catch (e) {
          console.error(`couldn't screenshot logo: ${e}`);
        }
      }
      if (logos.length) {
        await Deno.writeTextFile(`./data/${o.bco}.chrome.full.txt`, annotations);
      }
      await page.screenshot({ path: `./data/${o.bco}.chrome.full.png`, fullPage: true });
      console.log(`screenshot ok for ${o.name}`);
    } catch (err) {
      console.error(`error in screenshot: ${err}`);
    }
  });
}
|
69 |
+
|
70 |
+
// Read the entity list and queue one `process` task per CSV row.
const text = await Deno.readTextFile("./data/entidades.csv");
CSV.parse(text, o => queue.add(() => process(o)));
|
src/puppet.test.ts
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { assertEquals } from "https://deno.land/[email protected]/testing/asserts.ts";
|
2 |
+
import Puppet from './puppet.ts'
|
3 |
+
|
4 |
+
// Smoke test: connect, open one page, navigate, then close the browser.
Deno.test("Puppet", async () => {
  const P = new Puppet();
  await P.connect();
  await P.run(page => page.goto("https://google.com"));
  await P.close();
});
|
src/puppet.ts
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import Puppeteer from "https://deno.land/x/[email protected]/mod.ts";
|
2 |
+
import EventEmitter from "https://deno.land/x/[email protected]/mod.ts";
|
3 |
+
import type { Browser, Page } from "https://deno.land/x/[email protected]/mod.ts";
|
4 |
+
|
5 |
+
/** Browser events that the runner re-emits with a `browser:` prefix. */
const BROWSER_SIGNALS = [
  'disconnected',
  'targetchanged',
  'targetcreated',
  'targetdestroyed',
];
|
11 |
+
|
12 |
+
/** Command-line flags passed to Chrome when a local instance is launched. */
const CHROME_ARGS = [
  '--no-sandbox',
  '--disable-setuid-sandbox',
];
|
16 |
+
|
17 |
+
/**
 * Returns an IPv4 address for `a`.
 *
 * A dotted-quad IPv4 literal is returned unchanged; anything else is looked
 * up as a DNS `A` record and the first answer is returned.
 *
 * @param a host name or IPv4 literal
 * @returns a single address, safe to interpolate into a URL
 * @throws Error when the lookup returns no `A` record (lookup errors from
 *         `Deno.resolveDns` propagate as-is)
 */
async function resolve(a: string): Promise<string> {
  // Anchored so that host names that merely contain digits are still resolved.
  if (/^\d{1,3}(\.\d{1,3}){3}$/.test(a)) {
    return a;
  }
  // resolveDns yields an array of records; interpolating the array itself
  // into a URL would join several answers with commas.
  const records = await Deno.resolveDns(a, "A");
  if (!records.length) {
    throw new Error(`no A record found for '${a}'`);
  }
  return records[0];
}
|
23 |
+
|
24 |
+
/**
 * Owns one browser connection and runs page tasks on it.
 *
 * The browser is reached through the DevTools endpoint at
 * `BROWSERLESS_HOST:BROWSERLESS_PORT`; if attaching to it fails, a local,
 * non-headless Chrome is launched instead. The browser events listed in
 * `BROWSER_SIGNALS` are re-emitted on this object as `browser:<event>`, and
 * `ready` is emitted once the connection is set up.
 */
export default class Runner extends EventEmitter {
  /** Host and port of the remote browser's DevTools endpoint. */
  config: {
    BROWSERLESS_HOST: string;
    BROWSERLESS_PORT: string;
  };
  /** WebSocket endpoint; replaced by the URL the browser reports on connect. */
  target: string;
  /** The browser, once connected or launched. */
  browser: Browser | undefined;
  /** Cached result of the first connection attempt, shared by all callers. */
  connected: Promise<boolean> | undefined;

  /**
   * @param config endpoint settings; defaults to the `BROWSERLESS_HOST` /
   *               `BROWSERLESS_PORT` environment variables, falling back to
   *               `localhost` and `3000`
   */
  constructor(config = {
    BROWSERLESS_HOST: Deno.env.get("BROWSERLESS_HOST") || "localhost",
    BROWSERLESS_PORT: Deno.env.get("BROWSERLESS_PORT") || "3000",
  }) {
    super();
    this.target = `ws://${config.BROWSERLESS_HOST}:${config.BROWSERLESS_PORT}`;
    this.config = config;
  }

  /** Closes the browser if there is one; a failure is logged, not thrown. */
  public async close() {
    try {
      if (this.browser) await this.browser.close();
    } catch (err) {
      console.error(`${err} on close`)
    }
  }

  /**
   * Connects on first use; later calls share the same attempt.
   *
   * @returns `true` when a browser is available, `false` otherwise
   */
  async connect() {
    if (!this.connected) {
      this.connected = this._connect();
    }
    return this.connected;
  }

  /**
   * Performs the actual connection: asks the endpoint's `/json/version` for
   * the WebSocket debugger URL, attaches to it (or launches a local browser
   * if attaching fails), forwards browser events, and closes any pages that
   * are already open.
   *
   * @returns `true` on success; `false` when no browser could be obtained or
   *          any step failed (the error is logged)
   */
  async _connect(): Promise<boolean> {
    try {
      const host = await resolve(this.config.BROWSERLESS_HOST);
      const res = await fetch(`http://${host}:${this.config.BROWSERLESS_PORT}/json/version`);
      const ver = await res.json();
      this.target = ver.webSocketDebuggerUrl;

      this.browser = this.browser || await Puppeteer.connect({
        browserWSEndpoint: this.target
      }).catch(() => {
        console.error("\n⚠ COULD NOT CONNECT TO BROWSERLESS\n🦄 will try to spawn a chromedriver instance for you to debug")
        return Puppeteer.launch({
          args: CHROME_ARGS,
          headless: false
        })
      });

      const browser = this.browser;
      if (!browser) {
        console.error("couldn't init Browser");
        return false;
      }
      for (const signal of BROWSER_SIGNALS) {
        browser.on(signal, d => this.emit(`browser:${signal}`, d));
      }
      browser.on('error', e => console.error(`got browser error: ${e}`));

      // Start from a clean slate: close whatever pages are already open.
      const pages = await browser.pages();
      for (const p of pages) {
        await p.close();
      }
      this.emit("ready");
      return true;
    } catch (e) {
      console.error(e);
      // A failed attempt must not be reported as a successful connection.
      return false;
    }
  }

  /**
   * Opens a fresh page, runs `fn` on it and closes the page afterwards,
   * whether or not `fn` succeeded.
   *
   * @param fn the task; may be synchronous or return a promise
   * @returns whatever `fn` returned, or `undefined` when there is no browser,
   *          no page could be opened, or `fn` threw (the error is logged)
   */
  public async run<T>(fn: (page: Page) => T | Promise<T>): Promise<T | undefined> {
    await this.connect();

    if (!this.browser) {
      return;
    }
    let page: Page | undefined;
    try {
      page = await this.browser.newPage();
      if (!page) {
        return;
      }
      return await fn(page);
    } catch (e) {
      console.error(`error while running page task: ${e}`);
      return;
    } finally {
      // Always release the page so a failing task does not leak a tab.
      if (page) {
        try {
          await page.close();
        } catch (e) {
          console.error(`error while closing page: ${e}`);
        }
      }
    }
  }
}
|
src/selectors.ts
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * CSS selectors used by the scraper. Each `[attr*=value]` part matches
 * elements whose attribute contains `value` as a substring.
 */
const selectors = {
  logo: "img[src*=logo]",
  logosbancos: "img[src*=logosbancos]",
  entity_http: "p.post-pagina-interior a[target=_blank][href*=http]",
  entity_mailto: "p.post-pagina-interior a[target=_blank][href*=mailto]",
};

export default selectors;
|