rkwyu
committed on
Commit
·
acf17aa
1
Parent(s):
0418ab0
Add modes
Browse files- README.md +27 -5
- config.ini +3 -0
- run.js +20 -3
- src/App.js +3 -3
- src/const/ScribdFlag.js +4 -0
- src/{regex → const}/ScribdRegex.js +0 -0
- src/service/ScribdDownloader.js +75 -11
README.md
CHANGED
@@ -3,10 +3,19 @@
|
|
3 |
|
4 |
## About ##
|
5 |
Scribd-dl helps downloading documents on [scribd.com](https://www.scribd.com/) without membership / sign-in.
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
Friendly reminder:
|
9 |
-
The .pdf generated is formed by images, so
|
|
|
|
|
|
|
|
|
10 |
|
11 |
## Prerequisites ##
|
12 |
Please make sure the following tool(s) / application(s) are properly set up and ready to use:
|
@@ -24,21 +33,34 @@ npm install
|
|
24 |
```
|
25 |
|
26 |
## Configuration ##
|
27 |
-
Output directory can be configured in `config.ini`
|
28 |
```ini
|
|
|
|
|
|
|
29 |
[DIRECTORY]
|
30 |
output=output
|
31 |
```
|
|
|
|
|
|
|
32 |
|
33 |
## Usage (CLI) ##
|
34 |
```console
|
35 |
-
Usage: npm start [
|
|
|
|
|
|
|
36 |
```
|
37 |
|
38 |
-
#### Example: Download 《The Minds of Billy Milligan》 ####
|
39 |
```console
|
40 |
npm start https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
|
41 |
```
|
42 |
|
|
|
|
|
|
|
|
|
|
|
43 |
## License ##
|
44 |
[GNU GPL v3.0](LICENSE.md)
|
|
|
3 |
|
4 |
## About ##
|
5 |
Scribd-dl helps downloading documents on [scribd.com](https://www.scribd.com/) without membership / sign-in.
|
6 |
+
2 modes are available:
|
7 |
+
- default: the .pdf file is generated by chromium's print function
|
8 |
+
- image-based: the .pdf file is generated from image snapshots taken of the pages
|
9 |
+
|
10 |
+
It is preferable to use the `default` mode, as it gives better performance in generation time and file size.
|
11 |
+
`image-based` mode is a backup solution in case the `default` mode doesn't work as expected.
|
12 |
|
13 |
Friendly reminder:
|
14 |
+
1. The .pdf generated by `image-based` mode is formed by images, so it does NOT contain any text.
|
15 |
+
|
16 |
+
## Development Plan ##
|
17 |
+
Scribd obfuscates the .pdf files, so text copied from the documents turns into garbled characters.
|
18 |
+
De-obfuscating will be the next stage.
|
19 |
|
20 |
## Prerequisites ##
|
21 |
Please make sure the following tool(s) / application(s) are properly set up and ready to use:
|
|
|
33 |
```
|
34 |
|
35 |
## Configuration ##
|
|
|
36 |
```ini
|
37 |
+
[SCRIBD]
|
38 |
+
rendertime=100
|
39 |
+
|
40 |
[DIRECTORY]
|
41 |
output=output
|
42 |
```
|
43 |
+
Configuration can be altered in `config.ini`.
|
44 |
+
`rendertime` is the waiting time in milliseconds for rendering a single page; it is only applicable to `default` mode. (A value that is too short might cause missing images.)
|
45 |
+
`output` is the output directory for generated .pdf files.
|
46 |
|
47 |
## Usage (CLI) ##
|
48 |
```console
|
49 |
+
Usage: npm start [options] url
|
50 |
+
Options:
|
51 |
+
/d default: generated by chromium's print function
|
52 |
+
/i image-based: generated by image snapshots taken for pages
|
53 |
```
|
54 |
|
55 |
+
#### Example 1: Download 《The Minds of Billy Milligan》 ####
|
56 |
```console
|
57 |
npm start https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
|
58 |
```
|
59 |
|
60 |
+
#### Example 2: Download 《The Minds of Billy Milligan》 using the image-based method ####
|
61 |
+
```console
|
62 |
+
npm start /i https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
|
63 |
+
```
|
64 |
+
|
65 |
## License ##
|
66 |
[GNU GPL v3.0](LICENSE.md)
|
config.ini
CHANGED
@@ -1,2 +1,5 @@
|
|
|
|
|
|
|
|
1 |
[DIRECTORY]
|
2 |
output=output
|
|
|
1 |
+
[SCRIBD]
|
2 |
+
rendertime=100
|
3 |
+
|
4 |
[DIRECTORY]
|
5 |
output=output
|
run.js
CHANGED
@@ -1,7 +1,24 @@
|
|
1 |
import { app } from './src/App.js'
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
} else {
|
6 |
-
console.error(`
|
|
|
|
|
|
|
|
|
|
|
7 |
}
|
|
|
1 |
import { app } from './src/App.js'
|
2 |
+
import * as scribdFlag from './src/const/scribdFlag.js'
|
3 |
|
4 |
+
const flags = [scribdFlag.DEFAULT, scribdFlag.IMAGE]
|
5 |
+
|
6 |
+
if (process.argv.length >= 3) {
|
7 |
+
let url;
|
8 |
+
let flag;
|
9 |
+
for (let i = 2; i < process.argv.length; i++) {
|
10 |
+
if (flags.includes(process.argv[i])) {
|
11 |
+
flag = process.argv[i]
|
12 |
+
} else {
|
13 |
+
url = process.argv[i]
|
14 |
+
}
|
15 |
+
}
|
16 |
+
await app.execute(url, flag)
|
17 |
} else {
|
18 |
+
console.error(`
|
19 |
+
Usage: npm start [options] url
|
20 |
+
Options:
|
21 |
+
/d default: generated by chromium's print function
|
22 |
+
/i image-based: generated by image snapshots taken for pages
|
23 |
+
`)
|
24 |
}
|
src/App.js
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import { scribdDownloader } from "./service/ScribdDownloader.js"
|
2 |
-
import * as scribdRegex from "./
|
3 |
|
4 |
class App {
|
5 |
constructor() {
|
@@ -9,9 +9,9 @@ class App {
|
|
9 |
return App.instance
|
10 |
}
|
11 |
|
12 |
-
async execute(url) {
|
13 |
if (url.match(scribdRegex.DOMAIN)) {
|
14 |
-
await scribdDownloader.execute(url)
|
15 |
} else {
|
16 |
throw new Error(`Unsupported URL: ${url}`)
|
17 |
}
|
|
|
1 |
import { scribdDownloader } from "./service/ScribdDownloader.js"
|
2 |
+
import * as scribdRegex from "./const/ScribdRegex.js"
|
3 |
|
4 |
class App {
|
5 |
constructor() {
|
|
|
9 |
return App.instance
|
10 |
}
|
11 |
|
12 |
+
async execute(url, flag) {
|
13 |
if (url.match(scribdRegex.DOMAIN)) {
|
14 |
+
await scribdDownloader.execute(url, flag)
|
15 |
} else {
|
16 |
throw new Error(`Unsupported URL: ${url}`)
|
17 |
}
|
src/const/ScribdFlag.js
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
const DEFAULT = "/d"
|
2 |
+
const IMAGE = "/i"
|
3 |
+
|
4 |
+
export { DEFAULT, IMAGE }
|
src/{regex → const}/ScribdRegex.js
RENAMED
File without changes
|
src/service/ScribdDownloader.js
CHANGED
@@ -3,12 +3,15 @@ import { puppeteerSg } from "../utils/request/PuppeteerSg.js";
|
|
3 |
import { pdfGenerator } from "../utils/io/PdfGenerator.js";
|
4 |
import { configLoader } from "../utils/io/ConfigLoader.js";
|
5 |
import { directoryIo } from "../utils/io/DirectoryIo.js"
|
6 |
-
import * as scribdRegex from "../
|
|
|
7 |
import { Image } from "../object/Image.js"
|
8 |
import sharp from "sharp";
|
|
|
9 |
|
10 |
|
11 |
-
const
|
|
|
12 |
|
13 |
class ScribdDownloader {
|
14 |
constructor() {
|
@@ -18,27 +21,88 @@ class ScribdDownloader {
|
|
18 |
return ScribdDownloader.instance
|
19 |
}
|
20 |
|
21 |
-
async execute(url) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
if (url.match(scribdRegex.DOCUMENT)) {
|
23 |
-
await
|
24 |
} else if (url.match(scribdRegex.EMBED)) {
|
25 |
-
await
|
26 |
} else {
|
27 |
throw new Error(`Unsupported URL: ${url}`)
|
28 |
}
|
29 |
}
|
30 |
|
31 |
-
async
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
let deviceScaleFactor = 2
|
33 |
const m = scribdRegex.EMBED.exec(url)
|
34 |
if (m) {
|
35 |
let id = m[1]
|
36 |
|
37 |
// prepare temp dir
|
38 |
-
let dir = `${
|
39 |
await directoryIo.create(dir)
|
40 |
|
41 |
-
//
|
42 |
let page = await puppeteerSg.getPage(url)
|
43 |
|
44 |
// wait rendering
|
@@ -62,7 +126,7 @@ class ScribdDownloader {
|
|
62 |
await page.evaluate((i) => { // eslint-disable-next-line
|
63 |
document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView()
|
64 |
}, i)
|
65 |
-
|
66 |
let width = 1191
|
67 |
let height = 1684
|
68 |
let style = await doc_pages[i].evaluate((el) => el.getAttribute("style"));
|
@@ -85,10 +149,10 @@ class ScribdDownloader {
|
|
85 |
bar.stop();
|
86 |
|
87 |
// generate pdf
|
88 |
-
await pdfGenerator.generate(images, `${
|
89 |
|
90 |
// remove temp dir
|
91 |
-
directoryIo.remove(`${
|
92 |
|
93 |
await page.close()
|
94 |
await puppeteerSg.close()
|
|
|
3 |
import { pdfGenerator } from "../utils/io/PdfGenerator.js";
|
4 |
import { configLoader } from "../utils/io/ConfigLoader.js";
|
5 |
import { directoryIo } from "../utils/io/DirectoryIo.js"
|
6 |
+
import * as scribdRegex from "../const/ScribdRegex.js"
|
7 |
+
import * as scribdFlag from '../const/scribdFlag.js'
|
8 |
import { Image } from "../object/Image.js"
|
9 |
import sharp from "sharp";
|
10 |
+
import path from 'path'
|
11 |
|
12 |
|
13 |
+
const output = configLoader.load("DIRECTORY", "output")
|
14 |
+
const rendertime = parseInt(configLoader.load("SCRIBD", "rendertime"))
|
15 |
|
16 |
class ScribdDownloader {
|
17 |
constructor() {
|
|
|
21 |
return ScribdDownloader.instance
|
22 |
}
|
23 |
|
24 |
+
async execute(url, flag) {
|
25 |
+
let fn;
|
26 |
+
if (flag === scribdFlag.IMAGE) {
|
27 |
+
console.log(`Mode: IMAGE`)
|
28 |
+
fn = this.embeds_image
|
29 |
+
} else {
|
30 |
+
console.log(`Mode: DEFAULT`)
|
31 |
+
fn = this.embeds_default
|
32 |
+
}
|
33 |
if (url.match(scribdRegex.DOCUMENT)) {
|
34 |
+
await fn(`https://www.scribd.com/embeds/${scribdRegex.DOCUMENT.exec(url)[2]}/content`)
|
35 |
} else if (url.match(scribdRegex.EMBED)) {
|
36 |
+
await fn(url)
|
37 |
} else {
|
38 |
throw new Error(`Unsupported URL: ${url}`)
|
39 |
}
|
40 |
}
|
41 |
|
42 |
+
async embeds_default(url) {
|
43 |
+
const m = scribdRegex.EMBED.exec(url)
|
44 |
+
if (m) {
|
45 |
+
let id = m[1]
|
46 |
+
|
47 |
+
// navigate to scribd
|
48 |
+
let page = await puppeteerSg.getPage(url)
|
49 |
+
|
50 |
+
// wait rendering
|
51 |
+
await new Promise(resolve => setTimeout(resolve, 1000))
|
52 |
+
|
53 |
+
// load all pages
|
54 |
+
let doc_pages = await page.$$("div.outer_page_container div[id^='outer_page_']")
|
55 |
+
const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
|
56 |
+
bar.start(doc_pages.length, 0);
|
57 |
+
for (let i = 0; i < doc_pages.length; i++) {
|
58 |
+
await page.evaluate((i) => { // eslint-disable-next-line
|
59 |
+
document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView() // eslint-disable-next-line
|
60 |
+
document.getElementById(`outer_page_${(i + 1)}`).style.margin = 0
|
61 |
+
}, i)
|
62 |
+
await new Promise(resolve => setTimeout(resolve, rendertime))
|
63 |
+
bar.update(i + 1);
|
64 |
+
}
|
65 |
+
bar.stop();
|
66 |
+
|
67 |
+
// pdf setting
|
68 |
+
let options = {
|
69 |
+
path: `${output}/${id}.pdf`,
|
70 |
+
printBackground: true,
|
71 |
+
}
|
72 |
+
let first_page = await page.$("div.outer_page_container div[id^='outer_page_']")
|
73 |
+
let style = await first_page.evaluate((el) => el.getAttribute("style"))
|
74 |
+
if (style.includes("width:") && style.includes("height:")) {
|
75 |
+
options.height = parseInt(style.split("height:")[1].split("px")[0].trim())
|
76 |
+
options.width = parseInt(style.split("width:")[1].split("px")[0].trim())
|
77 |
+
}
|
78 |
+
|
79 |
+
// show doc only
|
80 |
+
await page.evaluate(() => { // eslint-disable-next-line
|
81 |
+
document.body.innerHTML = document.querySelector("div.outer_page_container").innerHTML
|
82 |
+
})
|
83 |
+
|
84 |
+
await directoryIo.create(path.dirname(options.path))
|
85 |
+
await page.pdf(options);
|
86 |
+
console.log(`Generated: ${options.path}`)
|
87 |
+
|
88 |
+
await page.close()
|
89 |
+
await puppeteerSg.close()
|
90 |
+
} else {
|
91 |
+
throw new Error(`Unsupported URL: ${url}`)
|
92 |
+
}
|
93 |
+
}
|
94 |
+
|
95 |
+
async embeds_image(url) {
|
96 |
let deviceScaleFactor = 2
|
97 |
const m = scribdRegex.EMBED.exec(url)
|
98 |
if (m) {
|
99 |
let id = m[1]
|
100 |
|
101 |
// prepare temp dir
|
102 |
+
let dir = `${output}/${id}`
|
103 |
await directoryIo.create(dir)
|
104 |
|
105 |
+
// navigate to scribd
|
106 |
let page = await puppeteerSg.getPage(url)
|
107 |
|
108 |
// wait rendering
|
|
|
126 |
await page.evaluate((i) => { // eslint-disable-next-line
|
127 |
document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView()
|
128 |
}, i)
|
129 |
+
|
130 |
let width = 1191
|
131 |
let height = 1684
|
132 |
let style = await doc_pages[i].evaluate((el) => el.getAttribute("style"));
|
|
|
149 |
bar.stop();
|
150 |
|
151 |
// generate pdf
|
152 |
+
await pdfGenerator.generate(images, `${output}/${id}.pdf`)
|
153 |
|
154 |
// remove temp dir
|
155 |
+
directoryIo.remove(`${output}/${id}`)
|
156 |
|
157 |
await page.close()
|
158 |
await puppeteerSg.close()
|