rkwyu committed on
Commit
acf17aa
·
1 Parent(s): 0418ab0
README.md CHANGED
@@ -3,10 +3,19 @@
3
 
4
  ## About ##
5
  Scribd-dl helps downloading documents on [scribd.com](https://www.scribd.com/) without membership / sign-in.
6
- It takes snapshots of the pages and generate a .pdf file.
 
 
 
 
 
7
 
8
  Friendly reminder:
9
- The .pdf generated is formed by images, so NO text can be copied directly from it.
 
 
 
 
10
 
11
  ## Prerequisites ##
12
  Please make sure the following tool(s) / application(s) are properly setup and ready to use:
@@ -24,21 +33,34 @@ npm install
24
  ```
25
 
26
  ## Configuration ##
27
- Output directory can be configured in `config.ini`
28
  ```ini
 
 
 
29
  [DIRECTORY]
30
  output=output
31
  ```
 
 
 
32
 
33
  ## Usage (CLI) ##
34
  ```console
35
- Usage: npm start [URL]
 
 
 
36
  ```
37
 
38
- #### Example: Download 《The Minds of Billy Milligan》 ####
39
  ```console
40
  npm start https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
41
  ```
42
 
 
 
 
 
 
43
  ## License ##
44
  [GNU GPL v3.0](LICENSE.md)
 
3
 
4
  ## About ##
5
  Scribd-dl helps downloading documents on [scribd.com](https://www.scribd.com/) without membership / sign-in.
6
+ 2 modes are available:
7
+ - default: the .pdf file is generated by chromium's print function
8
+ - image-based: the .pdf file is generated from image snapshots taken of the pages
9
+
10
+ It is preferable to use the `default` mode, as it gives better performance in both generation time and file size.
11
+ `image-based` mode is a backup solution in case the `default` mode doesn't work as expected.
12
 
13
  Friendly reminder:
14
+ 1. The .pdf generated by `image-based` mode is formed by images, so it does NOT contain any text.
15
+
16
+ ## Development Plan ##
17
+ Scribd obfuscates the .pdf files, so text copied from the generated documents comes out garbled.
18
+ De-obfuscating will be the next stage.
19
 
20
  ## Prerequisites ##
21
  Please make sure the following tool(s) / application(s) are properly setup and ready to use:
 
33
  ```
34
 
35
  ## Configuration ##
 
36
  ```ini
37
+ [SCRIBD]
38
+ rendertime=100
39
+
40
  [DIRECTORY]
41
  output=output
42
  ```
43
+ Configuration can be altered in `config.ini`.
44
+ `rendertime` is the waiting time in milliseconds for rendering a single page; it is only applicable to `default` mode (too short a value might cause missing images).
45
+ `output` is the output directory for generated .pdf files.
46
 
47
  ## Usage (CLI) ##
48
  ```console
49
+ Usage: npm start [options] url
50
+ Options:
51
+ /d default: generated by chromium's print function
52
+ /i image-based: generated by image snapshots taken for pages
53
  ```
54
 
55
+ #### Example 1: Download 《The Minds of Billy Milligan》 ####
56
  ```console
57
  npm start https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
58
  ```
59
 
60
+ #### Example 2: Download 《The Minds of Billy Milligan》 using image-based method ####
61
+ ```console
62
+ npm start /i https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
63
+ ```
64
+
65
  ## License ##
66
  [GNU GPL v3.0](LICENSE.md)
config.ini CHANGED
@@ -1,2 +1,5 @@
 
 
 
1
  [DIRECTORY]
2
  output=output
 
1
+ [SCRIBD]
2
+ rendertime=100
3
+
4
  [DIRECTORY]
5
  output=output
run.js CHANGED
@@ -1,7 +1,24 @@
1
  import { app } from './src/App.js'
 
2
 
3
- if (process.argv.length == 3) {
4
- await app.execute(process.argv[2])
 
 
 
 
 
 
 
 
 
 
 
5
  } else {
6
- console.error(`Usage: npm start [URL]`)
 
 
 
 
 
7
  }
 
1
  import { app } from './src/App.js'
2
+ import * as scribdFlag from './src/const/scribdFlag.js'
3
 
4
+ const flags = [scribdFlag.DEFAULT, scribdFlag.IMAGE]
5
+
6
+ if (process.argv.length >= 3) {
7
+ let url;
8
+ let flag;
9
+ for (let i = 2; i < process.argv.length; i++) {
10
+ if (flags.includes(process.argv[i])) {
11
+ flag = process.argv[i]
12
+ } else {
13
+ url = process.argv[i]
14
+ }
15
+ }
16
+ await app.execute(url, flag)
17
  } else {
18
+ console.error(`
19
+ Usage: npm start [options] url
20
+ Options:
21
+ /d default: generated by chromium's print function
22
+ /i image-based: generated by image snapshots taken for pages
23
+ `)
24
  }
src/App.js CHANGED
@@ -1,5 +1,5 @@
1
  import { scribdDownloader } from "./service/ScribdDownloader.js"
2
- import * as scribdRegex from "./regex/ScribdRegex.js"
3
 
4
  class App {
5
  constructor() {
@@ -9,9 +9,9 @@ class App {
9
  return App.instance
10
  }
11
 
12
- async execute(url) {
13
  if (url.match(scribdRegex.DOMAIN)) {
14
- await scribdDownloader.execute(url)
15
  } else {
16
  throw new Error(`Unsupported URL: ${url}`)
17
  }
 
1
  import { scribdDownloader } from "./service/ScribdDownloader.js"
2
+ import * as scribdRegex from "./const/ScribdRegex.js"
3
 
4
  class App {
5
  constructor() {
 
9
  return App.instance
10
  }
11
 
12
+ async execute(url, flag) {
13
  if (url.match(scribdRegex.DOMAIN)) {
14
+ await scribdDownloader.execute(url, flag)
15
  } else {
16
  throw new Error(`Unsupported URL: ${url}`)
17
  }
src/const/ScribdFlag.js ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ const DEFAULT = "/d"
2
+ const IMAGE = "/i"
3
+
4
+ export { DEFAULT, IMAGE }
src/{regex → const}/ScribdRegex.js RENAMED
File without changes
src/service/ScribdDownloader.js CHANGED
@@ -3,12 +3,15 @@ import { puppeteerSg } from "../utils/request/PuppeteerSg.js";
3
  import { pdfGenerator } from "../utils/io/PdfGenerator.js";
4
  import { configLoader } from "../utils/io/ConfigLoader.js";
5
  import { directoryIo } from "../utils/io/DirectoryIo.js"
6
- import * as scribdRegex from "../regex/ScribdRegex.js"
 
7
  import { Image } from "../object/Image.js"
8
  import sharp from "sharp";
 
9
 
10
 
11
- const outputDir = configLoader.load("DIRECTORY", "output")
 
12
 
13
  class ScribdDownloader {
14
  constructor() {
@@ -18,27 +21,88 @@ class ScribdDownloader {
18
  return ScribdDownloader.instance
19
  }
20
 
21
- async execute(url) {
 
 
 
 
 
 
 
 
22
  if (url.match(scribdRegex.DOCUMENT)) {
23
- await this.embeds(`https://www.scribd.com/embeds/${scribdRegex.DOCUMENT.exec(url)[2]}/content`)
24
  } else if (url.match(scribdRegex.EMBED)) {
25
- await this.embeds(url)
26
  } else {
27
  throw new Error(`Unsupported URL: ${url}`)
28
  }
29
  }
30
 
31
- async embeds(url) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  let deviceScaleFactor = 2
33
  const m = scribdRegex.EMBED.exec(url)
34
  if (m) {
35
  let id = m[1]
36
 
37
  // prepare temp dir
38
- let dir = `${outputDir}/${id}`
39
  await directoryIo.create(dir)
40
 
41
- // access scribd
42
  let page = await puppeteerSg.getPage(url)
43
 
44
  // wait rendering
@@ -62,7 +126,7 @@ class ScribdDownloader {
62
  await page.evaluate((i) => { // eslint-disable-next-line
63
  document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView()
64
  }, i)
65
-
66
  let width = 1191
67
  let height = 1684
68
  let style = await doc_pages[i].evaluate((el) => el.getAttribute("style"));
@@ -85,10 +149,10 @@ class ScribdDownloader {
85
  bar.stop();
86
 
87
  // generate pdf
88
- await pdfGenerator.generate(images, `${outputDir}/${id}.pdf`)
89
 
90
  // remove temp dir
91
- directoryIo.remove(`${outputDir}/${id}`)
92
 
93
  await page.close()
94
  await puppeteerSg.close()
 
3
  import { pdfGenerator } from "../utils/io/PdfGenerator.js";
4
  import { configLoader } from "../utils/io/ConfigLoader.js";
5
  import { directoryIo } from "../utils/io/DirectoryIo.js"
6
+ import * as scribdRegex from "../const/ScribdRegex.js"
7
+ import * as scribdFlag from '../const/scribdFlag.js'
8
  import { Image } from "../object/Image.js"
9
  import sharp from "sharp";
10
+ import path from 'path'
11
 
12
 
13
+ const output = configLoader.load("DIRECTORY", "output")
14
+ const rendertime = parseInt(configLoader.load("SCRIBD", "rendertime"))
15
 
16
  class ScribdDownloader {
17
  constructor() {
 
21
  return ScribdDownloader.instance
22
  }
23
 
24
+ async execute(url, flag) {
25
+ let fn;
26
+ if (flag === scribdFlag.IMAGE) {
27
+ console.log(`Mode: IMAGE`)
28
+ fn = this.embeds_image
29
+ } else {
30
+ console.log(`Mode: DEFAULT`)
31
+ fn = this.embeds_default
32
+ }
33
  if (url.match(scribdRegex.DOCUMENT)) {
34
+ await fn(`https://www.scribd.com/embeds/${scribdRegex.DOCUMENT.exec(url)[2]}/content`)
35
  } else if (url.match(scribdRegex.EMBED)) {
36
+ await fn(url)
37
  } else {
38
  throw new Error(`Unsupported URL: ${url}`)
39
  }
40
  }
41
 
42
+ async embeds_default(url) {
43
+ const m = scribdRegex.EMBED.exec(url)
44
+ if (m) {
45
+ let id = m[1]
46
+
47
+ // navigate to scribd
48
+ let page = await puppeteerSg.getPage(url)
49
+
50
+ // wait rendering
51
+ await new Promise(resolve => setTimeout(resolve, 1000))
52
+
53
+ // load all pages
54
+ let doc_pages = await page.$$("div.outer_page_container div[id^='outer_page_']")
55
+ const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
56
+ bar.start(doc_pages.length, 0);
57
+ for (let i = 0; i < doc_pages.length; i++) {
58
+ await page.evaluate((i) => { // eslint-disable-next-line
59
+ document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView() // eslint-disable-next-line
60
+ document.getElementById(`outer_page_${(i + 1)}`).style.margin = 0
61
+ }, i)
62
+ await new Promise(resolve => setTimeout(resolve, rendertime))
63
+ bar.update(i + 1);
64
+ }
65
+ bar.stop();
66
+
67
+ // pdf setting
68
+ let options = {
69
+ path: `${output}/${id}.pdf`,
70
+ printBackground: true,
71
+ }
72
+ let first_page = await page.$("div.outer_page_container div[id^='outer_page_']")
73
+ let style = await first_page.evaluate((el) => el.getAttribute("style"))
74
+ if (style.includes("width:") && style.includes("height:")) {
75
+ options.height = parseInt(style.split("height:")[1].split("px")[0].trim())
76
+ options.width = parseInt(style.split("width:")[1].split("px")[0].trim())
77
+ }
78
+
79
+ // show doc only
80
+ await page.evaluate(() => { // eslint-disable-next-line
81
+ document.body.innerHTML = document.querySelector("div.outer_page_container").innerHTML
82
+ })
83
+
84
+ await directoryIo.create(path.dirname(options.path))
85
+ await page.pdf(options);
86
+ console.log(`Generated: ${options.path}`)
87
+
88
+ await page.close()
89
+ await puppeteerSg.close()
90
+ } else {
91
+ throw new Error(`Unsupported URL: ${url}`)
92
+ }
93
+ }
94
+
95
+ async embeds_image(url) {
96
  let deviceScaleFactor = 2
97
  const m = scribdRegex.EMBED.exec(url)
98
  if (m) {
99
  let id = m[1]
100
 
101
  // prepare temp dir
102
+ let dir = `${output}/${id}`
103
  await directoryIo.create(dir)
104
 
105
+ // navigate to scribd
106
  let page = await puppeteerSg.getPage(url)
107
 
108
  // wait rendering
 
126
  await page.evaluate((i) => { // eslint-disable-next-line
127
  document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView()
128
  }, i)
129
+
130
  let width = 1191
131
  let height = 1684
132
  let style = await doc_pages[i].evaluate((el) => el.getAttribute("style"));
 
149
  bar.stop();
150
 
151
  // generate pdf
152
+ await pdfGenerator.generate(images, `${output}/${id}.pdf`)
153
 
154
  // remove temp dir
155
+ directoryIo.remove(`${output}/${id}`)
156
 
157
  await page.close()
158
  await puppeteerSg.close()