Spaces:

cINAWGD
/

skribd

Sleeping

App Files Files Community

rkwyu commited on Apr 28, 2024

Commit

acf17aa

1 Parent(s): 0418ab0

Add modes

Browse files

Files changed (7) hide show

README.md +27 -5
config.ini +3 -0
run.js +20 -3
src/App.js +3 -3
src/const/ScribdFlag.js +4 -0
src/{regex → const}/ScribdRegex.js +0 -0
src/service/ScribdDownloader.js +75 -11

README.md CHANGED Viewed

@@ -3,10 +3,19 @@
 ## About ##
 Scribd-dl helps downloading documents on [scribd.com](https://www.scribd.com/) without membership / sign-in.
-It takes snapshots of the pages and generate a .pdf file.
 Friendly reminder:
-The .pdf generated is formed by images, so NO text can be copied directly from it.
 ## Prerequisites ##
 Please make sure the following tool(s) / application(s) are properly setup and ready to use:
@@ -24,21 +33,34 @@ npm install
 ```
 ## Configuration ##
-Output directory can be configured in `config.ini`
 ```ini
 [DIRECTORY]
 output=output
 ```
 ## Usage (CLI) ##
 ```console
-Usage: npm start [URL]
 ```
-#### Example: Download 《The Minds of Billy Milligan》 ####
 ```console
 npm start https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
 ```
 ## License ##
 [GNU GPL v3.0](LICENSE.md)

 ## About ##
 Scribd-dl helps downloading documents on [scribd.com](https://www.scribd.com/) without membership / sign-in.
+2 modes are available:
+- default:          the .pdf file is generated by chromium's print function
+- image-based:      the .pdf file is generated by image snapshots taken for pages
+It is preferable to use the `default` mode, as it gives better performance in both generation time and file size.
+`image-based` mode is a backup solution in case the `default` mode doesn't work as expected.
 Friendly reminder:
+1. The .pdf generated by `image-based` mode is formed by images, so it does NOT contain any text.
+## Development Plan ##
+Scribd obfuscates the .pdf files, so text copied from the generated documents comes out as garbled characters.
+De-obfuscating will be the next stage.
 ## Prerequisites ##
 Please make sure the following tool(s) / application(s) are properly setup and ready to use:
 ```
 ## Configuration ##
 ```ini
+[SCRIBD]
+rendertime=100
 [DIRECTORY]
 output=output
 ```
+Configuration can be altered in `config.ini`.
+`rendertime` is the waiting time in milliseconds for rendering a single page; it only applies to `default` mode (too short a value might cause missing images).
+`output` is the output directory for generated .pdf files.
 ## Usage (CLI) ##
 ```console
+Usage: npm start [options] url
+Options:
+  /d            default: generated by chromium's print function
+  /i        image-based: generated by image snapshots taken for pages
 ```
+#### Example 1: Download 《The Minds of Billy Milligan》 ####
 ```console
 npm start https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
 ```
+#### Example 2: Download 《The Minds of Billy Milligan》 using image-based mode ####
+```console
+npm start /i https://www.scribd.com/doc/249398282/The-Minds-of-Billy-Milligan-Daniel-Keyes
+```
 ## License ##
 [GNU GPL v3.0](LICENSE.md)

config.ini CHANGED Viewed

@@ -1,2 +1,5 @@
 [DIRECTORY]
 output=output

+[SCRIBD]
+rendertime=100
 [DIRECTORY]
 output=output

run.js CHANGED Viewed

@@ -1,7 +1,24 @@
 import { app } from './src/App.js'
-if (process.argv.length == 3) {
-    await app.execute(process.argv[2])
 } else {
-    console.error(`Usage: npm start [URL]`)
 }

 import { app } from './src/App.js'
+import * as scribdFlag  from './src/const/scribdFlag.js'
+const flags = [scribdFlag.DEFAULT, scribdFlag.IMAGE]
+if (process.argv.length >= 3) {
+    let url;
+    let flag;
+    for (let i = 2; i < process.argv.length; i++) {
+        if (flags.includes(process.argv[i])) {
+            flag = process.argv[i]
+        } else {
+            url = process.argv[i]
+        }
+    }
+    await app.execute(url, flag)
 } else {
+    console.error(`
+Usage: npm start [options] url
+Options:
+  /d            default: generated by chromium's print function
+  /i        image-based: generated by image snapshots taken for pages
+    `)
 }

src/App.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { scribdDownloader } from "./service/ScribdDownloader.js"
-import * as scribdRegex from "./regex/ScribdRegex.js"
 class App {
     constructor() {
@@ -9,9 +9,9 @@ class App {
         return App.instance
     }
-    async execute(url) {
         if (url.match(scribdRegex.DOMAIN)) {
-            await scribdDownloader.execute(url)
         } else {
             throw new Error(`Unsupported URL: ${url}`)
         }

 import { scribdDownloader } from "./service/ScribdDownloader.js"
+import * as scribdRegex from "./const/ScribdRegex.js"
 class App {
     constructor() {
         return App.instance
     }
+    async execute(url, flag) {
         if (url.match(scribdRegex.DOMAIN)) {
+            await scribdDownloader.execute(url, flag)
         } else {
             throw new Error(`Unsupported URL: ${url}`)
         }

src/const/ScribdFlag.js ADDED Viewed

	@@ -0,0 +1,4 @@

 // Command-line switches that select how the .pdf is produced.
 // run.js matches process.argv entries against both values, and
 // ScribdDownloader.execute picks image-based mode when the flag equals IMAGE;
 // any other value falls back to default mode.
+const DEFAULT = "/d"
+const IMAGE = "/i"
+export { DEFAULT, IMAGE }

src/{regex → const}/ScribdRegex.js RENAMED Viewed

File without changes

src/service/ScribdDownloader.js CHANGED Viewed

@@ -3,12 +3,15 @@ import { puppeteerSg } from "../utils/request/PuppeteerSg.js";
 import { pdfGenerator } from "../utils/io/PdfGenerator.js";
 import { configLoader } from "../utils/io/ConfigLoader.js";
 import { directoryIo } from "../utils/io/DirectoryIo.js"
-import * as scribdRegex from "../regex/ScribdRegex.js"
 import { Image } from "../object/Image.js"
 import sharp from "sharp";
-const outputDir = configLoader.load("DIRECTORY", "output")
 class ScribdDownloader {
     constructor() {
@@ -18,27 +21,88 @@ class ScribdDownloader {
         return ScribdDownloader.instance
     }
-    async execute(url) {
         if (url.match(scribdRegex.DOCUMENT)) {
-            await this.embeds(`https://www.scribd.com/embeds/${scribdRegex.DOCUMENT.exec(url)[2]}/content`)
         } else if (url.match(scribdRegex.EMBED)) {
-            await this.embeds(url)
         } else {
             throw new Error(`Unsupported URL: ${url}`)
         }
     }
-    async embeds(url) {
         let deviceScaleFactor = 2
         const m = scribdRegex.EMBED.exec(url)
         if (m) {
             let id = m[1]
             // prepare temp dir
-            let dir = `${outputDir}/${id}`
             await directoryIo.create(dir)
-            // access scribd
             let page = await puppeteerSg.getPage(url)
             // wait rendering
@@ -62,7 +126,7 @@ class ScribdDownloader {
                 await page.evaluate((i) => { // eslint-disable-next-line
                     document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView()
                 }, i)
                 let width = 1191
                 let height = 1684
                 let style = await doc_pages[i].evaluate((el) => el.getAttribute("style"));
@@ -85,10 +149,10 @@ class ScribdDownloader {
             bar.stop();
             // generate pdf
-            await pdfGenerator.generate(images, `${outputDir}/${id}.pdf`)
             // remove temp dir
-            directoryIo.remove(`${outputDir}/${id}`)
             await page.close()
             await puppeteerSg.close()

 import { pdfGenerator } from "../utils/io/PdfGenerator.js";
 import { configLoader } from "../utils/io/ConfigLoader.js";
 import { directoryIo } from "../utils/io/DirectoryIo.js"
+import * as scribdRegex from "../const/ScribdRegex.js"
+import * as scribdFlag  from '../const/scribdFlag.js'
 import { Image } from "../object/Image.js"
 import sharp from "sharp";
+import path from 'path'
+const output = configLoader.load("DIRECTORY", "output")
+const rendertime = parseInt(configLoader.load("SCRIBD", "rendertime"))
 class ScribdDownloader {
     constructor() {
         return ScribdDownloader.instance
     }
+    async execute(url, flag) {
+        let fn;
+        if (flag === scribdFlag.IMAGE) {
+            console.log(`Mode: IMAGE`)
+            fn = this.embeds_image
+        } else {
+            console.log(`Mode: DEFAULT`)
+            fn = this.embeds_default
+        }
         if (url.match(scribdRegex.DOCUMENT)) {
+            await fn(`https://www.scribd.com/embeds/${scribdRegex.DOCUMENT.exec(url)[2]}/content`)
         } else if (url.match(scribdRegex.EMBED)) {
+            await fn(url)
         } else {
             throw new Error(`Unsupported URL: ${url}`)
         }
     }
+    async embeds_default(url) {
+        const m = scribdRegex.EMBED.exec(url)
+        if (m) {
+            let id = m[1]
+            // navigate to scribd
+            let page = await puppeteerSg.getPage(url)
+            // wait rendering
+            await new Promise(resolve => setTimeout(resolve, 1000))
+            // load all pages
+            let doc_pages = await page.$$("div.outer_page_container div[id^='outer_page_']")
+            const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
+            bar.start(doc_pages.length, 0);
+            for (let i = 0; i < doc_pages.length; i++) {
+                await page.evaluate((i) => { // eslint-disable-next-line
+                    document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView()  // eslint-disable-next-line
+                    document.getElementById(`outer_page_${(i + 1)}`).style.margin = 0
+                }, i)
+                await new Promise(resolve => setTimeout(resolve, rendertime))
+                bar.update(i + 1);
+            }
+            bar.stop();
+            // pdf setting
+            let options = {
+                path: `${output}/${id}.pdf`,
+                printBackground: true,
+            }
+            let first_page = await page.$("div.outer_page_container div[id^='outer_page_']")
+            let style = await first_page.evaluate((el) => el.getAttribute("style"))
+            if (style.includes("width:") && style.includes("height:")) {
+                options.height = parseInt(style.split("height:")[1].split("px")[0].trim())
+                options.width = parseInt(style.split("width:")[1].split("px")[0].trim())
+            }
+            // show doc only
+            await page.evaluate(() => { // eslint-disable-next-line
+                document.body.innerHTML = document.querySelector("div.outer_page_container").innerHTML
+            })
+            await directoryIo.create(path.dirname(options.path))
+            await page.pdf(options);
+            console.log(`Generated: ${options.path}`)
+            await page.close()
+            await puppeteerSg.close()
+        } else {
+            throw new Error(`Unsupported URL: ${url}`)
+        }
+    }
+    async embeds_image(url) {
         let deviceScaleFactor = 2
         const m = scribdRegex.EMBED.exec(url)
         if (m) {
             let id = m[1]
             // prepare temp dir
+            let dir = `${output}/${id}`
             await directoryIo.create(dir)
+            // navigate to scribd
             let page = await puppeteerSg.getPage(url)
             // wait rendering
                 await page.evaluate((i) => { // eslint-disable-next-line
                     document.getElementById(`outer_page_${(i + 1)}`).scrollIntoView()
                 }, i)
                 let width = 1191
                 let height = 1684
                 let style = await doc_pages[i].evaluate((el) => el.getAttribute("style"));
             bar.stop();
             // generate pdf
+            await pdfGenerator.generate(images, `${output}/${id}.pdf`)
             // remove temp dir
+            directoryIo.remove(`${output}/${id}`)
             await page.close()
             await puppeteerSg.close()