File size: 1,005 Bytes
b22f922
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import pypandoc
## stdlib
import subprocess
import json
from typing import Optional

def file_to_html(file_path: str) -> str:
    """Convert the file at ``file_path`` to HTML using pypandoc.

    Args:
        file_path: Path of the file passed to ``pypandoc.convert_file``.

    Returns:
        The result of ``pypandoc.convert_file`` for the ``"html"`` target.
    """
    target_format = "html"
    html = pypandoc.convert_file(file_path, target_format)
    return html


def extract_url(url: str) -> Optional[str]:
    """Extract the readable text content of a web page.

    Runs the ``shot-scraper javascript`` command (Firefox browser) against
    ``url`` with a snippet that imports Mozilla's Readability from the
    Skypack CDN and parses the document. The command's stdout is decoded as
    JSON and its ``textContent`` field is returned.

    Args:
        url: Address of the page to extract.

    Returns:
        The value stored under ``textContent`` in the parsed result.

    Raises:
        Exception: If the command cannot be started, exits with a non-zero
            status, prints output that is not valid JSON, or the decoded
            result has no ``textContent`` entry.
    """
    script = """
    async () => {
      const readability = await import('https://cdn.skypack.dev/@mozilla/readability');
      return (new readability.Readability(document)).parse();
    }"""
    # Pass an argument list (no shell) so that characters in ``url`` such as
    # quotes, ``$(...)`` or ``;`` are never interpreted as shell syntax.
    cmd = ["shot-scraper", "javascript", "-b", "firefox", url, script]
    try:
        completed = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as err:
        # Without a shell, a missing/unrunnable executable raises instead of
        # producing a non-zero exit status; report it the same way.
        raise Exception(
            f"Please try copy-paste as input. Failed to extract content from url: {url}. Error: {err}"
        ) from err
    if completed.returncode != 0:
        raise Exception(
            f"Please try copy-paste as input. Failed to extract content from url: {url}. Error: {completed.stderr}"
        )
    try:
        parsed = json.loads(completed.stdout)
    except json.JSONDecodeError as err:
        raise Exception(
            f"Please try copy-paste as input. Failed to extract content from url: {url}. Error: {err}"
        ) from err
    try:
        return parsed["textContent"]
    except (KeyError, TypeError) as err:
        # TypeError covers a JSON ``null`` (or other non-mapping) result.
        raise Exception(
            f"Please try copy-paste as input. Failed to extract content from: {url}. Didn't find content from given URL!"
        ) from err