Spaces:
Running
Running
File size: 1,005 Bytes
b22f922 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
import pypandoc
## stdlib
import subprocess
import json
from typing import Optional
def file_to_html(file_path: str) -> str:
    """Convert the document at ``file_path`` to HTML using pypandoc.

    Args:
        file_path: Path of the file handed to ``pypandoc.convert_file``.

    Returns:
        Whatever ``pypandoc.convert_file`` produces for the ``"html"``
        output format.
    """
    html = pypandoc.convert_file(file_path, "html")
    return html
def extract_url(url: str) -> Optional[str]:
    """Extract the readable text content of a web page.

    Runs the ``shot-scraper javascript`` command (Firefox browser) against
    ``url`` with a script that imports Mozilla's Readability from a CDN and
    returns the parsed result, then reads ``textContent`` from the JSON the
    command prints on stdout.

    Args:
        url: Address of the page to extract. It is passed to the command as
            a single argument and is never interpreted by a shell.

    Returns:
        The ``textContent`` value of the parsed result.

    Raises:
        Exception: If the command cannot be started or exits with a non-zero
            status, if its output is not valid JSON, or if the parsed result
            has no ``textContent`` entry (for example when the script
            returned a non-object such as ``null``).
    """
    # Same JavaScript the shell-string version passed, byte for byte.
    script = (
        "\n"
        "async () => {\n"
        "const readability = await import('https://cdn.skypack.dev/@mozilla/readability');\n"
        "return (new readability.Readability(document)).parse();\n"
        "}"
    )
    # An argument list (no shell) keeps quotes, semicolons, backticks, etc.
    # inside ``url`` from being executed as shell syntax.
    argv = ["shot-scraper", "javascript", "-b", "firefox", url, script]

    try:
        completed = subprocess.run(argv, capture_output=True, text=True)
    except (OSError, ValueError) as err:
        # Without a shell, a missing executable surfaces as OSError instead
        # of a non-zero exit status; ValueError covers e.g. embedded NULs.
        raise Exception(
            f"Please try copy-paste as input. Failed to extract content from url: {url}. Error: {err}"
        ) from err

    if completed.returncode != 0:
        raise Exception(
            f"Please try copy-paste as input. Failed to extract content from url: {url}. Error: {completed.stderr}"
        )

    try:
        article = json.loads(completed.stdout)
    except json.JSONDecodeError as err:
        raise Exception(
            f"Please try copy-paste as input. Failed to extract content from: {url}. Output was not valid JSON!"
        ) from err

    try:
        return article["textContent"]
    except (KeyError, TypeError) as err:
        # KeyError: object without the key; TypeError: result was not a
        # mapping at all (e.g. None decoded from JSON ``null``).
        raise Exception(
            f"Please try copy-paste as input. Failed to extract content from: {url}. Didn't find content from given URL!"
        ) from err