Shreyas094 committed on
Commit
9bc0e06
·
verified ·
1 Parent(s): 6c0f253

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -4
app.py CHANGED
@@ -21,8 +21,8 @@ import os
21
  from dotenv import load_dotenv
22
  import certifi
23
  from bs4 import BeautifulSoup
24
- from trafilatura import extract
25
- from trafilatura.htmlprocessing import convert_tree
26
 
27
  # Load environment variables from a .env file
28
  load_dotenv()
@@ -99,8 +99,11 @@ def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=Fa
99
  content = ""
100
  if use_beautifulsoup:
101
  soup = BeautifulSoup(downloaded, "lxml")
102
- lxml_tree = convert_tree(soup)[0]
103
- content = extract(lxml_tree, include_comments=False, include_tables=True, no_fallback=False)
 
 
 
104
 
105
  # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
106
  if not content and use_beautifulsoup:
 
21
  from dotenv import load_dotenv
22
  import certifi
23
  from bs4 import BeautifulSoup
24
+
25
+ from trafilatura.core import parse_html
26
 
27
  # Load environment variables from a .env file
28
  load_dotenv()
 
99
  content = ""
100
  if use_beautifulsoup:
101
  soup = BeautifulSoup(downloaded, "lxml")
102
+ # Convert BeautifulSoup object to a string
103
+ html_string = str(soup)
104
+ # Use Trafilatura's parse_html function
105
+ tree = parse_html(html_string)
106
+ content = extract(tree, include_comments=False, include_tables=True, no_fallback=False)
107
 
108
  # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
109
  if not content and use_beautifulsoup: