Shreyas094 committed on
Commit
e4abe82
1 Parent(s): 9bc0e06

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -91,29 +91,35 @@ def scrape_with_bs4(url, session, max_chars=None):
91
  logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
92
  return ""
93
 
 
 
 
 
94
  def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
95
  try:
96
  response = requests.get(url, timeout=timeout)
97
  response.raise_for_status()
98
  downloaded = response.text
99
  content = ""
 
100
  if use_beautifulsoup:
101
  soup = BeautifulSoup(downloaded, "lxml")
102
  # Convert BeautifulSoup object to a string
103
  html_string = str(soup)
104
- # Use Trafilatura's parse_html function
105
- tree = parse_html(html_string)
106
- content = extract(tree, include_comments=False, include_tables=True, no_fallback=False)
107
 
108
  # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
109
  if not content and use_beautifulsoup:
110
  logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
111
  content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
112
- # If still no content, use the direct method
 
113
  if not content:
114
- content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
 
115
  return (content or "")[:max_chars] if max_chars else (content or "")
116
- except Timeout:
117
  logger.error(f"Timeout error while scraping {url} with Trafilatura")
118
  return ""
119
  except Exception as e:
 
91
  logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
92
  return ""
93
 
94
+ from bs4 import BeautifulSoup
95
+ from trafilatura import extract
96
+ import requests
97
+
98
  def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
99
  try:
100
  response = requests.get(url, timeout=timeout)
101
  response.raise_for_status()
102
  downloaded = response.text
103
  content = ""
104
+
105
  if use_beautifulsoup:
106
  soup = BeautifulSoup(downloaded, "lxml")
107
  # Convert BeautifulSoup object to a string
108
  html_string = str(soup)
109
+ # Use Trafilatura's extract function directly on the HTML string
110
+ content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
 
111
 
112
  # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
113
  if not content and use_beautifulsoup:
114
  logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
115
  content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
116
+
117
+ # If still no content, use the URL directly
118
  if not content:
119
+ content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
120
+
121
  return (content or "")[:max_chars] if max_chars else (content or "")
122
+ except requests.Timeout:
123
  logger.error(f"Timeout error while scraping {url} with Trafilatura")
124
  return ""
125
  except Exception as e: