dvt81 committed on
Commit
a5390b7
·
verified ·
1 Parent(s): ac0f86b

simplifying with curl

Browse files

removing selenium to reduce complexity

Files changed (1) hide show
  1. app.py +34 -43
app.py CHANGED
@@ -5,12 +5,8 @@ import pytz
5
  import yaml
6
  from tools.final_answer import FinalAnswerTool
7
  from Gradio_UI import GradioUI
8
-
9
- from selenium import webdriver
10
- from selenium.webdriver.chrome.options import Options
11
- from selenium.webdriver.common.by import By
12
- from selenium.webdriver.chrome.service import Service
13
- from webdriver_manager.chrome import ChromeDriverManager
14
 
15
  @tool
16
  def get_zh_top_news() -> tuple[str, str]:
@@ -18,57 +14,52 @@ def get_zh_top_news() -> tuple[str, str]:
18
 
19
  Returns:
20
  tuple[str, str]: A tuple containing the article title (str) and its URL (str).
21
-
22
- Raises:
23
- Exception: If the page fails to load or the expected element is not found.
24
  """
25
- # Set up Chrome options for headless browsing
26
- chrome_options = Options()
27
- chrome_options.add_argument("--headless")
28
- chrome_options.add_argument("--disable-gpu")
29
- chrome_options.add_argument("--no-sandbox")
30
- chrome_options.add_argument("--disable-dev-shm-usage")
31
-
32
- # Specify ChromeDriver path (installed via Dockerfile)
33
- service = Service(executable_path="/usr/local/bin/chromedriver")
34
-
35
- try:
36
- # Initialize the WebDriver
37
- driver = webdriver.Chrome(service=service, options=chrome_options)
38
- print("DEBUG: WebDriver initialized successfully")
39
- except Exception as e:
40
- print(f"DEBUG: Failed to initialize WebDriver: {e}")
41
- return "Error: WebDriver failed", "https://www.zerohedge.com"
42
-
43
  try:
44
- # Navigate to ZeroHedge homepage
45
- driver.get("https://www.zerohedge.com")
46
- print("DEBUG: Page loaded")
 
 
 
 
 
 
 
 
 
47
 
48
  # Find the first <h2> with class starting with 'Article_title___'
49
- top_article = driver.find_element(By.CSS_SELECTOR, "h2[class^='Article_title___']")
50
- print(f"DEBUG: Found article object: {top_article}")
51
-
52
- # Extract the title from the <a> tag inside the <h2>
53
- article_title = top_article.find_element(By.TAG_NAME, "a").text.strip() or "No title found"
54
-
55
- # Extract the URL from the href attribute of the <a> tag
56
- article_link = top_article.find_element(By.TAG_NAME, "a").get_attribute("href") or "https://www.zerohedge.com"
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  # Ensure the link is absolute
59
  if not article_link.startswith("http"):
60
  article_link = f"https://www.zerohedge.com{article_link}"
61
 
62
- print(f"DEBUG: Returning title='{article_title}', link='{article_link}'")
63
  return article_title, article_link
64
 
 
 
 
65
  except Exception as e:
66
- print(f"Error retrieving top headline: {e}")
67
  return "Error: Headline not found", "https://www.zerohedge.com"
68
 
69
- finally:
70
- driver.quit()
71
-
72
  @tool
73
  def get_current_time_in_timezone(timezone: str) -> str:
74
  """A tool that fetches the current local time in a specified timezone.
 
5
  import yaml
6
  from tools.final_answer import FinalAnswerTool
7
  from Gradio_UI import GradioUI
8
+ from bs4 import BeautifulSoup
9
+ import subprocess
 
 
 
 
10
 
11
  @tool
12
  def get_zh_top_news() -> tuple[str, str]:
 
14
 
15
  Returns:
16
  tuple[str, str]: A tuple containing the article title (str) and its URL (str).
 
 
 
17
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  try:
19
+ # Use curl to fetch the HTML content
20
+ result = subprocess.run(
21
+ ["curl", "-s", "https://www.zerohedge.com"],
22
+ capture_output=True,
23
+ text=True,
24
+ check=True
25
+ )
26
+ html_content = result.stdout
27
+ print(f"DEBUG: Fetched HTML length: {len(html_content)}") # Debug: Check if content is retrieved
28
+
29
+ # Parse HTML with BeautifulSoup
30
+ soup = BeautifulSoup(html_content, "html.parser")
31
 
32
  # Find the first <h2> with class starting with 'Article_title___'
33
+ top_article = soup.find("h2", class_=lambda x: x and x.startswith("Article_title___"))
 
 
 
 
 
 
 
34
 
35
+ if not top_article:
36
+ print("DEBUG: No matching <h2> found")
37
+ return "Error: Headline not found", "https://www.zerohedge.com"
38
+
39
+ # Extract the <a> tag inside the <h2>
40
+ link_tag = top_article.find("a")
41
+ if not link_tag:
42
+ print("DEBUG: No <a> tag found in top article")
43
+ return "Error: Headline not found", "https://www.zerohedge.com"
44
+
45
+ # Get title and URL
46
+ article_title = link_tag.get_text(strip=True) or "No title found"
47
+ article_link = link_tag.get("href") or "https://www.zerohedge.com"
48
+
49
  # Ensure the link is absolute
50
  if not article_link.startswith("http"):
51
  article_link = f"https://www.zerohedge.com{article_link}"
52
 
53
+ print(f"DEBUG: Title = '{article_title}', Link = '{article_link}'")
54
  return article_title, article_link
55
 
56
+ except subprocess.CalledProcessError as e:
57
+ print(f"Error fetching page with curl: {e}")
58
+ return "Error: Fetch failed", "https://www.zerohedge.com"
59
  except Exception as e:
60
+ print(f"Error parsing content: {e}")
61
  return "Error: Headline not found", "https://www.zerohedge.com"
62
 
 
 
 
63
  @tool
64
  def get_current_time_in_timezone(timezone: str) -> str:
65
  """A tool that fetches the current local time in a specified timezone.