# finance-tracker / news_scraper.py
# Uploaded by arachn0id ("Upload 7 files", commit f3d9e94, verified).
import json
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
# Substrings that disqualify a search result: parse_results drops any result
# whose link contains one of these.
excluded_urls = ["finance.yahoo.com", "google.com/finance"]
def search_duckduckgo(keywords, timeout=10):
    """Fetch the DuckDuckGo HTML results page for a list of keywords.

    Args:
        keywords: Iterable of search-term strings. They are joined with
            spaces into a single query.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response body as text, or an empty string if the request failed
        (connection error, timeout, or an HTTP error status).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        # Hand the query to requests via ``params`` so it is URL-encoded:
        # spaces become "+", and characters such as "&" or "#" inside a
        # keyword can no longer corrupt the query string.
        response = requests.get(
            "https://duckduckgo.com/html/",
            params={"q": " ".join(keywords)},
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching search results: {e}")
        return ""
    return response.text
def _resolve_result_link(href):
    """Return the real target URL behind a DuckDuckGo result href.

    DuckDuckGo's HTML endpoint typically wraps result targets in a redirect
    of the form ``//duckduckgo.com/l/?uddg=<percent-encoded target>``. When
    the href has that shape, the decoded ``uddg`` value is returned;
    otherwise the href is returned unchanged.
    """
    parsed = urlparse(href)
    is_redirect = (
        parsed.netloc.endswith("duckduckgo.com")
        and parsed.path.rstrip("/") == "/l"
    )
    if is_redirect:
        targets = parse_qs(parsed.query).get("uddg")
        if targets:
            return targets[0]
    return href


def parse_results(html, keywords):
    """Extract keyword-relevant results from a DuckDuckGo HTML results page.

    Args:
        html: Markup of the results page.
        keywords: Iterable of keyword strings. A result is kept when at
            least one keyword occurs (case-insensitively) in its title,
            description, or link.

    Returns:
        A list of dicts with the keys "Link", "Title" and "Description".
        Results that have no link, or whose link contains any entry of the
        module-level ``excluded_urls``, are omitted. Each kept result is also
        printed.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Lower-case the keywords once instead of once per result and field.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    parsed_results = []
    for result in soup.select(".result"):
        anchor = result.select_one(".result__a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # Not a real hit (no title anchor or no href); skip it explicitly
            # rather than relying on a blanket exception handler.
            continue

        # Filter and match on the real target URL: in the redirect wrapper
        # the target is percent-encoded, so a pattern containing "/" (such
        # as "google.com/finance") would never match the raw href.
        link = _resolve_result_link(href)
        if any(excluded_url in link for excluded_url in excluded_urls):
            continue

        snippet = result.select_one(".result__snippet")
        result_data = {
            "Link": link,
            "Title": anchor.text,
            "Description": snippet.text if snippet else "",
        }

        searchable = [value.lower() for value in result_data.values()]
        if any(
            keyword in text
            for keyword in lowered_keywords
            for text in searchable
        ):
            print(result_data)
            parsed_results.append(result_data)
    return parsed_results
# keywords = ["tatasteel", "finance", "news"]
def perform_search(keywords, output_path="results.json"):
    """Search DuckDuckGo for the keywords and save matching results as JSON.

    Args:
        keywords: Iterable of keyword strings, used both to build the query
            and to filter the parsed results.
        output_path: File the results are written to (UTF-8, indented JSON).
            Defaults to "results.json" in the current working directory.

    Returns:
        The list of result dicts that was found. The list is empty when the
        fetch failed or nothing matched; in both cases a message is printed
        and no file is written.
    """
    html = search_duckduckgo(keywords)
    if not html:
        print("Failed to fetch search results.")
        return []

    results = parse_results(html, keywords)
    if not results:
        print("No results found.")
        return results

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    return results
# if __name__ == "__main__":
# perform_search(keywords = keywords)