Update approach_api/utils/news_extraction_api.py
approach_api/utils/news_extraction_api.py
CHANGED
@@ -4,7 +4,7 @@ from bs4 import BeautifulSoup
 # NewsAPI Key
 NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
 
-def extract_news(company, num_articles=2):
+def extract_news(company, num_articles=15):
     """Fetch multiple news articles from NewsAPI and return titles and contents."""
     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
     response = requests.get(url)
@@ -48,62 +48,3 @@ def extract_news(company, num_articles=2):
     return extracted_articles
 
 
-# import requests
-# from bs4 import BeautifulSoup
-
-# # NewsAPI Key
-# NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
-
-# def fetch_articles(company, num_articles=11):
-#     """Fetch multiple news articles from NewsAPI and return their titles and content."""
-#     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
-#     response = requests.get(url)
-
-#     if response.status_code != 200:
-#         print("Error:", response.status_code, response.text)
-#         return []
-
-#     data = response.json()
-#     articles = data.get("articles", [])
-
-#     if not articles:
-#         print("No articles found.")
-#         return []
-
-#     fetched_articles = []
-
-#     for article in articles[:num_articles]:  # Fetch only the required number of articles
-#         article_url = article.get("url")
-#         if not article_url:
-#             continue
-
-#         # Scrape the article for title and content
-#         try:
-#             article_response = requests.get(article_url, timeout=5)  # Removed headers
-#             if article_response.status_code == 200:
-#                 soup = BeautifulSoup(article_response.content, 'html.parser')
-#                 title = soup.title.string if soup.title else "No Title Found"
-
-#                 # Extract paragraphs and clean the content
-#                 paragraphs = soup.find_all('p')
-#                 content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
-
-#                 # Remove unwanted text patterns
-#                 unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
-#                 for pattern in unwanted_patterns:
-#                     content = content.replace(pattern, "")
-
-#                 # Clean up extra spaces
-#                 content = ' '.join(content.split())
-
-#                 # Store the article's title and content
-#                 fetched_articles.append({"title": title, "content": content})
-#         except requests.exceptions.RequestException as e:
-#             print(f"Error fetching article: {e}")
-
-#     return fetched_articles
-
-# if __name__ == "__main__":
-#     company = input("Enter the company name for analysis: ").strip()
-#     articles = fetch_articles(company, num_articles=11)
-#     print(articles)
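For reference, a minimal usage sketch for the updated function. This is hypothetical, not part of the commit: it assumes the module is importable as approach_api.utils.news_extraction_api and that extract_news returns a list of {"title": ..., "content": ...} dicts, as the removed fetch_articles variant did; "Tesla" is only an illustrative query.

    # Hypothetical usage sketch for the updated extract_news.
    from approach_api.utils.news_extraction_api import extract_news

    # The default changed from 2 to 15; num_articles can still be overridden.
    articles = extract_news("Tesla", num_articles=15)
    for article in articles:
        # Assumed return shape: {"title": ..., "content": ...}
        print(article["title"])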