first_commit
- .gitignore +43 -0
- approach_api/api/api.py +18 -0
- approach_api/main.py +198 -0
- approach_api/utils/comparative_analysis.py +121 -0
- approach_api/utils/news_extraction_api.py +109 -0
- approach_api/utils/news_sentiment.py +54 -0
- approach_api/utils/news_summarisation.py +25 -0
- approach_api/utils/text_to_speech.py +20 -0
- approach_api/utils/topic_extraction.py +64 -0
- approach_library/api/api.py +18 -0
- approach_library/app.py +91 -0
- approach_library/main.py +145 -0
- approach_library/utils/comparative_analysis.py +90 -0
- approach_library/utils/news_extraction.py +51 -0
- approach_library/utils/news_sentiment.py +54 -0
- approach_library/utils/news_summarisation.py +26 -0
- approach_library/utils/text_to_speech.py +20 -0
- approach_library/utils/topic_extraction.py +64 -0
- requirements.txt +26 -0
.gitignore
ADDED
@@ -0,0 +1,43 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
myenv/

# IDE
.idea/
.vscode/
*.swp
*.swo

# Environment files
.env
.env.local
.env.development.local
.env.test.local
.env.production.local

# Logs
*.log
logs/

approach_api/api/api.py
ADDED
@@ -0,0 +1,18 @@
from fastapi import FastAPI
from pydantic import BaseModel
from main import analyze_company_news

app = FastAPI()

class CompanyRequest(BaseModel):
    Company_Name: str

@app.post("/api/company")
async def handle_company(request: CompanyRequest):
    company = request.Company_Name.strip()
    result = analyze_company_news(company)
    return result

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=8000)

approach_api/main.py
ADDED
@@ -0,0 +1,198 @@
# import json
# import time
# from utils.news_extraction_api import fetch_articles
# from utils.news_summarisation import summarize_text
# from utils.news_sentiment import analyze_sentiment
# from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
# from utils.comparative_analysis import comparative_sentiment_analysis
# from utils.text_to_speech import text_to_speech

# def main():
#     company = input("Enter the company name for analysis: ").strip()

#     # Extract news articles
#     start_time = time.time()
#     articles = fetch_articles(company, num_articles=2)  # Fetch 2 articles
#     extraction_time = time.time() - start_time
#     print(f"✅ Articles extracted in {extraction_time:.2f} seconds")

#     if not articles:
#         print("⚠️ No news articles found. Try a different company.")
#         return

#     articles_data = []
#     all_topics = []  # Collect all topics for better analysis

#     for article in articles:
#         text = article.get("content", "").strip()

#         if not text:
#             print(f"⚠️ Skipping article '{article.get('title', 'No Title')}' due to missing content.")
#             continue

#         # Perform sentiment analysis
#         start_time = time.time()
#         sentiment_result = analyze_sentiment([text])
#         sentiment = sentiment_result.get("Predicted Sentiment", ["Unknown"])[0]
#         sentiment_time = time.time() - start_time
#         print(f"✅ Sentiment analysis completed in {sentiment_time:.2f} seconds")

#         # Summarize the article
#         start_time = time.time()
#         summary = summarize_text(text)
#         summary_time = time.time() - start_time
#         print(f"✅ Summary generation completed in {summary_time:.2f} seconds")

#         # Extract topics
#         start_time = time.time()
#         preprocessed_text = preprocess_text([text])
#         if not preprocessed_text:
#             print(f"⚠️ No meaningful text extracted for LDA topic modeling in '{article.get('title', 'No Title')}'.")
#             topic_words = []
#         else:
#             lda_model, dictionary = train_lda(preprocessed_text)
#             topic_words = extract_topic_words(lda_model)
#         topic_time = time.time() - start_time
#         print(f"✅ Topic extraction completed in {topic_time:.2f} seconds")

#         # Store processed data
#         articles_data.append({
#             "Title": article.get("title", "No Title"),
#             "Summary": summary,
#             "Sentiment": sentiment,
#             "Topics": topic_words if topic_words else []
#         })

#         # Collect topics for comparative analysis
#         if topic_words:
#             all_topics.extend(topic_words)

#     # Ensure articles_data is not empty before analysis
#     if not articles_data:
#         print("⚠️ No valid articles with content were processed.")
#         return

#     # Perform comparative sentiment analysis
#     start_time = time.time()
#     analysis_result = comparative_sentiment_analysis(company, articles_data)
#     analysis_time = time.time() - start_time
#     print(f"✅ Comparative sentiment analysis completed in {analysis_time:.2f} seconds")

#     # Correctly extract "Comparative Sentiment Score"
#     comparative_score = analysis_result.get("Comparative Sentiment Score", {})

#     sentiment_distribution = comparative_score.get("Sentiment Distribution", {})
#     coverage_differences = comparative_score.get("Coverage Differences", {})
#     topic_overlap = comparative_score.get("Topic Overlap", [])

#     # Debugging check
#     if not sentiment_distribution:
#         print("⚠️ No sentiment distribution detected.")
#     if not coverage_differences:
#         print("⚠️ No coverage differences found.")
#     if not topic_overlap:
#         print("⚠️ No topic overlap detected among articles.")

#     # Final sentiment summary
#     final_sentiment_analysis = analysis_result.get("Final Sentiment Analysis", "Analysis could not be completed.")

#     # Generate summary speech
#     start_time = time.time()
#     final_summary = f"{company}’s latest news coverage is mostly {final_sentiment_analysis}."
#     audio_file = text_to_speech(final_summary)
#     audio_time = time.time() - start_time
#     print(f"✅ Summary speech generation completed in {audio_time:.2f} seconds")

#     # Construct final JSON output
#     output = {
#         "Company": company,
#         "Articles": articles_data,
#         "Comparative Sentiment Score": {
#             "Sentiment Distribution": sentiment_distribution,
#             "Coverage Differences": coverage_differences,
#             "Topic Overlap": topic_overlap
#         },
#         "Extracted Topics": list(set(all_topics)),  # Unique topics across articles
#         "Final Sentiment Analysis": final_summary,
#         "Audio": f"[Play {audio_file}]"
#     }

#     # Print JSON output
#     print(json.dumps(output, indent=4, ensure_ascii=False))

#     # Save JSON output
#     with open(f"{company}_news_analysis.json", "w", encoding="utf-8") as json_file:
#         json.dump(output, json_file, indent=4, ensure_ascii=False)

# if __name__ == "__main__":
#     main()


import json
import time
from utils.news_extraction_api import extract_news
from utils.news_summarisation import summarize_text
from utils.news_sentiment import analyze_sentiment
from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
from utils.comparative_analysis import comparative_sentiment_analysis
from utils.text_to_speech import text_to_speech

def analyze_company_news(company):
    # Extract news articles
    start_time = time.time()
    articles = extract_news(company)
    extraction_time = time.time() - start_time

    if not articles:
        return {"message": "No news articles found. Try a different company."}

    articles_data = []  # List to store processed articles

    # Extract texts from articles for sentiment analysis
    texts = [article["content"] for article in articles]

    # Perform sentiment analysis
    start_time = time.time()
    sentiment_results = analyze_sentiment(texts)
    sentiment_time = time.time() - start_time

    # Process each article
    for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
        start_time = time.time()
        summary = summarize_text(article["content"])  # Summarize article
        summarization_time = time.time() - start_time

        # Extract topics for the specific article
        preprocessed_text = preprocess_text([article["content"]])
        lda_model, dictionary = train_lda(preprocessed_text)
        topic_words = extract_topic_words(lda_model)

        article_entry = {
            "Title": article["title"],
            "Summary": summary,
            "Sentiment": sentiment,
            "Topics": topic_words
        }
        articles_data.append(article_entry)

    # Perform comparative sentiment analysis
    analysis_result = comparative_sentiment_analysis(company, articles_data)

    # Generate a summary speech for the entire report
    final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
    audio_file = text_to_speech(final_summary)  # Generate TTS

    # Construct final JSON output
    output = {
        "Company": company,
        "Articles": articles_data,
        "Comparative Sentiment Score": analysis_result,
        "Audio": f"[Play {audio_file}]"  # Include a playable reference
    }

    return output

if __name__ == "__main__":
    company = input("Enter the company name for analysis: ").strip()
    result = analyze_company_news(company)
    print(json.dumps(result, indent=4, ensure_ascii=False))

approach_api/utils/comparative_analysis.py
ADDED
@@ -0,0 +1,121 @@

import json
from collections import Counter

def comparative_sentiment_analysis(company, articles, max_comparisons=10, chunk_size=5):
    """
    Perform a comparative sentiment analysis on multiple articles.
    """
    overall_sentiment_counts = Counter()
    overall_coverage_differences = []
    all_topics = []

    if not articles:
        print("No articles found for analysis.")
        return {
            "Company": company,
            "Articles": [],
            "Comparative Sentiment Score": {
                "Sentiment Distribution": {},
                "Coverage Differences": [],
                "Topic Overlap": {"Common Topics": [], "Unique Topics Per Article": []}
            },
            "Final Sentiment Analysis": "No data available."
        }

    # Process articles in chunks
    for start in range(0, len(articles), chunk_size):
        chunk = articles[start:start + chunk_size]

        # Count sentiment distribution
        sentiment_counts = Counter(article["Sentiment"] for article in chunk)
        overall_sentiment_counts.update(sentiment_counts)

        # Identify coverage differences
        for i in range(len(chunk) - 1):
            for j in range(i + 1, len(chunk)):
                if len(overall_coverage_differences) >= max_comparisons:
                    break
                article1, article2 = chunk[i], chunk[j]
                comparison = {
                    "Comparison": f"'{article1.get('Title', 'Article 1')}' vs '{article2.get('Title', 'Article 2')}'",
                    "Impact": f"{article1.get('Topics', [])} vs {article2.get('Topics', [])}"
                }
                overall_coverage_differences.append(comparison)

        # Extract topics ensuring valid lists
        topics = [set(article.get("Topics", [])) for article in chunk if isinstance(article.get("Topics", []), list) and article.get("Topics", [])]
        all_topics.extend(topics)

    # Debugging Output
    print("All Topics Extracted:", all_topics)

    # Determine common and unique topics
    if len(all_topics) == 0:
        common_topics = set()  # No topics found
    elif len(all_topics) == 1:
        common_topics = all_topics[0]  # Only one article, take its topics as common
    else:
        common_topics = set.intersection(*all_topics)  # Find intersection normally

    unique_topics = [{"Article": i + 1, "Unique Topics": list(topics - common_topics)}
                     for i, topics in enumerate(all_topics)]

    # Convert to list for JSON output
    common_topics = list(common_topics)

    print("Common Topics:", common_topics)

    # Final sentiment summary
    final_analysis = "The news coverage is mostly "
    if overall_sentiment_counts["positive"] > overall_sentiment_counts["negative"]:
        final_analysis += "positive, indicating potential growth."
    elif overall_sentiment_counts["negative"] > overall_sentiment_counts["positive"]:
        final_analysis += "negative, suggesting challenges ahead."
    else:
        final_analysis += "balanced, with mixed reactions."

    # Final JSON structure
    return {
        "Comparative Sentiment Score": {
            "Sentiment Distribution": dict(overall_sentiment_counts),
            "Coverage Differences": overall_coverage_differences,
            "Topic Overlap": {
                "Common Topics": common_topics,
                "Unique Topics Per Article": unique_topics
            }
        },
        "Final Sentiment Analysis": final_analysis
    }

# if __name__ == "__main__":
#     articles = [
#         {
#             "Title": "Agentic AI startup AMT aims to be 'Google Adwords for influencers,' raises seed round",
#             "Summary": "Agentic Marketing Technologies (AMT) has raised $3.5 million in a seed funding round led by San Francisco-based VC NFX .<n>AMT works by getting its AI agent, dubbed Lyra, to talk to influencers using natural language .<n>The company claims Lyra can also autonomously find influencers that match a campaign’s goals .",
#             "Sentiment": "neutral",
#             "Topics": [
#                 "influencer",
#                 "marketing"
#             ]
#         },
#         {
#             "Title": "Google Seals $32 Billion Deal for Cyber Start-Up Wiz",
#             "Summary": "Google agreed to buy Wiz, a fast-growing cybersecurity start-up, for $32 billion .<n>The all-cash deal would be Google's largest, easily surpassing its $12.5 billion purchase of Motorola Mobility in 2012 .<n>In July, Wiz rejected Google’s $23 billion takeover offer, saying it wanted to pursue an initial public offering .",
#             "Sentiment": "neutral",
#             "Topics": [
#                 "wiz",
#                 "google"
#             ]
#         },
#         {
#             "Title": "Google's new Severance Easter egg is one only innies will understand",
#             "Summary": "Just search for Severance and Google will pepper your screen with blue balloons .<n>Severance producer and frequent director Ben Stiller shared his show’s new Easter egg on X last night .<n>Severance’s season two finale airs this Friday on Apple TV Plus .",
#             "Sentiment": "positive",
#             "Topics": [
#                 "severance"
#             ]
#         }
#     ]
#     result = comparative_sentiment_analysis(articles)
#     print(json.dumps(result, indent=4))

approach_api/utils/news_extraction_api.py
ADDED
@@ -0,0 +1,109 @@
import requests
from bs4 import BeautifulSoup

# NewsAPI Key
NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"

def extract_news(company, num_articles=2):
    """Fetch multiple news articles from NewsAPI and return titles and contents."""
    url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
    response = requests.get(url)

    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return []

    data = response.json()
    articles = data.get("articles", [])

    if not articles:
        print("No articles found.")
        return []

    extracted_articles = []

    for article in articles[:num_articles]:  # Get the required number of articles
        article_url = article.get("url", "No URL available.")

        # Scrape the article for title and content
        article_response = requests.get(article_url)
        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            title = soup.title.string if soup.title else "No Title Found"

            # Extract paragraphs and clean the content
            paragraphs = soup.find_all('p')
            content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())

            # Optionally, filter out unwanted text patterns
            unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
            for pattern in unwanted_patterns:
                content = content.replace(pattern, "")

            # Clean up extra spaces
            content = ' '.join(content.split())

            extracted_articles.append({"title": title, "content": content})

    return extracted_articles


# import requests
# from bs4 import BeautifulSoup

# # NewsAPI Key
# NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"

# def fetch_articles(company, num_articles=11):
#     """Fetch multiple news articles from NewsAPI and return their titles and content."""
#     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
#     response = requests.get(url)

#     if response.status_code != 200:
#         print("Error:", response.status_code, response.text)
#         return []

#     data = response.json()
#     articles = data.get("articles", [])

#     if not articles:
#         print("No articles found.")
#         return []

#     fetched_articles = []

#     for article in articles[:num_articles]:  # Fetch only the required number of articles
#         article_url = article.get("url")
#         if not article_url:
#             continue

#         # Scrape the article for title and content
#         try:
#             article_response = requests.get(article_url, timeout=5)  # Removed headers
#             if article_response.status_code == 200:
#                 soup = BeautifulSoup(article_response.content, 'html.parser')
#                 title = soup.title.string if soup.title else "No Title Found"

#                 # Extract paragraphs and clean the content
#                 paragraphs = soup.find_all('p')
#                 content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())

#                 # Remove unwanted text patterns
#                 unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
#                 for pattern in unwanted_patterns:
#                     content = content.replace(pattern, "")

#                 # Clean up extra spaces
#                 content = ' '.join(content.split())

#                 # Store the article's title and content
#                 fetched_articles.append({"title": title, "content": content})
#         except requests.exceptions.RequestException as e:
#             print(f"Error fetching article: {e}")

#     return fetched_articles

# if __name__ == "__main__":
#     company = input("Enter the company name for analysis: ").strip()
#     articles = fetch_articles(company, num_articles=11)
#     print(articles)

approach_api/utils/news_sentiment.py
ADDED
@@ -0,0 +1,54 @@
import torch
import scipy.special
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the sentiment model and tokenizer (Twitter-RoBERTa checkpoint; variable names still say "finbert")
finbert_ckpt = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(finbert_ckpt)
model_finbert = AutoModelForSequenceClassification.from_pretrained(finbert_ckpt).to("cuda" if torch.cuda.is_available() else "cpu")

def analyze_sentiment(text_list):
    """Performs sentiment analysis on a list of texts using the Twitter-RoBERTa sentiment model."""
    preds = []
    preds_proba = []

    tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}

    for text in text_list:
        with torch.no_grad():
            # Tokenize the input
            input_sequence = tokenizer(text, return_tensors="pt", **tokenizer_kwargs).to(model_finbert.device)
            logits = model_finbert(**input_sequence).logits.cpu().numpy().squeeze()

            # Convert logits to probabilities
            scores = {
                k: v for k, v in zip(
                    model_finbert.config.id2label.values(),
                    scipy.special.softmax(logits)
                )
            }

            # Get the most probable sentiment
            sentiment = max(scores, key=scores.get)
            probability = max(scores.values())

            # Map the sentiment labels
            if sentiment == 'LABEL_2':
                sentiment = 'positive'
            elif sentiment == 'LABEL_0':
                sentiment = 'negative'
            else:
                sentiment = 'neutral'

            preds.append(sentiment)
            preds_proba.append(probability)

    # Return a DataFrame with results
    df_results = pd.DataFrame({
        "Text": text_list,
        "Predicted Sentiment": preds,
        "Probability": preds_proba
    })

    return df_results

approach_api/utils/news_summarisation.py
ADDED
@@ -0,0 +1,25 @@
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Summarization Device: {device}")

model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

def summarize_text(text: str) -> str:
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    ).to(device)
    try:
        summary_ids = model_pegasus.generate(input_ids, max_length=130, min_length=30, do_sample=False)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    except RuntimeError as e:
        print(f"Summarization Error: {e}")
        return "Error: Could not generate summary due to length constraints."

approach_api/utils/text_to_speech.py
ADDED
@@ -0,0 +1,20 @@
from gtts import gTTS
from deep_translator import GoogleTranslator

def text_to_speech(text):
    """Translates the English text to Hindi and converts it to Hindi speech using gTTS (cloud-based TTS)."""

    # ✅ Translate English to Hindi
    translated_text = GoogleTranslator(source="en", target="hi").translate(text)

    # ✅ Hindi Voice (Using gTTS)
    hindi_tts = gTTS(text=translated_text, lang="hi")
    hindi_file = "output_hindi.mp3"
    hindi_tts.save(hindi_file)

    return hindi_file

# if __name__ == "__main__":
#     text = input("Enter text: ")
#     hindi_file = text_to_speech(text)
#     print(f"Hindi audio saved to: {hindi_file}")

approach_api/utils/topic_extraction.py
ADDED
@@ -0,0 +1,64 @@
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")

def preprocess_text(text_data):
    """
    Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.

    :param text_data: List of raw text documents
    :return: List of preprocessed tokenized texts
    """
    stop_words = set(stopwords.words("english"))
    processed_texts = [
        [
            word for word in word_tokenize(document.lower())
            if word not in stop_words and word not in string.punctuation and word.isalpha()
        ]
        for document in text_data
    ]
    return processed_texts

def train_lda(texts, num_topics=3):
    """
    Trains an LDA model on the given preprocessed text data.

    :param texts: List of tokenized texts
    :param num_topics: Number of topics for the LDA model
    :return: Trained LDA model and corresponding dictionary
    """
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

    return ldamodel, dictionary

def extract_topic_words(ldamodel, num_topics=3, num_words=3):
    """
    Extracts meaningful words from each topic identified by the LDA model.

    :param ldamodel: Trained LDA model
    :param num_topics: Number of topics to extract
    :param num_words: Number of words per topic to consider
    :return: List of top words representing each topic
    """
    topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    topic_names = []

    for topic in topics:
        words = topic[1].split(" + ")
        for word_data in words:
            word = word_data.split("*")[1].strip('"')  # Extract word
            if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
                topic_names.append(word)
                break  # Only take the top valid word

    return list(set(topic_names))  # Ensure unique topics

approach_library/api/api.py
ADDED
@@ -0,0 +1,18 @@
from fastapi import FastAPI
from pydantic import BaseModel
from main import analyze_company_news  # Import the function from main.py

app = FastAPI()

class CompanyRequest(BaseModel):
    Company_Name: str

@app.post("/api/company")
async def handle_company(request: CompanyRequest):
    company = request.Company_Name.strip()
    result = analyze_company_news(company)
    return result

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=8000)

approach_library/app.py
ADDED
@@ -0,0 +1,91 @@
import streamlit as st
import json
import time
from utils.news_extraction import extract_news
from utils.news_summarisation import summarize_text
from utils.news_sentiment import analyze_sentiment
from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
from utils.comparative_analysis import comparative_sentiment_analysis
from utils.text_to_speech import text_to_speech
import os

def analyze_company_news(company):
    st.write(f"Analyzing company: {company}")

    with st.spinner("Fetching news articles..."):
        articles = extract_news(company)
        if not articles:
            st.error("No news articles found. Try a different company.")
            return None
        st.write(f"Found {len(articles)} articles")

    articles_data = []
    texts = [article["text"] for article in articles]

    with st.spinner("Performing sentiment analysis..."):
        sentiment_results = analyze_sentiment(texts)
        st.write(f"Sentiment analysis completed for {len(sentiment_results['Predicted Sentiment'])} articles")

    for article, sentiment in zip(articles, sentiment_results["Predicted Sentiment"]):
        summary = summarize_text(article["text"])
        preprocessed_text = preprocess_text([article["text"]])
        lda_model, dictionary = train_lda(preprocessed_text)
        topic_words = extract_topic_words(lda_model)

        articles_data.append({
            "Title": article["title"],
            "Summary": summary,
            "Sentiment": sentiment,
            "Topics": topic_words
        })

    with st.spinner("Performing comparative analysis..."):
        analysis_result = comparative_sentiment_analysis(company, articles_data)
        st.write("Comparative analysis completed")
        st.write("Analysis result:", analysis_result)

    final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."

    with st.spinner("Generating Hindi TTS summary..."):
        try:
            audio_file = text_to_speech(final_summary)
            if os.path.exists(audio_file):
                st.write(f"TTS summary generated: {audio_file}")
            else:
                st.error("Failed to generate TTS summary")
                audio_file = None
        except Exception as e:
            st.error(f"TTS generation failed: {str(e)}")
            audio_file = None

    return {
        "Company": company,
        "Articles": articles_data,
        "Comparative Sentiment Score": analysis_result,
        "Audio": audio_file
    }

st.title("Company News Analysis")
company = st.text_input("Enter the company name for analysis:")
if st.button("Analyze") and company:
    st.write(f"Starting analysis for: {company}")
    result = analyze_company_news(company)
    if result:
        st.subheader(f"Analysis for {result['Company']}")

        for article in result["Articles"]:
            st.write(f"**Title:** {article['Title']}")
            st.write(f"**Summary:** {article['Summary']}")
            st.write(f"**Sentiment:** {article['Sentiment']}")
            st.write(f"**Topics:** {', '.join(article['Topics'])}")
            st.markdown("---")

        st.subheader("Comparative Sentiment Score")
        st.json(result["Comparative Sentiment Score"])

        st.subheader("Hindi TTS Summary")
        if result["Audio"]:
            st.audio(result["Audio"], format="audio/mp3")
        else:
            st.warning("TTS summary not available")

approach_library/main.py
ADDED
@@ -0,0 +1,145 @@
# import json
# import time
# from news_extraction import extract_news
# from news_summarisation import summarize_text
# from news_sentiment import analyze_sentiment
# from topic_extraction import preprocess_text, train_lda, extract_topic_words
# from comparative_analysis import comparative_sentiment_analysis
# from text_to_speech import text_to_speech  # ✅ Import the TTS function

# def main():
#     # User input for the company/topic
#     company = input("Enter the company name for analysis: ").strip()

#     # Extract news articles
#     start_time = time.time()
#     articles = extract_news(company)
#     extraction_time = time.time() - start_time

#     if not articles:
#         print("No news articles found. Try a different company.")
#         return

#     articles_data = []  # List to store processed articles

#     # Extract texts from articles for sentiment analysis
#     texts = [article["text"] for article in articles]

#     # Perform sentiment analysis
#     start_time = time.time()
#     sentiment_results = analyze_sentiment(texts)
#     sentiment_time = time.time() - start_time

#     # Process each article
#     for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
#         start_time = time.time()
#         summary = summarize_text(article["text"])  # Summarize article
#         summarization_time = time.time() - start_time

#         # Extract topics for the specific article
#         preprocessed_text = preprocess_text([article["text"]])
#         lda_model, dictionary = train_lda(preprocessed_text)
#         topic_words = extract_topic_words(lda_model)

#         article_entry = {
#             "Title": article["title"],
#             "Summary": summary,
#             "Sentiment": sentiment,
#             "Topics": topic_words
#         }
#         articles_data.append(article_entry)

#     # Perform comparative sentiment analysis
#     analysis_result = comparative_sentiment_analysis(company, articles_data)

#     # ✅ Generate a summary speech for the entire report
#     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
#     audio_file = text_to_speech(final_summary)  # Generate Hindi TTS

#     # ✅ Construct final JSON output
#     output = {
#         "Company": company,
#         "Articles": articles_data,
#         "Comparative Sentiment Score": analysis_result,
#         "Final Sentiment Analysis": final_summary,
#         "Audio": f"[Play {audio_file}]"  # ✅ Include a playable reference
#     }

#     # Print JSON output
#     print(json.dumps(output, indent=4, ensure_ascii=False))

#     # Save JSON output to file
#     with open(f"{company}_news_analysis.json", "w", encoding="utf-8") as json_file:
#         json.dump(output, json_file, indent=4, ensure_ascii=False)

# if __name__ == "__main__":
#     main()

import json
import time
from utils.news_extraction import extract_news
from utils.news_summarisation import summarize_text
from utils.news_sentiment import analyze_sentiment
from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
from utils.comparative_analysis import comparative_sentiment_analysis
from utils.text_to_speech import text_to_speech  # ✅ Import the TTS function

def analyze_company_news(company):
    # Extract news articles
    start_time = time.time()
    articles = extract_news(company)
    extraction_time = time.time() - start_time

    if not articles:
        return {"message": "No news articles found. Try a different company."}

    articles_data = []  # List to store processed articles

    # Extract texts from articles for sentiment analysis
    texts = [article["text"] for article in articles]

    # Perform sentiment analysis
    start_time = time.time()
    sentiment_results = analyze_sentiment(texts)
    sentiment_time = time.time() - start_time

    # Process each article
    for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
        start_time = time.time()
        summary = summarize_text(article["text"])  # Summarize article
        summarization_time = time.time() - start_time

        # Extract topics for the specific article
        preprocessed_text = preprocess_text([article["text"]])
        lda_model, dictionary = train_lda(preprocessed_text)
        topic_words = extract_topic_words(lda_model)

        article_entry = {
            "Title": article["title"],
            "Summary": summary,
            "Sentiment": sentiment,
            "Topics": topic_words
        }
        articles_data.append(article_entry)

    # Perform comparative sentiment analysis
    analysis_result = comparative_sentiment_analysis(company, articles_data)

    # ✅ Generate a summary speech for the entire report
    final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
    audio_file = text_to_speech(final_summary)  # Generate TTS

    # ✅ Construct final JSON output
    output = {
        "Company": company,
        "Articles": articles_data,
        "Comparative Sentiment Score": analysis_result,
        "Audio": f"[Play {audio_file}]"  # ✅ Include a playable reference
    }

    return output

# if __name__ == "__main__":
#     company = input("Enter the company name for analysis: ").strip()
#     result = analyze_company_news(company)
#     print(json.dumps(result, indent=4, ensure_ascii=False))

approach_library/utils/comparative_analysis.py
ADDED
@@ -0,0 +1,90 @@

import json
from collections import Counter

def comparative_sentiment_analysis(company, articles, max_comparisons=10, chunk_size=5):
    """
    Perform a comparative sentiment analysis on multiple articles.
    """
    overall_sentiment_counts = Counter()
    overall_coverage_differences = []
    all_topics = []

    if not articles:
        print("No articles found for analysis.")
        return {
            "Company": company,
            "Articles": [],
            "Comparative Sentiment Score": {
                "Sentiment Distribution": {},
                "Coverage Differences": [],
                "Topic Overlap": {"Common Topics": [], "Unique Topics Per Article": []}
            },
            "Final Sentiment Analysis": "No data available."
        }

    # Process articles in chunks
    for start in range(0, len(articles), chunk_size):
        chunk = articles[start:start + chunk_size]

        # Count sentiment distribution
        sentiment_counts = Counter(article["Sentiment"] for article in chunk)
        overall_sentiment_counts.update(sentiment_counts)

        # Identify coverage differences
        for i in range(len(chunk) - 1):
            for j in range(i + 1, len(chunk)):
                if len(overall_coverage_differences) >= max_comparisons:
                    break
                article1, article2 = chunk[i], chunk[j]
                comparison = {
                    "Comparison": f"'{article1.get('Title', 'Article 1')}' vs '{article2.get('Title', 'Article 2')}'",
                    "Impact": f"{article1.get('Topics', [])} vs {article2.get('Topics', [])}"
                }
                overall_coverage_differences.append(comparison)

        # Extract topics ensuring valid lists
        topics = [set(article.get("Topics", [])) for article in chunk if isinstance(article.get("Topics", []), list) and article.get("Topics", [])]
        all_topics.extend(topics)

    # Debugging Output
    print("All Topics Extracted:", all_topics)

    # Determine common and unique topics
    if len(all_topics) == 0:
        common_topics = set()  # No topics found
    elif len(all_topics) == 1:
        common_topics = all_topics[0]  # Only one article, take its topics as common
    else:
        common_topics = set.intersection(*all_topics)  # Find intersection normally

    unique_topics = [{"Article": i + 1, "Unique Topics": list(topics - common_topics)}
                     for i, topics in enumerate(all_topics)]

    # Convert to list for JSON output
    common_topics = list(common_topics)

    print("Common Topics:", common_topics)

    # Final sentiment summary
    final_analysis = "The news coverage is mostly "
    if overall_sentiment_counts["positive"] > overall_sentiment_counts["negative"]:
        final_analysis += "positive, indicating potential growth."
    elif overall_sentiment_counts["negative"] > overall_sentiment_counts["positive"]:
        final_analysis += "negative, suggesting challenges ahead."
    else:
        final_analysis += "balanced, with mixed reactions."


    return {
        "Comparative Sentiment Score": {
            "Sentiment Distribution": dict(overall_sentiment_counts),
            "Coverage Differences": overall_coverage_differences,
            "Topic Overlap": {
                "Common Topics": common_topics,
                "Unique Topics Per Article": unique_topics
            }
        },
        "Final Sentiment Analysis": final_analysis
    }

approach_library/utils/news_extraction.py
ADDED
@@ -0,0 +1,51 @@
import nest_asyncio
from typing import List, Dict
from duckduckgo_search import DDGS
from phi.tools.newspaper4k import Newspaper4k
import time

nest_asyncio.apply()

def extract_news(article_topic: str, num_search_results: int = 15, max_retries: int = 3) -> List[Dict[str, str]]:
    """
    Extracts full news articles based on the given topic and number of search results.

    Args:
        article_topic: The topic to search for.
        num_search_results: The number of search results to retrieve.
        max_retries: The maximum number of retries if an article fails to scrape.

    Returns:
        A list of dictionaries, where each dictionary represents a news article.
    """
    news_results = []
    ddgs = DDGS()
    newspaper_tools = Newspaper4k()

    results = ddgs.news(keywords=article_topic, max_results=num_search_results)  # Fetch extra results

    for r in results:
        if "url" in r:
            retries = 0
            while retries < max_retries:
                try:
                    article_data = newspaper_tools.get_article_data(r["url"])

                    if article_data and "text" in article_data and len(article_data["text"]) > 100:
                        news_results.append({
                            "title": r.get("title", "No Title"),
                            "text": article_data["text"]  # Full article text
                        })
                        break  # Successful extraction, break retry loop
                    else:
                        retries += 1
                        time.sleep(1)  # Wait before retrying
                except Exception as e:
                    retries += 1
                    time.sleep(1)

        # Stop if we have collected enough articles
        if len(news_results) >= num_search_results:
            break

    return news_results

approach_library/utils/news_sentiment.py
ADDED
@@ -0,0 +1,54 @@
import torch
import scipy.special
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the sentiment model and tokenizer (Twitter-RoBERTa checkpoint; variable names still say "finbert")
finbert_ckpt = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(finbert_ckpt)
model_finbert = AutoModelForSequenceClassification.from_pretrained(finbert_ckpt).to("cuda" if torch.cuda.is_available() else "cpu")

def analyze_sentiment(text_list):
    """Performs sentiment analysis on a list of texts using the Twitter-RoBERTa sentiment model."""
    preds = []
    preds_proba = []

    tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}

    for text in text_list:
        with torch.no_grad():
            # Tokenize the input
            input_sequence = tokenizer(text, return_tensors="pt", **tokenizer_kwargs).to(model_finbert.device)
            logits = model_finbert(**input_sequence).logits.cpu().numpy().squeeze()

            # Convert logits to probabilities
            scores = {
                k: v for k, v in zip(
                    model_finbert.config.id2label.values(),
                    scipy.special.softmax(logits)
                )
            }

            # Get the most probable sentiment
            sentiment = max(scores, key=scores.get)
            probability = max(scores.values())

            # Map the sentiment labels
            if sentiment == 'LABEL_2':
                sentiment = 'positive'
            elif sentiment == 'LABEL_0':
                sentiment = 'negative'
            else:
                sentiment = 'neutral'

            preds.append(sentiment)
            preds_proba.append(probability)

    # Return a DataFrame with results
    df_results = pd.DataFrame({
        "Text": text_list,
        "Predicted Sentiment": preds,
        "Probability": preds_proba
    })

    return df_results

approach_library/utils/news_summarisation.py
ADDED
@@ -0,0 +1,26 @@

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Summarization Device: {device}")

model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

def summarize_text(text: str) -> str:
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    ).to(device)
    try:
        summary_ids = model_pegasus.generate(input_ids, max_length=130, min_length=30, do_sample=False)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    except RuntimeError as e:
        print(f"Summarization Error: {e}")
        return "Error: Could not generate summary due to length constraints."

approach_library/utils/text_to_speech.py
ADDED
@@ -0,0 +1,20 @@
from gtts import gTTS
from deep_translator import GoogleTranslator

def text_to_speech(text):
    """Translates the English text to Hindi and converts it to Hindi speech using gTTS (cloud-based TTS)."""

    # ✅ Translate English to Hindi
    translated_text = GoogleTranslator(source="en", target="hi").translate(text)

    # ✅ Hindi Voice (Using gTTS)
    hindi_tts = gTTS(text=translated_text, lang="hi")
    hindi_file = "output_hindi.mp3"
    hindi_tts.save(hindi_file)

    return hindi_file

# if __name__ == "__main__":
#     text = input("Enter text: ")
#     hindi_file = text_to_speech(text)
#     print(f"Hindi audio saved to: {hindi_file}")

approach_library/utils/topic_extraction.py
ADDED
@@ -0,0 +1,64 @@
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")

def preprocess_text(text_data):
    """
    Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.

    :param text_data: List of raw text documents
    :return: List of preprocessed tokenized texts
    """
    stop_words = set(stopwords.words("english"))
    processed_texts = [
        [
            word for word in word_tokenize(document.lower())
            if word not in stop_words and word not in string.punctuation and word.isalpha()
        ]
        for document in text_data
    ]
    return processed_texts

def train_lda(texts, num_topics=3):
    """
    Trains an LDA model on the given preprocessed text data.

    :param texts: List of tokenized texts
    :param num_topics: Number of topics for the LDA model
    :return: Trained LDA model and corresponding dictionary
    """
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

    return ldamodel, dictionary

def extract_topic_words(ldamodel, num_topics=3, num_words=3):
    """
    Extracts meaningful words from each topic identified by the LDA model.

    :param ldamodel: Trained LDA model
    :param num_topics: Number of topics to extract
    :param num_words: Number of words per topic to consider
    :return: List of top words representing each topic
    """
    topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    topic_names = []

    for topic in topics:
        words = topic[1].split(" + ")
        for word_data in words:
            word = word_data.split("*")[1].strip('"')  # Extract word
            if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
                topic_names.append(word)
                break  # Only take the top valid word

    return list(set(topic_names))  # Ensure unique topics

requirements.txt
ADDED
@@ -0,0 +1,26 @@
nest_asyncio
duckduckgo-search
newspaper4k
transformers
phidata
newspaper4k
lxml_html_clean
duckduckgo_search
transformers
datasets
pandas
nltk
torch
tqdm
GoogleNews
pygooglenews
feedparser
googlesearch-python
soundfile
gtts
deep_translator
fastapi
pydantic
uvicorn
python-magic
streamlit