v1shal committed
Commit b396e94 · 1 Parent(s): 3f6c8e2

first_commit

.gitignore ADDED
@@ -0,0 +1,43 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ env/
+ myenv/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Environment files
+ .env
+ .env.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
+
+ # Logs
+ *.log
+ logs/
approach_api/api/api.py ADDED
@@ -0,0 +1,18 @@
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from main import analyze_company_news
+
+ app = FastAPI()
+
+ class CompanyRequest(BaseModel):
+     Company_Name: str
+
+ @app.post("/api/company")
+ async def handle_company(request: CompanyRequest):
+     company = request.Company_Name.strip()
+     result = analyze_company_news(company)
+     return result
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="127.0.0.1", port=8000)
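A quick way to exercise this endpoint once the app is running is a small client script like the sketch below; "Tesla" is only a placeholder company name, and the host and port mirror the uvicorn call above.

import requests

# POST a company name to the running FastAPI app and print the returned analysis JSON.
response = requests.post(
    "http://127.0.0.1:8000/api/company",
    json={"Company_Name": "Tesla"},  # field name matches the CompanyRequest model
    timeout=300,  # the pipeline loads large models, so allow a generous timeout
)
response.raise_for_status()
print(response.json())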
approach_api/main.py ADDED
@@ -0,0 +1,198 @@
+ # import json
+ # import time
+ # from utils.news_extraction_api import fetch_articles
+ # from utils.news_summarisation import summarize_text
+ # from utils.news_sentiment import analyze_sentiment
+ # from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
+ # from utils.comparative_analysis import comparative_sentiment_analysis
+ # from utils.text_to_speech import text_to_speech
+
+ # def main():
+ #     company = input("Enter the company name for analysis: ").strip()
+
+ #     # Extract news articles
+ #     start_time = time.time()
+ #     articles = fetch_articles(company, num_articles=2)  # Fetch 2 articles
+ #     extraction_time = time.time() - start_time
+ #     print(f"✅ Articles extracted in {extraction_time:.2f} seconds")
+
+ #     if not articles:
+ #         print("⚠️ No news articles found. Try a different company.")
+ #         return
+
+ #     articles_data = []
+ #     all_topics = []  # Collect all topics for better analysis
+
+ #     for article in articles:
+ #         text = article.get("content", "").strip()
+
+ #         if not text:
+ #             print(f"⚠️ Skipping article '{article.get('title', 'No Title')}' due to missing content.")
+ #             continue
+
+ #         # Perform sentiment analysis
+ #         start_time = time.time()
+ #         sentiment_result = analyze_sentiment([text])
+ #         sentiment = sentiment_result.get("Predicted Sentiment", ["Unknown"])[0]
+ #         sentiment_time = time.time() - start_time
+ #         print(f"✅ Sentiment analysis completed in {sentiment_time:.2f} seconds")
+
+ #         # Summarize the article
+ #         start_time = time.time()
+ #         summary = summarize_text(text)
+ #         summary_time = time.time() - start_time
+ #         print(f"✅ Summary generation completed in {summary_time:.2f} seconds")
+
+ #         # Extract topics
+ #         start_time = time.time()
+ #         preprocessed_text = preprocess_text([text])
+ #         if not preprocessed_text:
+ #             print(f"⚠️ No meaningful text extracted for LDA topic modeling in '{article.get('title', 'No Title')}'.")
+ #             topic_words = []
+ #         else:
+ #             lda_model, dictionary = train_lda(preprocessed_text)
+ #             topic_words = extract_topic_words(lda_model)
+ #         topic_time = time.time() - start_time
+ #         print(f"✅ Topic extraction completed in {topic_time:.2f} seconds")
+
+ #         # Store processed data
+ #         articles_data.append({
+ #             "Title": article.get("title", "No Title"),
+ #             "Summary": summary,
+ #             "Sentiment": sentiment,
+ #             "Topics": topic_words if topic_words else []
+ #         })
+
+ #         # Collect topics for comparative analysis
+ #         if topic_words:
+ #             all_topics.extend(topic_words)
+
+ #     # Ensure articles_data is not empty before analysis
+ #     if not articles_data:
+ #         print("⚠️ No valid articles with content were processed.")
+ #         return
+
+ #     # Perform comparative sentiment analysis
+ #     start_time = time.time()
+ #     analysis_result = comparative_sentiment_analysis(company, articles_data)
+ #     analysis_time = time.time() - start_time
+ #     print(f"✅ Comparative sentiment analysis completed in {analysis_time:.2f} seconds")
+
+ #     # Correctly extract "Comparative Sentiment Score"
+ #     comparative_score = analysis_result.get("Comparative Sentiment Score", {})
+
+ #     sentiment_distribution = comparative_score.get("Sentiment Distribution", {})
+ #     coverage_differences = comparative_score.get("Coverage Differences", {})
+ #     topic_overlap = comparative_score.get("Topic Overlap", [])
+
+ #     # Debugging check
+ #     if not sentiment_distribution:
+ #         print("⚠️ No sentiment distribution detected.")
+ #     if not coverage_differences:
+ #         print("⚠️ No coverage differences found.")
+ #     if not topic_overlap:
+ #         print("⚠️ No topic overlap detected among articles.")
+
+ #     # Final sentiment summary
+ #     final_sentiment_analysis = analysis_result.get("Final Sentiment Analysis", "Analysis could not be completed.")
+
+ #     # Generate summary speech
+ #     start_time = time.time()
+ #     final_summary = f"{company}’s latest news coverage is mostly {final_sentiment_analysis}."
+ #     audio_file = text_to_speech(final_summary)
+ #     audio_time = time.time() - start_time
+ #     print(f"✅ Summary speech generation completed in {audio_time:.2f} seconds")
+
+ #     # Construct final JSON output
+ #     output = {
+ #         "Company": company,
+ #         "Articles": articles_data,
+ #         "Comparative Sentiment Score": {
+ #             "Sentiment Distribution": sentiment_distribution,
+ #             "Coverage Differences": coverage_differences,
+ #             "Topic Overlap": topic_overlap
+ #         },
+ #         "Extracted Topics": list(set(all_topics)),  # Unique topics across articles
+ #         "Final Sentiment Analysis": final_summary,
+ #         "Audio": f"[Play {audio_file}]"
+ #     }
+
+ #     # Print JSON output
+ #     print(json.dumps(output, indent=4, ensure_ascii=False))
+
+ #     # Save JSON output
+ #     with open(f"{company}_news_analysis.json", "w", encoding="utf-8") as json_file:
+ #         json.dump(output, json_file, indent=4, ensure_ascii=False)
+
+ # if __name__ == "__main__":
+ #     main()
+
+
+ import json
+ import time
+ from utils.news_extraction_api import extract_news
+ from utils.news_summarisation import summarize_text
+ from utils.news_sentiment import analyze_sentiment
+ from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
+ from utils.comparative_analysis import comparative_sentiment_analysis
+ from utils.text_to_speech import text_to_speech
+
+ def analyze_company_news(company):
+     # Extract news articles
+     start_time = time.time()
+     articles = extract_news(company)
+     extraction_time = time.time() - start_time
+
+     if not articles:
+         return {"message": "No news articles found. Try a different company."}
+
+     articles_data = []  # List to store processed articles
+
+     # Extract texts from articles for sentiment analysis
+     texts = [article["content"] for article in articles]
+
+     # Perform sentiment analysis
+     start_time = time.time()
+     sentiment_results = analyze_sentiment(texts)
+     sentiment_time = time.time() - start_time
+
+     # Process each article
+     for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
+         start_time = time.time()
+         summary = summarize_text(article["content"])  # Summarize article
+         summarization_time = time.time() - start_time
+
+         # Extract topics for the specific article
+         preprocessed_text = preprocess_text([article["content"]])
+         lda_model, dictionary = train_lda(preprocessed_text)
+         topic_words = extract_topic_words(lda_model)
+
+         article_entry = {
+             "Title": article["title"],
+             "Summary": summary,
+             "Sentiment": sentiment,
+             "Topics": topic_words
+         }
+         articles_data.append(article_entry)
+
+     # Perform comparative sentiment analysis
+     analysis_result = comparative_sentiment_analysis(company, articles_data)
+
+     # Generate a summary speech for the entire report
+     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
+     audio_file = text_to_speech(final_summary)  # Generate TTS
+
+     # Construct final JSON output
+     output = {
+         "Company": company,
+         "Articles": articles_data,
+         "Comparative Sentiment Score": analysis_result,
+         "Audio": f"[Play {audio_file}]"  # Include a playable reference
+     }
+
+     return output
+
+ if __name__ == "__main__":
+     company = input("Enter the company name for analysis: ").strip()
+     result = analyze_company_news(company)
+     print(json.dumps(result, indent=4, ensure_ascii=False))
approach_api/utils/comparative_analysis.py ADDED
@@ -0,0 +1,121 @@
+
+ import json
+ from collections import Counter
+
+ def comparative_sentiment_analysis(company, articles, max_comparisons=10, chunk_size=5):
+     """
+     Perform a comparative sentiment analysis on multiple articles.
+     """
+     overall_sentiment_counts = Counter()
+     overall_coverage_differences = []
+     all_topics = []
+
+     if not articles:
+         print("No articles found for analysis.")
+         return {
+             "Company": company,
+             "Articles": [],
+             "Comparative Sentiment Score": {
+                 "Sentiment Distribution": {},
+                 "Coverage Differences": [],
+                 "Topic Overlap": {"Common Topics": [], "Unique Topics Per Article": []}
+             },
+             "Final Sentiment Analysis": "No data available."
+         }
+
+     # Process articles in chunks
+     for start in range(0, len(articles), chunk_size):
+         chunk = articles[start:start + chunk_size]
+
+         # Count sentiment distribution (normalise labels so keys match the "Positive"/"Negative" checks below)
+         sentiment_counts = Counter(article["Sentiment"].capitalize() for article in chunk)
+         overall_sentiment_counts.update(sentiment_counts)
+
+         # Identify coverage differences
+         for i in range(len(chunk) - 1):
+             for j in range(i + 1, len(chunk)):
+                 if len(overall_coverage_differences) >= max_comparisons:
+                     break
+                 article1, article2 = chunk[i], chunk[j]
+                 comparison = {
+                     "Comparison": f"'{article1.get('Title', 'Article 1')}' vs '{article2.get('Title', 'Article 2')}'",
+                     "Impact": f"{article1.get('Topics', [])} vs {article2.get('Topics', [])}"
+                 }
+                 overall_coverage_differences.append(comparison)
+
+         # Extract topics, keeping only articles with a valid, non-empty list
+         topics = [set(article.get("Topics", [])) for article in chunk if isinstance(article.get("Topics"), list) and article.get("Topics")]
+         all_topics.extend(topics)
+
+     # Debugging output
+     print("All Topics Extracted:", all_topics)
+
+     # Determine common and unique topics
+     if len(all_topics) == 0:
+         common_topics = set()  # No topics found
+     elif len(all_topics) == 1:
+         common_topics = all_topics[0]  # Only one article, take its topics as common
+     else:
+         common_topics = set.intersection(*all_topics)  # Find the intersection normally
+
+     unique_topics = [{"Article": i + 1, "Unique Topics": list(topics - common_topics)}
+                      for i, topics in enumerate(all_topics)]
+
+     # Convert to list for JSON output
+     common_topics = list(common_topics)
+
+     print("Common Topics:", common_topics)
+
+     # Final sentiment summary
+     final_analysis = "The news coverage is mostly "
+     if overall_sentiment_counts["Positive"] > overall_sentiment_counts["Negative"]:
+         final_analysis += "positive, indicating potential growth."
+     elif overall_sentiment_counts["Negative"] > overall_sentiment_counts["Positive"]:
+         final_analysis += "negative, suggesting challenges ahead."
+     else:
+         final_analysis += "balanced, with mixed reactions."
+
+     # Final JSON structure
+     return {
+         "Comparative Sentiment Score": {
+             "Sentiment Distribution": dict(overall_sentiment_counts),
+             "Coverage Differences": overall_coverage_differences,
+             "Topic Overlap": {
+                 "Common Topics": common_topics,
+                 "Unique Topics Per Article": unique_topics
+             }
+         },
+         "Final Sentiment Analysis": final_analysis
+     }
+
+ # if __name__ == "__main__":
+ #     articles = [
+ #         {
+ #             "Title": "Agentic AI startup AMT aims to be 'Google Adwords for influencers,' raises seed round",
+ #             "Summary": "Agentic Marketing Technologies (AMT) has raised $3.5 million in a seed funding round led by San Francisco-based VC NFX .<n>AMT works by getting its AI agent, dubbed Lyra, to talk to influencers using natural language .<n>The company claims Lyra can also autonomously find influencers that match a campaign’s goals .",
+ #             "Sentiment": "neutral",
+ #             "Topics": [
+ #                 "influencer",
+ #                 "marketing"
+ #             ]
+ #         },
+ #         {
+ #             "Title": "Google Seals $32 Billion Deal for Cyber Start-Up Wiz",
+ #             "Summary": "Google agreed to buy Wiz, a fast-growing cybersecurity start-up, for $32 billion .<n>The all-cash deal would be Google's largest, easily surpassing its $12.5 billion purchase of Motorola Mobility in 2012 .<n>In July, Wiz rejected Google’s $23 billion takeover offer, saying it wanted to pursue an initial public offering .",
+ #             "Sentiment": "neutral",
+ #             "Topics": [
+ #                 "wiz",
+ #                 "google"
+ #             ]
+ #         },
+ #         {
+ #             "Title": "Google's new Severance Easter egg is one only innies will understand",
+ #             "Summary": "Just search for Severance and Google will pepper your screen with blue balloons .<n>Severance producer and frequent director Ben Stiller shared his show’s new Easter egg on X last night .<n>Severance’s season two finale airs this Friday on Apple TV Plus .",
+ #             "Sentiment": "positive",
+ #             "Topics": [
+ #                 "severance"
+ #             ]
+ #         }
+ #     ]
+ #     result = comparative_sentiment_analysis(articles)
+ #     print(json.dumps(result, indent=4))
approach_api/utils/news_extraction_api.py ADDED
@@ -0,0 +1,109 @@
+ import requests
+ from bs4 import BeautifulSoup
+
+ # NewsAPI key (ideally loaded from an environment variable rather than hard-coded)
+ NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
+
+ def extract_news(company, num_articles=2):
+     """Fetch multiple news articles from NewsAPI and return titles and contents."""
+     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
+     response = requests.get(url, timeout=10)
+
+     if response.status_code != 200:
+         print("Error:", response.status_code, response.text)
+         return []
+
+     data = response.json()
+     articles = data.get("articles", [])
+
+     if not articles:
+         print("No articles found.")
+         return []
+
+     extracted_articles = []
+
+     for article in articles[:num_articles]:  # Get the required number of articles
+         article_url = article.get("url", "No URL available.")
+
+         # Scrape the article for title and content
+         article_response = requests.get(article_url, timeout=10)
+         if article_response.status_code == 200:
+             soup = BeautifulSoup(article_response.content, 'html.parser')
+             title = soup.title.string if soup.title else "No Title Found"
+
+             # Extract paragraphs and clean the content
+             paragraphs = soup.find_all('p')
+             content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
+
+             # Optionally, filter out unwanted text patterns
+             unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
+             for pattern in unwanted_patterns:
+                 content = content.replace(pattern, "")
+
+             # Clean up extra spaces
+             content = ' '.join(content.split())
+
+             extracted_articles.append({"title": title, "content": content})
+
+     return extracted_articles
+
+
+ # import requests
+ # from bs4 import BeautifulSoup
+
+ # # NewsAPI Key
+ # NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
+
+ # def fetch_articles(company, num_articles=11):
+ #     """Fetch multiple news articles from NewsAPI and return their titles and content."""
+ #     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
+ #     response = requests.get(url)
+
+ #     if response.status_code != 200:
+ #         print("Error:", response.status_code, response.text)
+ #         return []
+
+ #     data = response.json()
+ #     articles = data.get("articles", [])
+
+ #     if not articles:
+ #         print("No articles found.")
+ #         return []
+
+ #     fetched_articles = []
+
+ #     for article in articles[:num_articles]:  # Fetch only the required number of articles
+ #         article_url = article.get("url")
+ #         if not article_url:
+ #             continue
+
+ #         # Scrape the article for title and content
+ #         try:
+ #             article_response = requests.get(article_url, timeout=5)  # Removed headers
+ #             if article_response.status_code == 200:
+ #                 soup = BeautifulSoup(article_response.content, 'html.parser')
+ #                 title = soup.title.string if soup.title else "No Title Found"
+
+ #                 # Extract paragraphs and clean the content
+ #                 paragraphs = soup.find_all('p')
+ #                 content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
+
+ #                 # Remove unwanted text patterns
+ #                 unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
+ #                 for pattern in unwanted_patterns:
+ #                     content = content.replace(pattern, "")
+
+ #                 # Clean up extra spaces
+ #                 content = ' '.join(content.split())
+
+ #                 # Store the article's title and content
+ #                 fetched_articles.append({"title": title, "content": content})
+ #         except requests.exceptions.RequestException as e:
+ #             print(f"Error fetching article: {e}")
+
+ #     return fetched_articles
+
+ # if __name__ == "__main__":
+ #     company = input("Enter the company name for analysis: ").strip()
+ #     articles = fetch_articles(company, num_articles=11)
+ #     print(articles)
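For reference, a minimal sketch of how extract_news is typically driven (an assumption based on the call in approach_api/main.py; "Microsoft" is only an example query, and a valid NewsAPI key plus network access are required):

from utils.news_extraction_api import extract_news

# Fetch two articles about an example company and inspect what came back.
articles = extract_news("Microsoft", num_articles=2)
for article in articles:
    print(article["title"])
    print(article["content"][:200], "...")  # first 200 characters of the scraped body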
approach_api/utils/news_sentiment.py ADDED
@@ -0,0 +1,54 @@
+ import torch
+ import scipy.special
+ import pandas as pd
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ # Load the sentiment model and tokenizer (Twitter-RoBERTa base sentiment checkpoint)
+ finbert_ckpt = "cardiffnlp/twitter-roberta-base-sentiment"
+ tokenizer = AutoTokenizer.from_pretrained(finbert_ckpt)
+ model_finbert = AutoModelForSequenceClassification.from_pretrained(finbert_ckpt).to("cuda" if torch.cuda.is_available() else "cpu")
+
+ def analyze_sentiment(text_list):
+     """Performs sentiment analysis on a list of texts using the Twitter-RoBERTa sentiment model."""
+     preds = []
+     preds_proba = []
+
+     tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}
+
+     for text in text_list:
+         with torch.no_grad():
+             # Tokenize the input
+             input_sequence = tokenizer(text, return_tensors="pt", **tokenizer_kwargs).to(model_finbert.device)
+             logits = model_finbert(**input_sequence).logits.cpu().numpy().squeeze()
+
+         # Convert logits to probabilities
+         scores = {
+             k: v for k, v in zip(
+                 model_finbert.config.id2label.values(),
+                 scipy.special.softmax(logits)
+             )
+         }
+
+         # Get the most probable sentiment
+         sentiment = max(scores, key=scores.get)
+         probability = max(scores.values())
+
+         # Map the model's label ids to readable sentiment names
+         if sentiment == 'LABEL_2':
+             sentiment = 'positive'
+         elif sentiment == 'LABEL_0':
+             sentiment = 'negative'
+         else:
+             sentiment = 'neutral'
+
+         preds.append(sentiment)
+         preds_proba.append(probability)
+
+     # Return a DataFrame with results
+     df_results = pd.DataFrame({
+         "Text": text_list,
+         "Predicted Sentiment": preds,
+         "Probability": preds_proba
+     })
+
+     return df_results
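A small usage sketch for analyze_sentiment; the sample headlines are made up, and the first call downloads the cardiffnlp checkpoint from the Hugging Face Hub:

from utils.news_sentiment import analyze_sentiment

# Two toy headlines; the function returns a pandas DataFrame with one row per input text.
df = analyze_sentiment([
    "The company reported record profits this quarter.",
    "Regulators opened an investigation into the firm.",
])
print(df[["Predicted Sentiment", "Probability"]])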
approach_api/utils/news_summarisation.py ADDED
@@ -0,0 +1,25 @@
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Summarization Device: {device}")
+
+ model_ckpt = "google/pegasus-cnn_dailymail"
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
+
+ def summarize_text(text: str) -> str:
+     input_ids = tokenizer.encode(
+         text,
+         return_tensors="pt",
+         max_length=1024,
+         truncation=True,
+     ).to(device)
+     try:
+         summary_ids = model_pegasus.generate(input_ids, max_length=130, min_length=30, do_sample=False)
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         return summary
+     except RuntimeError as e:
+         print(f"Summarization Error: {e}")
+         return "Error: Could not generate summary due to length constraints."
+
approach_api/utils/text_to_speech.py ADDED
@@ -0,0 +1,20 @@
+ from gtts import gTTS
+ from deep_translator import GoogleTranslator
+
+ def text_to_speech(text):
+     """Translates English text to Hindi and converts it to Hindi speech using gTTS (cloud-based TTS)."""
+
+     # ✅ Translate English to Hindi
+     translated_text = GoogleTranslator(source="en", target="hi").translate(text)
+
+     # ✅ Hindi Voice (Using gTTS)
+     hindi_tts = gTTS(text=translated_text, lang="hi")
+     hindi_file = "output_hindi.mp3"
+     hindi_tts.save(hindi_file)
+
+     return hindi_file
+
+ # if __name__ == "__main__":
+ #     text = input("Enter text: ")
+ #     hindi_file = text_to_speech(text)
+ #     print(f"Hindi audio saved to: {hindi_file}")
approach_api/utils/topic_extraction.py ADDED
@@ -0,0 +1,64 @@
+ from gensim import corpora, models
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ import string
+ import nltk
+
+ # Download necessary NLTK resources
+ nltk.download("stopwords")
+ nltk.download("punkt")
+
+ def preprocess_text(text_data):
+     """
+     Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.
+
+     :param text_data: List of raw text documents
+     :return: List of preprocessed tokenized texts
+     """
+     stop_words = set(stopwords.words("english"))
+     processed_texts = [
+         [
+             word for word in word_tokenize(document.lower())
+             if word not in stop_words and word not in string.punctuation and word.isalpha()
+         ]
+         for document in text_data
+     ]
+     return processed_texts
+
+ def train_lda(texts, num_topics=3):
+     """
+     Trains an LDA model on the given preprocessed text data.
+
+     :param texts: List of tokenized texts
+     :param num_topics: Number of topics for the LDA model
+     :return: Trained LDA model and corresponding dictionary
+     """
+     dictionary = corpora.Dictionary(texts)
+     corpus = [dictionary.doc2bow(text) for text in texts]
+
+     ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
+
+     return ldamodel, dictionary
+
+ def extract_topic_words(ldamodel, num_topics=3, num_words=3):
+     """
+     Extracts meaningful words from each topic identified by the LDA model.
+
+     :param ldamodel: Trained LDA model
+     :param num_topics: Number of topics to extract
+     :param num_words: Number of words per topic to consider
+     :return: List of top words representing each topic
+     """
+     topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
+     topic_names = []
+
+     for topic in topics:
+         words = topic[1].split(" + ")
+         for word_data in words:
+             word = word_data.split("*")[1].strip('"')  # Extract word
+             if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
+                 topic_names.append(word)
+                 break  # Only take the top valid word
+
+     return list(set(topic_names))  # Ensure unique topics
+
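To make the intended call order of the three helpers above explicit, a short sketch on made-up documents (assumes the NLTK downloads above have completed):

from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words

# Clean and tokenise the raw documents, fit a small LDA model, then pull out top topic words.
docs = [
    "Google agreed to buy the cybersecurity startup Wiz for 32 billion dollars.",
    "The acquisition strengthens Google Cloud against rival security vendors.",
]
tokenized = preprocess_text(docs)
lda_model, dictionary = train_lda(tokenized, num_topics=2)
print(extract_topic_words(lda_model, num_topics=2, num_words=3))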
approach_library/api/api.py ADDED
@@ -0,0 +1,18 @@
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from main import analyze_company_news  # Import the function from main.py
+
+ app = FastAPI()
+
+ class CompanyRequest(BaseModel):
+     Company_Name: str
+
+ @app.post("/api/company")
+ async def handle_company(request: CompanyRequest):
+     company = request.Company_Name.strip()
+     result = analyze_company_news(company)
+     return result
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="127.0.0.1", port=8000)
approach_library/app.py ADDED
@@ -0,0 +1,91 @@
+ import streamlit as st
+ import json
+ import time
+ from utils.news_extraction import extract_news
+ from utils.news_summarisation import summarize_text
+ from utils.news_sentiment import analyze_sentiment
+ from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
+ from utils.comparative_analysis import comparative_sentiment_analysis
+ from utils.text_to_speech import text_to_speech
+ import os
+
+ def analyze_company_news(company):
+     st.write(f"Analyzing company: {company}")
+
+     with st.spinner("Fetching news articles..."):
+         articles = extract_news(company)
+         if not articles:
+             st.error("No news articles found. Try a different company.")
+             return None
+         st.write(f"Found {len(articles)} articles")
+
+     articles_data = []
+     texts = [article["text"] for article in articles]
+
+     with st.spinner("Performing sentiment analysis..."):
+         sentiment_results = analyze_sentiment(texts)
+         st.write(f"Sentiment analysis completed for {len(sentiment_results['Predicted Sentiment'])} articles")
+
+     for article, sentiment in zip(articles, sentiment_results["Predicted Sentiment"]):
+         summary = summarize_text(article["text"])
+         preprocessed_text = preprocess_text([article["text"]])
+         lda_model, dictionary = train_lda(preprocessed_text)
+         topic_words = extract_topic_words(lda_model)
+
+         articles_data.append({
+             "Title": article["title"],
+             "Summary": summary,
+             "Sentiment": sentiment,
+             "Topics": topic_words
+         })
+
+     with st.spinner("Performing comparative analysis..."):
+         analysis_result = comparative_sentiment_analysis(company, articles_data)
+         st.write("Comparative analysis completed")
+         st.write("Analysis result:", analysis_result)
+
+     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
+
+     with st.spinner("Generating Hindi TTS summary..."):
+         try:
+             audio_file = text_to_speech(final_summary)
+             if os.path.exists(audio_file):
+                 st.write(f"TTS summary generated: {audio_file}")
+             else:
+                 st.error("Failed to generate TTS summary")
+                 audio_file = None
+         except Exception as e:
+             st.error(f"TTS generation failed: {str(e)}")
+             audio_file = None
+
+     return {
+         "Company": company,
+         "Articles": articles_data,
+         "Comparative Sentiment Score": analysis_result,
+         "Audio": audio_file
+     }
+
+ st.title("Company News Analysis")
+ company = st.text_input("Enter the company name for analysis:")
+ if st.button("Analyze") and company:
+     st.write(f"Starting analysis for: {company}")
+     result = analyze_company_news(company)
+     if result:
+         st.subheader(f"Analysis for {result['Company']}")
+
+         for article in result["Articles"]:
+             st.write(f"**Title:** {article['Title']}")
+             st.write(f"**Summary:** {article['Summary']}")
+             st.write(f"**Sentiment:** {article['Sentiment']}")
+             st.write(f"**Topics:** {', '.join(article['Topics'])}")
+             st.markdown("---")
+
+         st.subheader("Comparative Sentiment Score")
+         st.json(result["Comparative Sentiment Score"])
+
+         st.subheader("Hindi TTS Summary")
+         if result["Audio"]:
+             st.audio(result["Audio"], format="audio/mp3")
+         else:
+             st.warning("TTS summary not available")
+
approach_library/main.py ADDED
@@ -0,0 +1,145 @@
+ # import json
+ # import time
+ # from news_extraction import extract_news
+ # from news_summarisation import summarize_text
+ # from news_sentiment import analyze_sentiment
+ # from topic_extraction import preprocess_text, train_lda, extract_topic_words
+ # from comparative_analysis import comparative_sentiment_analysis
+ # from text_to_speech import text_to_speech  # ✅ Import the TTS function
+
+ # def main():
+ #     # User input for the company/topic
+ #     company = input("Enter the company name for analysis: ").strip()
+
+ #     # Extract news articles
+ #     start_time = time.time()
+ #     articles = extract_news(company)
+ #     extraction_time = time.time() - start_time
+
+ #     if not articles:
+ #         print("No news articles found. Try a different company.")
+ #         return
+
+ #     articles_data = []  # List to store processed articles
+
+ #     # Extract texts from articles for sentiment analysis
+ #     texts = [article["text"] for article in articles]
+
+ #     # Perform sentiment analysis
+ #     start_time = time.time()
+ #     sentiment_results = analyze_sentiment(texts)
+ #     sentiment_time = time.time() - start_time
+
+ #     # Process each article
+ #     for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
+ #         start_time = time.time()
+ #         summary = summarize_text(article["text"])  # Summarize article
+ #         summarization_time = time.time() - start_time
+
+ #         # Extract topics for the specific article
+ #         preprocessed_text = preprocess_text([article["text"]])
+ #         lda_model, dictionary = train_lda(preprocessed_text)
+ #         topic_words = extract_topic_words(lda_model)
+
+ #         article_entry = {
+ #             "Title": article["title"],
+ #             "Summary": summary,
+ #             "Sentiment": sentiment,
+ #             "Topics": topic_words
+ #         }
+ #         articles_data.append(article_entry)
+
+ #     # Perform comparative sentiment analysis
+ #     analysis_result = comparative_sentiment_analysis(company, articles_data)
+
+ #     # ✅ Generate a summary speech for the entire report
+ #     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
+ #     audio_file = text_to_speech(final_summary)  # Generate Hindi TTS
+
+ #     # ✅ Construct final JSON output
+ #     output = {
+ #         "Company": company,
+ #         "Articles": articles_data,
+ #         "Comparative Sentiment Score": analysis_result,
+ #         "Final Sentiment Analysis": final_summary,
+ #         "Audio": f"[Play {audio_file}]"  # ✅ Include a playable reference
+ #     }
+
+ #     # Print JSON output
+ #     print(json.dumps(output, indent=4, ensure_ascii=False))
+
+ #     # Save JSON output to file
+ #     with open(f"{company}_news_analysis.json", "w", encoding="utf-8") as json_file:
+ #         json.dump(output, json_file, indent=4, ensure_ascii=False)
+
+ # if __name__ == "__main__":
+ #     main()
+
+ import json
+ import time
+ from utils.news_extraction import extract_news
+ from utils.news_summarisation import summarize_text
+ from utils.news_sentiment import analyze_sentiment
+ from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
+ from utils.comparative_analysis import comparative_sentiment_analysis
+ from utils.text_to_speech import text_to_speech  # ✅ Import the TTS function
+
+ def analyze_company_news(company):
+     # Extract news articles
+     start_time = time.time()
+     articles = extract_news(company)
+     extraction_time = time.time() - start_time
+
+     if not articles:
+         return {"message": "No news articles found. Try a different company."}
+
+     articles_data = []  # List to store processed articles
+
+     # Extract texts from articles for sentiment analysis
+     texts = [article["text"] for article in articles]
+
+     # Perform sentiment analysis
+     start_time = time.time()
+     sentiment_results = analyze_sentiment(texts)
+     sentiment_time = time.time() - start_time
+
+     # Process each article
+     for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
+         start_time = time.time()
+         summary = summarize_text(article["text"])  # Summarize article
+         summarization_time = time.time() - start_time
+
+         # Extract topics for the specific article
+         preprocessed_text = preprocess_text([article["text"]])
+         lda_model, dictionary = train_lda(preprocessed_text)
+         topic_words = extract_topic_words(lda_model)
+
+         article_entry = {
+             "Title": article["title"],
+             "Summary": summary,
+             "Sentiment": sentiment,
+             "Topics": topic_words
+         }
+         articles_data.append(article_entry)
+
+     # Perform comparative sentiment analysis
+     analysis_result = comparative_sentiment_analysis(company, articles_data)
+
+     # ✅ Generate a summary speech for the entire report
+     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
+     audio_file = text_to_speech(final_summary)  # Generate TTS
+
+     # ✅ Construct final JSON output
+     output = {
+         "Company": company,
+         "Articles": articles_data,
+         "Comparative Sentiment Score": analysis_result,
+         "Audio": f"[Play {audio_file}]"  # ✅ Include a playable reference
+     }
+
+     return output
+
+ # if __name__ == "__main__":
+ #     company = input("Enter the company name for analysis: ").strip()
+ #     result = analyze_company_news(company)
+ #     print(json.dumps(result, indent=4, ensure_ascii=False))
approach_library/utils/comparative_analysis.py ADDED
@@ -0,0 +1,90 @@
+
+ import json
+ from collections import Counter
+
+ def comparative_sentiment_analysis(company, articles, max_comparisons=10, chunk_size=5):
+     """
+     Perform a comparative sentiment analysis on multiple articles.
+     """
+     overall_sentiment_counts = Counter()
+     overall_coverage_differences = []
+     all_topics = []
+
+     if not articles:
+         print("No articles found for analysis.")
+         return {
+             "Company": company,
+             "Articles": [],
+             "Comparative Sentiment Score": {
+                 "Sentiment Distribution": {},
+                 "Coverage Differences": [],
+                 "Topic Overlap": {"Common Topics": [], "Unique Topics Per Article": []}
+             },
+             "Final Sentiment Analysis": "No data available."
+         }
+
+     # Process articles in chunks
+     for start in range(0, len(articles), chunk_size):
+         chunk = articles[start:start + chunk_size]
+
+         # Count sentiment distribution (normalise labels so keys match the "Positive"/"Negative" checks below)
+         sentiment_counts = Counter(article["Sentiment"].capitalize() for article in chunk)
+         overall_sentiment_counts.update(sentiment_counts)
+
+         # Identify coverage differences
+         for i in range(len(chunk) - 1):
+             for j in range(i + 1, len(chunk)):
+                 if len(overall_coverage_differences) >= max_comparisons:
+                     break
+                 article1, article2 = chunk[i], chunk[j]
+                 comparison = {
+                     "Comparison": f"'{article1.get('Title', 'Article 1')}' vs '{article2.get('Title', 'Article 2')}'",
+                     "Impact": f"{article1.get('Topics', [])} vs {article2.get('Topics', [])}"
+                 }
+                 overall_coverage_differences.append(comparison)
+
+         # Extract topics, keeping only articles with a valid, non-empty list
+         topics = [set(article.get("Topics", [])) for article in chunk if isinstance(article.get("Topics"), list) and article.get("Topics")]
+         all_topics.extend(topics)
+
+     # Debugging output
+     print("All Topics Extracted:", all_topics)
+
+     # Determine common and unique topics
+     if len(all_topics) == 0:
+         common_topics = set()  # No topics found
+     elif len(all_topics) == 1:
+         common_topics = all_topics[0]  # Only one article, take its topics as common
+     else:
+         common_topics = set.intersection(*all_topics)  # Find the intersection normally
+
+     unique_topics = [{"Article": i + 1, "Unique Topics": list(topics - common_topics)}
+                      for i, topics in enumerate(all_topics)]
+
+     # Convert to list for JSON output
+     common_topics = list(common_topics)
+
+     print("Common Topics:", common_topics)
+
+     # Final sentiment summary
+     final_analysis = "The news coverage is mostly "
+     if overall_sentiment_counts["Positive"] > overall_sentiment_counts["Negative"]:
+         final_analysis += "positive, indicating potential growth."
+     elif overall_sentiment_counts["Negative"] > overall_sentiment_counts["Positive"]:
+         final_analysis += "negative, suggesting challenges ahead."
+     else:
+         final_analysis += "balanced, with mixed reactions."
+
+
+     return {
+         "Comparative Sentiment Score": {
+             "Sentiment Distribution": dict(overall_sentiment_counts),
+             "Coverage Differences": overall_coverage_differences,
+             "Topic Overlap": {
+                 "Common Topics": common_topics,
+                 "Unique Topics Per Article": unique_topics
+             }
+         },
+         "Final Sentiment Analysis": final_analysis
+     }
+
approach_library/utils/news_extraction.py ADDED
@@ -0,0 +1,51 @@
+ import nest_asyncio
+ from typing import List, Dict
+ from duckduckgo_search import DDGS
+ from phi.tools.newspaper4k import Newspaper4k
+ import time
+
+ nest_asyncio.apply()
+
+ def extract_news(article_topic: str, num_search_results: int = 15, max_retries: int = 3) -> List[Dict[str, str]]:
+     """
+     Extracts full news articles based on the given topic and number of search results.
+
+     Args:
+         article_topic: The topic to search for.
+         num_search_results: The number of search results to retrieve.
+         max_retries: The maximum number of retries if an article fails to scrape.
+
+     Returns:
+         A list of dictionaries, where each dictionary represents a news article.
+     """
+     news_results = []
+     ddgs = DDGS()
+     newspaper_tools = Newspaper4k()
+
+     results = ddgs.news(keywords=article_topic, max_results=num_search_results)  # Fetch extra results
+
+     for r in results:
+         if "url" in r:
+             retries = 0
+             while retries < max_retries:
+                 try:
+                     article_data = newspaper_tools.get_article_data(r["url"])
+
+                     if article_data and "text" in article_data and len(article_data["text"]) > 100:
+                         news_results.append({
+                             "title": r.get("title", "No Title"),
+                             "text": article_data["text"]  # Full article text
+                         })
+                         break  # Successful extraction, break retry loop
+                     else:
+                         retries += 1
+                         time.sleep(1)  # Wait before retrying
+                 except Exception as e:
+                     retries += 1
+                     time.sleep(1)
+
+         # Stop if we have collected enough articles
+         if len(news_results) >= num_search_results:
+             break
+
+     return news_results
approach_library/utils/news_sentiment.py ADDED
@@ -0,0 +1,54 @@
+ import torch
+ import scipy.special
+ import pandas as pd
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ # Load the sentiment model and tokenizer (Twitter-RoBERTa base sentiment checkpoint)
+ finbert_ckpt = "cardiffnlp/twitter-roberta-base-sentiment"
+ tokenizer = AutoTokenizer.from_pretrained(finbert_ckpt)
+ model_finbert = AutoModelForSequenceClassification.from_pretrained(finbert_ckpt).to("cuda" if torch.cuda.is_available() else "cpu")
+
+ def analyze_sentiment(text_list):
+     """Performs sentiment analysis on a list of texts using the Twitter-RoBERTa sentiment model."""
+     preds = []
+     preds_proba = []
+
+     tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}
+
+     for text in text_list:
+         with torch.no_grad():
+             # Tokenize the input
+             input_sequence = tokenizer(text, return_tensors="pt", **tokenizer_kwargs).to(model_finbert.device)
+             logits = model_finbert(**input_sequence).logits.cpu().numpy().squeeze()
+
+         # Convert logits to probabilities
+         scores = {
+             k: v for k, v in zip(
+                 model_finbert.config.id2label.values(),
+                 scipy.special.softmax(logits)
+             )
+         }
+
+         # Get the most probable sentiment
+         sentiment = max(scores, key=scores.get)
+         probability = max(scores.values())
+
+         # Map the model's label ids to readable sentiment names
+         if sentiment == 'LABEL_2':
+             sentiment = 'positive'
+         elif sentiment == 'LABEL_0':
+             sentiment = 'negative'
+         else:
+             sentiment = 'neutral'
+
+         preds.append(sentiment)
+         preds_proba.append(probability)
+
+     # Return a DataFrame with results
+     df_results = pd.DataFrame({
+         "Text": text_list,
+         "Predicted Sentiment": preds,
+         "Probability": preds_proba
+     })
+
+     return df_results
approach_library/utils/news_summarisation.py ADDED
@@ -0,0 +1,26 @@
+
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Summarization Device: {device}")
+
+ model_ckpt = "google/pegasus-cnn_dailymail"
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
+
+ def summarize_text(text: str) -> str:
+     input_ids = tokenizer.encode(
+         text,
+         return_tensors="pt",
+         max_length=1024,
+         truncation=True,
+     ).to(device)
+     try:
+         summary_ids = model_pegasus.generate(input_ids, max_length=130, min_length=30, do_sample=False)
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         return summary
+     except RuntimeError as e:
+         print(f"Summarization Error: {e}")
+         return "Error: Could not generate summary due to length constraints."
+
approach_library/utils/text_to_speech.py ADDED
@@ -0,0 +1,20 @@
+ from gtts import gTTS
+ from deep_translator import GoogleTranslator
+
+ def text_to_speech(text):
+     """Translates English text to Hindi and converts it to Hindi speech using gTTS (cloud-based TTS)."""
+
+     # ✅ Translate English to Hindi
+     translated_text = GoogleTranslator(source="en", target="hi").translate(text)
+
+     # ✅ Hindi Voice (Using gTTS)
+     hindi_tts = gTTS(text=translated_text, lang="hi")
+     hindi_file = "output_hindi.mp3"
+     hindi_tts.save(hindi_file)
+
+     return hindi_file
+
+ # if __name__ == "__main__":
+ #     text = input("Enter text: ")
+ #     hindi_file = text_to_speech(text)
+ #     print(f"Hindi audio saved to: {hindi_file}")
approach_library/utils/topic_extraction.py ADDED
@@ -0,0 +1,64 @@
+ from gensim import corpora, models
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ import string
+ import nltk
+
+ # Download necessary NLTK resources
+ nltk.download("stopwords")
+ nltk.download("punkt")
+
+ def preprocess_text(text_data):
+     """
+     Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.
+
+     :param text_data: List of raw text documents
+     :return: List of preprocessed tokenized texts
+     """
+     stop_words = set(stopwords.words("english"))
+     processed_texts = [
+         [
+             word for word in word_tokenize(document.lower())
+             if word not in stop_words and word not in string.punctuation and word.isalpha()
+         ]
+         for document in text_data
+     ]
+     return processed_texts
+
+ def train_lda(texts, num_topics=3):
+     """
+     Trains an LDA model on the given preprocessed text data.
+
+     :param texts: List of tokenized texts
+     :param num_topics: Number of topics for the LDA model
+     :return: Trained LDA model and corresponding dictionary
+     """
+     dictionary = corpora.Dictionary(texts)
+     corpus = [dictionary.doc2bow(text) for text in texts]
+
+     ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
+
+     return ldamodel, dictionary
+
+ def extract_topic_words(ldamodel, num_topics=3, num_words=3):
+     """
+     Extracts meaningful words from each topic identified by the LDA model.
+
+     :param ldamodel: Trained LDA model
+     :param num_topics: Number of topics to extract
+     :param num_words: Number of words per topic to consider
+     :return: List of top words representing each topic
+     """
+     topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
+     topic_names = []
+
+     for topic in topics:
+         words = topic[1].split(" + ")
+         for word_data in words:
+             word = word_data.split("*")[1].strip('"')  # Extract word
+             if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
+                 topic_names.append(word)
+                 break  # Only take the top valid word
+
+     return list(set(topic_names))  # Ensure unique topics
+
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ nest_asyncio
+ duckduckgo-search
+ newspaper4k
+ transformers
+ phidata
+ lxml_html_clean
+ datasets
+ pandas
+ nltk
+ torch
+ tqdm
+ GoogleNews
+ pygooglenews
+ feedparser
+ googlesearch-python
+ soundfile
+ gtts
+ deep_translator
+ fastapi
+ pydantic
+ uvicorn
+ python-magic
+ streamlit
+ # Required by the utils modules
+ requests
+ beautifulsoup4
+ scipy
+ gensim