v1shal committed
Commit b396e94 · 1 Parent(s): 3f6c8e2

first_commit

.gitignore ADDED
@@ -0,0 +1,43 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ env/
+ myenv/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Environment files
+ .env
+ .env.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
+
+ # Logs
+ *.log
+ logs/
approach_api/api/api.py ADDED
@@ -0,0 +1,18 @@
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from main import analyze_company_news
+
+ app = FastAPI()
+
+ class CompanyRequest(BaseModel):
+     Company_Name: str
+
+ @app.post("/api/company")
+ async def handle_company(request: CompanyRequest):
+     company = request.Company_Name.strip()
+     result = analyze_company_news(company)
+     return result
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="127.0.0.1", port=8000)
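A quick way to exercise this endpoint once the app is running is a small client script like the sketch below; "Tesla" is only a placeholder company name, and the host and port mirror the uvicorn call above.

import requests

# POST a company name to the running FastAPI app and print the returned analysis JSON.
response = requests.post(
    "http://127.0.0.1:8000/api/company",
    json={"Company_Name": "Tesla"},  # field name matches the CompanyRequest model
    timeout=300,  # the pipeline loads large models, so allow a generous timeout
)
response.raise_for_status()
print(response.json())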
approach_api/main.py ADDED
@@ -0,0 +1,198 @@
+ # import json
+ # import time
+ # from utils.news_extraction_api import fetch_articles
+ # from utils.news_summarisation import summarize_text
+ # from utils.news_sentiment import analyze_sentiment
+ # from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
+ # from utils.comparative_analysis import comparative_sentiment_analysis
+ # from utils.text_to_speech import text_to_speech
+
+ # def main():
+ #     company = input("Enter the company name for analysis: ").strip()
+
+ #     # Extract news articles
+ #     start_time = time.time()
+ #     articles = fetch_articles(company, num_articles=2)  # Fetch 2 articles
+ #     extraction_time = time.time() - start_time
+ #     print(f"✅ Articles extracted in {extraction_time:.2f} seconds")
+
+ #     if not articles:
+ #         print("⚠️ No news articles found. Try a different company.")
+ #         return
+
+ #     articles_data = []
+ #     all_topics = []  # Collect all topics for better analysis
+
+ #     for article in articles:
+ #         text = article.get("content", "").strip()
+
+ #         if not text:
+ #             print(f"⚠️ Skipping article '{article.get('title', 'No Title')}' due to missing content.")
+ #             continue
+
+ #         # Perform sentiment analysis
+ #         start_time = time.time()
+ #         sentiment_result = analyze_sentiment([text])
+ #         sentiment = sentiment_result.get("Predicted Sentiment", ["Unknown"])[0]
+ #         sentiment_time = time.time() - start_time
+ #         print(f"✅ Sentiment analysis completed in {sentiment_time:.2f} seconds")
+
+ #         # Summarize the article
+ #         start_time = time.time()
+ #         summary = summarize_text(text)
+ #         summary_time = time.time() - start_time
+ #         print(f"✅ Summary generation completed in {summary_time:.2f} seconds")
+
+ #         # Extract topics
+ #         start_time = time.time()
+ #         preprocessed_text = preprocess_text([text])
+ #         if not preprocessed_text:
+ #             print(f"⚠️ No meaningful text extracted for LDA topic modeling in '{article.get('title', 'No Title')}'.")
+ #             topic_words = []
+ #         else:
+ #             lda_model, dictionary = train_lda(preprocessed_text)
+ #             topic_words = extract_topic_words(lda_model)
+ #         topic_time = time.time() - start_time
+ #         print(f"✅ Topic extraction completed in {topic_time:.2f} seconds")
+
+ #         # Store processed data
+ #         articles_data.append({
+ #             "Title": article.get("title", "No Title"),
+ #             "Summary": summary,
+ #             "Sentiment": sentiment,
+ #             "Topics": topic_words if topic_words else []
+ #         })
+
+ #         # Collect topics for comparative analysis
+ #         if topic_words:
+ #             all_topics.extend(topic_words)
+
+ #     # Ensure articles_data is not empty before analysis
+ #     if not articles_data:
+ #         print("⚠️ No valid articles with content were processed.")
+ #         return
+
+ #     # Perform comparative sentiment analysis
+ #     start_time = time.time()
+ #     analysis_result = comparative_sentiment_analysis(company, articles_data)
+ #     analysis_time = time.time() - start_time
+ #     print(f"✅ Comparative sentiment analysis completed in {analysis_time:.2f} seconds")
+
+ #     # Correctly extract "Comparative Sentiment Score"
+ #     comparative_score = analysis_result.get("Comparative Sentiment Score", {})
+
+ #     sentiment_distribution = comparative_score.get("Sentiment Distribution", {})
+ #     coverage_differences = comparative_score.get("Coverage Differences", {})
+ #     topic_overlap = comparative_score.get("Topic Overlap", [])
+
+ #     # Debugging check
+ #     if not sentiment_distribution:
+ #         print("⚠️ No sentiment distribution detected.")
+ #     if not coverage_differences:
+ #         print("⚠️ No coverage differences found.")
+ #     if not topic_overlap:
+ #         print("⚠️ No topic overlap detected among articles.")
+
+ #     # Final sentiment summary
+ #     final_sentiment_analysis = analysis_result.get("Final Sentiment Analysis", "Analysis could not be completed.")
+
+ #     # Generate summary speech
+ #     start_time = time.time()
+ #     final_summary = f"{company}’s latest news coverage is mostly {final_sentiment_analysis}."
+ #     audio_file = text_to_speech(final_summary)
+ #     audio_time = time.time() - start_time
+ #     print(f"✅ Summary speech generation completed in {audio_time:.2f} seconds")
+
+ #     # Construct final JSON output
+ #     output = {
+ #         "Company": company,
+ #         "Articles": articles_data,
+ #         "Comparative Sentiment Score": {
+ #             "Sentiment Distribution": sentiment_distribution,
+ #             "Coverage Differences": coverage_differences,
+ #             "Topic Overlap": topic_overlap
+ #         },
+ #         "Extracted Topics": list(set(all_topics)),  # Unique topics across articles
+ #         "Final Sentiment Analysis": final_summary,
+ #         "Audio": f"[Play {audio_file}]"
+ #     }
+
+ #     # Print JSON output
+ #     print(json.dumps(output, indent=4, ensure_ascii=False))
+
+ #     # Save JSON output
+ #     with open(f"{company}_news_analysis.json", "w", encoding="utf-8") as json_file:
+ #         json.dump(output, json_file, indent=4, ensure_ascii=False)
+
+ # if __name__ == "__main__":
+ #     main()
+
+
+ import json
+ import time
+ from utils.news_extraction_api import extract_news
+ from utils.news_summarisation import summarize_text
+ from utils.news_sentiment import analyze_sentiment
+ from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
+ from utils.comparative_analysis import comparative_sentiment_analysis
+ from utils.text_to_speech import text_to_speech
+
+ def analyze_company_news(company):
+     # Extract news articles
+     start_time = time.time()
+     articles = extract_news(company)
+     extraction_time = time.time() - start_time
+
+     if not articles:
+         return {"message": "No news articles found. Try a different company."}
+
+     articles_data = []  # List to store processed articles
+
+     # Extract texts from articles for sentiment analysis
+     texts = [article["content"] for article in articles]
+
+     # Perform sentiment analysis
+     start_time = time.time()
+     sentiment_results = analyze_sentiment(texts)
+     sentiment_time = time.time() - start_time
+
+     # Process each article
+     for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
+         start_time = time.time()
+         summary = summarize_text(article["content"])  # Summarize article
+         summarization_time = time.time() - start_time
+
+         # Extract topics for the specific article
+         preprocessed_text = preprocess_text([article["content"]])
+         lda_model, dictionary = train_lda(preprocessed_text)
+         topic_words = extract_topic_words(lda_model)
+
+         article_entry = {
+             "Title": article["title"],
+             "Summary": summary,
+             "Sentiment": sentiment,
+             "Topics": topic_words
+         }
+         articles_data.append(article_entry)
+
+     # Perform comparative sentiment analysis
+     analysis_result = comparative_sentiment_analysis(company, articles_data)
+
+     # Generate a summary speech for the entire report
+     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
+     audio_file = text_to_speech(final_summary)  # Generate TTS
+
+     # Construct final JSON output
+     output = {
+         "Company": company,
+         "Articles": articles_data,
+         "Comparative Sentiment Score": analysis_result,
+         "Audio": f"[Play {audio_file}]"  # Include a playable reference
+     }
+
+     return output
+
+ if __name__ == "__main__":
+     company = input("Enter the company name for analysis: ").strip()
+     result = analyze_company_news(company)
+     print(json.dumps(result, indent=4, ensure_ascii=False))
approach_api/utils/comparative_analysis.py ADDED
@@ -0,0 +1,121 @@
+
+ import json
+ from collections import Counter
+
+ def comparative_sentiment_analysis(company, articles, max_comparisons=10, chunk_size=5):
+     """
+     Perform a comparative sentiment analysis on multiple articles.
+     """
+     overall_sentiment_counts = Counter()
+     overall_coverage_differences = []
+     all_topics = []
+
+     if not articles:
+         print("No articles found for analysis.")
+         return {
+             "Company": company,
+             "Articles": [],
+             "Comparative Sentiment Score": {
+                 "Sentiment Distribution": {},
+                 "Coverage Differences": [],
+                 "Topic Overlap": {"Common Topics": [], "Unique Topics Per Article": []}
+             },
+             "Final Sentiment Analysis": "No data available."
+         }
+
+     # Process articles in chunks
+     for start in range(0, len(articles), chunk_size):
+         chunk = articles[start:start + chunk_size]
+
+         # Count sentiment distribution (normalise labels so keys match the "Positive"/"Negative" checks below)
+         sentiment_counts = Counter(article["Sentiment"].capitalize() for article in chunk)
+         overall_sentiment_counts.update(sentiment_counts)
+
+         # Identify coverage differences
+         for i in range(len(chunk) - 1):
+             for j in range(i + 1, len(chunk)):
+                 if len(overall_coverage_differences) >= max_comparisons:
+                     break
+                 article1, article2 = chunk[i], chunk[j]
+                 comparison = {
+                     "Comparison": f"'{article1.get('Title', 'Article 1')}' vs '{article2.get('Title', 'Article 2')}'",
+                     "Impact": f"{article1.get('Topics', [])} vs {article2.get('Topics', [])}"
+                 }
+                 overall_coverage_differences.append(comparison)
+
+         # Extract topics, keeping only articles with a valid, non-empty list
+         topics = [set(article.get("Topics", [])) for article in chunk if isinstance(article.get("Topics"), list) and article.get("Topics")]
+         all_topics.extend(topics)
+
+     # Debugging output
+     print("All Topics Extracted:", all_topics)
+
+     # Determine common and unique topics
+     if len(all_topics) == 0:
+         common_topics = set()  # No topics found
+     elif len(all_topics) == 1:
+         common_topics = all_topics[0]  # Only one article, take its topics as common
+     else:
+         common_topics = set.intersection(*all_topics)  # Find the intersection normally
+
+     unique_topics = [{"Article": i + 1, "Unique Topics": list(topics - common_topics)}
+                      for i, topics in enumerate(all_topics)]
+
+     # Convert to list for JSON output
+     common_topics = list(common_topics)
+
+     print("Common Topics:", common_topics)
+
+     # Final sentiment summary
+     final_analysis = "The news coverage is mostly "
+     if overall_sentiment_counts["Positive"] > overall_sentiment_counts["Negative"]:
+         final_analysis += "positive, indicating potential growth."
+     elif overall_sentiment_counts["Negative"] > overall_sentiment_counts["Positive"]:
+         final_analysis += "negative, suggesting challenges ahead."
+     else:
+         final_analysis += "balanced, with mixed reactions."
+
+     # Final JSON structure
+     return {
+         "Comparative Sentiment Score": {
+             "Sentiment Distribution": dict(overall_sentiment_counts),
+             "Coverage Differences": overall_coverage_differences,
+             "Topic Overlap": {
+                 "Common Topics": common_topics,
+                 "Unique Topics Per Article": unique_topics
+             }
+         },
+         "Final Sentiment Analysis": final_analysis
+     }
+
+ # if __name__ == "__main__":
+ #     articles = [
+ #         {
+ #             "Title": "Agentic AI startup AMT aims to be 'Google Adwords for influencers,' raises seed round",
+ #             "Summary": "Agentic Marketing Technologies (AMT) has raised $3.5 million in a seed funding round led by San Francisco-based VC NFX .<n>AMT works by getting its AI agent, dubbed Lyra, to talk to influencers using natural language .<n>The company claims Lyra can also autonomously find influencers that match a campaign’s goals .",
+ #             "Sentiment": "neutral",
+ #             "Topics": [
+ #                 "influencer",
+ #                 "marketing"
+ #             ]
+ #         },
+ #         {
+ #             "Title": "Google Seals $32 Billion Deal for Cyber Start-Up Wiz",
+ #             "Summary": "Google agreed to buy Wiz, a fast-growing cybersecurity start-up, for $32 billion .<n>The all-cash deal would be Google's largest, easily surpassing its $12.5 billion purchase of Motorola Mobility in 2012 .<n>In July, Wiz rejected Google’s $23 billion takeover offer, saying it wanted to pursue an initial public offering .",
+ #             "Sentiment": "neutral",
+ #             "Topics": [
+ #                 "wiz",
+ #                 "google"
+ #             ]
+ #         },
+ #         {
+ #             "Title": "Google's new Severance Easter egg is one only innies will understand",
+ #             "Summary": "Just search for Severance and Google will pepper your screen with blue balloons .<n>Severance producer and frequent director Ben Stiller shared his show’s new Easter egg on X last night .<n>Severance’s season two finale airs this Friday on Apple TV Plus .",
+ #             "Sentiment": "positive",
+ #             "Topics": [
+ #                 "severance"
+ #             ]
+ #         }
+ #     ]
+ #     result = comparative_sentiment_analysis(articles)
+ #     print(json.dumps(result, indent=4))
approach_api/utils/news_extraction_api.py ADDED
@@ -0,0 +1,109 @@
+ import requests
+ from bs4 import BeautifulSoup
+
+ # NewsAPI key (ideally loaded from an environment variable rather than hard-coded)
+ NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
+
+ def extract_news(company, num_articles=2):
+     """Fetch multiple news articles from NewsAPI and return titles and contents."""
+     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
+     response = requests.get(url, timeout=10)
+
+     if response.status_code != 200:
+         print("Error:", response.status_code, response.text)
+         return []
+
+     data = response.json()
+     articles = data.get("articles", [])
+
+     if not articles:
+         print("No articles found.")
+         return []
+
+     extracted_articles = []
+
+     for article in articles[:num_articles]:  # Get the required number of articles
+         article_url = article.get("url", "No URL available.")
+
+         # Scrape the article for title and content
+         article_response = requests.get(article_url, timeout=10)
+         if article_response.status_code == 200:
+             soup = BeautifulSoup(article_response.content, 'html.parser')
+             title = soup.title.string if soup.title else "No Title Found"
+
+             # Extract paragraphs and clean the content
+             paragraphs = soup.find_all('p')
+             content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
+
+             # Optionally, filter out unwanted text patterns
+             unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
+             for pattern in unwanted_patterns:
+                 content = content.replace(pattern, "")
+
+             # Clean up extra spaces
+             content = ' '.join(content.split())
+
+             extracted_articles.append({"title": title, "content": content})
+
+     return extracted_articles
+
+
+ # import requests
+ # from bs4 import BeautifulSoup
+
+ # # NewsAPI Key
+ # NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
+
+ # def fetch_articles(company, num_articles=11):
+ #     """Fetch multiple news articles from NewsAPI and return their titles and content."""
+ #     url = f"https://newsapi.org/v2/everything?q={company}&apiKey={NEWS_API_KEY}&language=en&pageSize={num_articles}"
+ #     response = requests.get(url)
+
+ #     if response.status_code != 200:
+ #         print("Error:", response.status_code, response.text)
+ #         return []
+
+ #     data = response.json()
+ #     articles = data.get("articles", [])
+
+ #     if not articles:
+ #         print("No articles found.")
+ #         return []
+
+ #     fetched_articles = []
+
+ #     for article in articles[:num_articles]:  # Fetch only the required number of articles
+ #         article_url = article.get("url")
+ #         if not article_url:
+ #             continue
+
+ #         # Scrape the article for title and content
+ #         try:
+ #             article_response = requests.get(article_url, timeout=5)  # Removed headers
+ #             if article_response.status_code == 200:
+ #                 soup = BeautifulSoup(article_response.content, 'html.parser')
+ #                 title = soup.title.string if soup.title else "No Title Found"
+
+ #                 # Extract paragraphs and clean the content
+ #                 paragraphs = soup.find_all('p')
+ #                 content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
+
+ #                 # Remove unwanted text patterns
+ #                 unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
+ #                 for pattern in unwanted_patterns:
+ #                     content = content.replace(pattern, "")
+
+ #                 # Clean up extra spaces
+ #                 content = ' '.join(content.split())
+
+ #                 # Store the article's title and content
+ #                 fetched_articles.append({"title": title, "content": content})
+ #         except requests.exceptions.RequestException as e:
+ #             print(f"Error fetching article: {e}")
+
+ #     return fetched_articles
+
+ # if __name__ == "__main__":
+ #     company = input("Enter the company name for analysis: ").strip()
+ #     articles = fetch_articles(company, num_articles=11)
+ #     print(articles)
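For reference, a minimal sketch of how extract_news is typically driven (an assumption based on the call in approach_api/main.py; "Microsoft" is only an example query, and a valid NewsAPI key plus network access are required):

from utils.news_extraction_api import extract_news

# Fetch two articles about an example company and inspect what came back.
articles = extract_news("Microsoft", num_articles=2)
for article in articles:
    print(article["title"])
    print(article["content"][:200], "...")  # first 200 characters of the scraped body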
approach_api/utils/news_sentiment.py ADDED
@@ -0,0 +1,54 @@
+ import torch
+ import scipy.special
+ import pandas as pd
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ # Load the sentiment model and tokenizer (Twitter-RoBERTa base sentiment checkpoint)
+ finbert_ckpt = "cardiffnlp/twitter-roberta-base-sentiment"
+ tokenizer = AutoTokenizer.from_pretrained(finbert_ckpt)
+ model_finbert = AutoModelForSequenceClassification.from_pretrained(finbert_ckpt).to("cuda" if torch.cuda.is_available() else "cpu")
+
+ def analyze_sentiment(text_list):
+     """Performs sentiment analysis on a list of texts using the Twitter-RoBERTa sentiment model."""
+     preds = []
+     preds_proba = []
+
+     tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}
+
+     for text in text_list:
+         with torch.no_grad():
+             # Tokenize the input
+             input_sequence = tokenizer(text, return_tensors="pt", **tokenizer_kwargs).to(model_finbert.device)
+             logits = model_finbert(**input_sequence).logits.cpu().numpy().squeeze()
+
+         # Convert logits to probabilities
+         scores = {
+             k: v for k, v in zip(
+                 model_finbert.config.id2label.values(),
+                 scipy.special.softmax(logits)
+             )
+         }
+
+         # Get the most probable sentiment
+         sentiment = max(scores, key=scores.get)
+         probability = max(scores.values())
+
+         # Map the model's label ids to readable sentiment names
+         if sentiment == 'LABEL_2':
+             sentiment = 'positive'
+         elif sentiment == 'LABEL_0':
+             sentiment = 'negative'
+         else:
+             sentiment = 'neutral'
+
+         preds.append(sentiment)
+         preds_proba.append(probability)
+
+     # Return a DataFrame with results
+     df_results = pd.DataFrame({
+         "Text": text_list,
+         "Predicted Sentiment": preds,
+         "Probability": preds_proba
+     })
+
+     return df_results
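A small usage sketch for analyze_sentiment; the sample headlines are made up, and the first call downloads the cardiffnlp checkpoint from the Hugging Face Hub:

from utils.news_sentiment import analyze_sentiment

# Two toy headlines; the function returns a pandas DataFrame with one row per input text.
df = analyze_sentiment([
    "The company reported record profits this quarter.",
    "Regulators opened an investigation into the firm.",
])
print(df[["Predicted Sentiment", "Probability"]])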
approach_api/utils/news_summarisation.py ADDED
@@ -0,0 +1,25 @@
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Summarization Device: {device}")
+
+ model_ckpt = "google/pegasus-cnn_dailymail"
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
+
+ def summarize_text(text: str) -> str:
+     input_ids = tokenizer.encode(
+         text,
+         return_tensors="pt",
+         max_length=1024,
+         truncation=True,
+     ).to(device)
+     try:
+         summary_ids = model_pegasus.generate(input_ids, max_length=130, min_length=30, do_sample=False)
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         return summary
+     except RuntimeError as e:
+         print(f"Summarization Error: {e}")
+         return "Error: Could not generate summary due to length constraints."
+
approach_api/utils/text_to_speech.py ADDED
@@ -0,0 +1,20 @@
+ from gtts import gTTS
+ from deep_translator import GoogleTranslator
+
+ def text_to_speech(text):
+     """Translates English text to Hindi and converts it to Hindi speech using gTTS (cloud-based TTS)."""
+
+     # ✅ Translate English to Hindi
+     translated_text = GoogleTranslator(source="en", target="hi").translate(text)
+
+     # ✅ Hindi Voice (Using gTTS)
+     hindi_tts = gTTS(text=translated_text, lang="hi")
+     hindi_file = "output_hindi.mp3"
+     hindi_tts.save(hindi_file)
+
+     return hindi_file
+
+ # if __name__ == "__main__":
+ #     text = input("Enter text: ")
+ #     hindi_file = text_to_speech(text)
+ #     print(f"Hindi audio saved to: {hindi_file}")
approach_api/utils/topic_extraction.py ADDED
@@ -0,0 +1,64 @@
+ from gensim import corpora, models
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ import string
+ import nltk
+
+ # Download necessary NLTK resources
+ nltk.download("stopwords")
+ nltk.download("punkt")
+
+ def preprocess_text(text_data):
+     """
+     Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.
+
+     :param text_data: List of raw text documents
+     :return: List of preprocessed tokenized texts
+     """
+     stop_words = set(stopwords.words("english"))
+     processed_texts = [
+         [
+             word for word in word_tokenize(document.lower())
+             if word not in stop_words and word not in string.punctuation and word.isalpha()
+         ]
+         for document in text_data
+     ]
+     return processed_texts
+
+ def train_lda(texts, num_topics=3):
+     """
+     Trains an LDA model on the given preprocessed text data.
+
+     :param texts: List of tokenized texts
+     :param num_topics: Number of topics for the LDA model
+     :return: Trained LDA model and corresponding dictionary
+     """
+     dictionary = corpora.Dictionary(texts)
+     corpus = [dictionary.doc2bow(text) for text in texts]
+
+     ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
+
+     return ldamodel, dictionary
+
+ def extract_topic_words(ldamodel, num_topics=3, num_words=3):
+     """
+     Extracts meaningful words from each topic identified by the LDA model.
+
+     :param ldamodel: Trained LDA model
+     :param num_topics: Number of topics to extract
+     :param num_words: Number of words per topic to consider
+     :return: List of top words representing each topic
+     """
+     topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
+     topic_names = []
+
+     for topic in topics:
+         words = topic[1].split(" + ")
+         for word_data in words:
+             word = word_data.split("*")[1].strip('"')  # Extract word
+             if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
+                 topic_names.append(word)
+                 break  # Only take the top valid word
+
+     return list(set(topic_names))  # Ensure unique topics
+
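To make the intended call order of the three helpers above explicit, a short sketch on made-up documents (assumes the NLTK downloads above have completed):

from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words

# Clean and tokenise the raw documents, fit a small LDA model, then pull out top topic words.
docs = [
    "Google agreed to buy the cybersecurity startup Wiz for 32 billion dollars.",
    "The acquisition strengthens Google Cloud against rival security vendors.",
]
tokenized = preprocess_text(docs)
lda_model, dictionary = train_lda(tokenized, num_topics=2)
print(extract_topic_words(lda_model, num_topics=2, num_words=3))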
approach_library/api/api.py ADDED
@@ -0,0 +1,18 @@
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from main import analyze_company_news  # Import the function from main.py
+
+ app = FastAPI()
+
+ class CompanyRequest(BaseModel):
+     Company_Name: str
+
+ @app.post("/api/company")
+ async def handle_company(request: CompanyRequest):
+     company = request.Company_Name.strip()
+     result = analyze_company_news(company)
+     return result
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="127.0.0.1", port=8000)
approach_library/app.py ADDED
@@ -0,0 +1,91 @@
+ import streamlit as st
+ import json
+ import time
+ from utils.news_extraction import extract_news
+ from utils.news_summarisation import summarize_text
+ from utils.news_sentiment import analyze_sentiment
+ from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
+ from utils.comparative_analysis import comparative_sentiment_analysis
+ from utils.text_to_speech import text_to_speech
+ import os
+
+ def analyze_company_news(company):
+     st.write(f"Analyzing company: {company}")
+
+     with st.spinner("Fetching news articles..."):
+         articles = extract_news(company)
+         if not articles:
+             st.error("No news articles found. Try a different company.")
+             return None
+         st.write(f"Found {len(articles)} articles")
+
+     articles_data = []
+     texts = [article["text"] for article in articles]
+
+     with st.spinner("Performing sentiment analysis..."):
+         sentiment_results = analyze_sentiment(texts)
+         st.write(f"Sentiment analysis completed for {len(sentiment_results['Predicted Sentiment'])} articles")
+
+     for article, sentiment in zip(articles, sentiment_results["Predicted Sentiment"]):
+         summary = summarize_text(article["text"])
+         preprocessed_text = preprocess_text([article["text"]])
+         lda_model, dictionary = train_lda(preprocessed_text)
+         topic_words = extract_topic_words(lda_model)
+
+         articles_data.append({
+             "Title": article["title"],
+             "Summary": summary,
+             "Sentiment": sentiment,
+             "Topics": topic_words
+         })
+
+     with st.spinner("Performing comparative analysis..."):
+         analysis_result = comparative_sentiment_analysis(company, articles_data)
+         st.write("Comparative analysis completed")
+         st.write("Analysis result:", analysis_result)
+
+     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
+
+     with st.spinner("Generating Hindi TTS summary..."):
+         try:
+             audio_file = text_to_speech(final_summary)
+             if os.path.exists(audio_file):
+                 st.write(f"TTS summary generated: {audio_file}")
+             else:
+                 st.error("Failed to generate TTS summary")
+                 audio_file = None
+         except Exception as e:
+             st.error(f"TTS generation failed: {str(e)}")
+             audio_file = None
+
+     return {
+         "Company": company,
+         "Articles": articles_data,
+         "Comparative Sentiment Score": analysis_result,
+         "Audio": audio_file
+     }
+
+ st.title("Company News Analysis")
+ company = st.text_input("Enter the company name for analysis:")
+ if st.button("Analyze") and company:
+     st.write(f"Starting analysis for: {company}")
+     result = analyze_company_news(company)
+     if result:
+         st.subheader(f"Analysis for {result['Company']}")
+
+         for article in result["Articles"]:
+             st.write(f"**Title:** {article['Title']}")
+             st.write(f"**Summary:** {article['Summary']}")
+             st.write(f"**Sentiment:** {article['Sentiment']}")
+             st.write(f"**Topics:** {', '.join(article['Topics'])}")
+             st.markdown("---")
+
+         st.subheader("Comparative Sentiment Score")
+         st.json(result["Comparative Sentiment Score"])
+
+         st.subheader("Hindi TTS Summary")
+         if result["Audio"]:
+             st.audio(result["Audio"], format="audio/mp3")
+         else:
+             st.warning("TTS summary not available")
+
approach_library/main.py ADDED
@@ -0,0 +1,145 @@
+ # import json
+ # import time
+ # from news_extraction import extract_news
+ # from news_summarisation import summarize_text
+ # from news_sentiment import analyze_sentiment
+ # from topic_extraction import preprocess_text, train_lda, extract_topic_words
+ # from comparative_analysis import comparative_sentiment_analysis
+ # from text_to_speech import text_to_speech  # ✅ Import the TTS function
+
+ # def main():
+ #     # User input for the company/topic
+ #     company = input("Enter the company name for analysis: ").strip()
+
+ #     # Extract news articles
+ #     start_time = time.time()
+ #     articles = extract_news(company)
+ #     extraction_time = time.time() - start_time
+
+ #     if not articles:
+ #         print("No news articles found. Try a different company.")
+ #         return
+
+ #     articles_data = []  # List to store processed articles
+
+ #     # Extract texts from articles for sentiment analysis
+ #     texts = [article["text"] for article in articles]
+
+ #     # Perform sentiment analysis
+ #     start_time = time.time()
+ #     sentiment_results = analyze_sentiment(texts)
+ #     sentiment_time = time.time() - start_time
+
+ #     # Process each article
+ #     for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
+ #         start_time = time.time()
+ #         summary = summarize_text(article["text"])  # Summarize article
+ #         summarization_time = time.time() - start_time
+
+ #         # Extract topics for the specific article
+ #         preprocessed_text = preprocess_text([article["text"]])
+ #         lda_model, dictionary = train_lda(preprocessed_text)
+ #         topic_words = extract_topic_words(lda_model)
+
+ #         article_entry = {
+ #             "Title": article["title"],
+ #             "Summary": summary,
+ #             "Sentiment": sentiment,
+ #             "Topics": topic_words
+ #         }
+ #         articles_data.append(article_entry)
+
+ #     # Perform comparative sentiment analysis
+ #     analysis_result = comparative_sentiment_analysis(company, articles_data)
+
+ #     # ✅ Generate a summary speech for the entire report
+ #     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
+ #     audio_file = text_to_speech(final_summary)  # Generate Hindi TTS
+
+ #     # ✅ Construct final JSON output
+ #     output = {
+ #         "Company": company,
+ #         "Articles": articles_data,
+ #         "Comparative Sentiment Score": analysis_result,
+ #         "Final Sentiment Analysis": final_summary,
+ #         "Audio": f"[Play {audio_file}]"  # ✅ Include a playable reference
+ #     }
+
+ #     # Print JSON output
+ #     print(json.dumps(output, indent=4, ensure_ascii=False))
+
+ #     # Save JSON output to file
+ #     with open(f"{company}_news_analysis.json", "w", encoding="utf-8") as json_file:
+ #         json.dump(output, json_file, indent=4, ensure_ascii=False)
+
+ # if __name__ == "__main__":
+ #     main()
+
+ import json
+ import time
+ from utils.news_extraction import extract_news
+ from utils.news_summarisation import summarize_text
+ from utils.news_sentiment import analyze_sentiment
+ from utils.topic_extraction import preprocess_text, train_lda, extract_topic_words
+ from utils.comparative_analysis import comparative_sentiment_analysis
+ from utils.text_to_speech import text_to_speech  # ✅ Import the TTS function
+
+ def analyze_company_news(company):
+     # Extract news articles
+     start_time = time.time()
+     articles = extract_news(company)
+     extraction_time = time.time() - start_time
+
+     if not articles:
+         return {"message": "No news articles found. Try a different company."}
+
+     articles_data = []  # List to store processed articles
+
+     # Extract texts from articles for sentiment analysis
+     texts = [article["text"] for article in articles]
+
+     # Perform sentiment analysis
+     start_time = time.time()
+     sentiment_results = analyze_sentiment(texts)
+     sentiment_time = time.time() - start_time
+
+     # Process each article
+     for i, (article, sentiment) in enumerate(zip(articles, sentiment_results["Predicted Sentiment"]), start=1):
+         start_time = time.time()
+         summary = summarize_text(article["text"])  # Summarize article
+         summarization_time = time.time() - start_time
+
+         # Extract topics for the specific article
+         preprocessed_text = preprocess_text([article["text"]])
+         lda_model, dictionary = train_lda(preprocessed_text)
+         topic_words = extract_topic_words(lda_model)
+
+         article_entry = {
+             "Title": article["title"],
+             "Summary": summary,
+             "Sentiment": sentiment,
+             "Topics": topic_words
+         }
+         articles_data.append(article_entry)
+
+     # Perform comparative sentiment analysis
+     analysis_result = comparative_sentiment_analysis(company, articles_data)
+
+     # ✅ Generate a summary speech for the entire report
+     final_summary = f"{company}’s latest news coverage is mostly {analysis_result['Final Sentiment Analysis']}."
+     audio_file = text_to_speech(final_summary)  # Generate TTS
+
+     # ✅ Construct final JSON output
+     output = {
+         "Company": company,
+         "Articles": articles_data,
+         "Comparative Sentiment Score": analysis_result,
+         "Audio": f"[Play {audio_file}]"  # ✅ Include a playable reference
+     }
+
+     return output
+
+ # if __name__ == "__main__":
+ #     company = input("Enter the company name for analysis: ").strip()
+ #     result = analyze_company_news(company)
+ #     print(json.dumps(result, indent=4, ensure_ascii=False))
approach_library/utils/comparative_analysis.py ADDED
@@ -0,0 +1,90 @@
+
+ import json
+ from collections import Counter
+
+ def comparative_sentiment_analysis(company, articles, max_comparisons=10, chunk_size=5):
+     """
+     Perform a comparative sentiment analysis on multiple articles.
+     """
+     overall_sentiment_counts = Counter()
+     overall_coverage_differences = []
+     all_topics = []
+
+     if not articles:
+         print("No articles found for analysis.")
+         return {
+             "Company": company,
+             "Articles": [],
+             "Comparative Sentiment Score": {
+                 "Sentiment Distribution": {},
+                 "Coverage Differences": [],
+                 "Topic Overlap": {"Common Topics": [], "Unique Topics Per Article": []}
+             },
+             "Final Sentiment Analysis": "No data available."
+         }
+
+     # Process articles in chunks
+     for start in range(0, len(articles), chunk_size):
+         chunk = articles[start:start + chunk_size]
+
+         # Count sentiment distribution (normalise labels so keys match the "Positive"/"Negative" checks below)
+         sentiment_counts = Counter(article["Sentiment"].capitalize() for article in chunk)
+         overall_sentiment_counts.update(sentiment_counts)
+
+         # Identify coverage differences
+         for i in range(len(chunk) - 1):
+             for j in range(i + 1, len(chunk)):
+                 if len(overall_coverage_differences) >= max_comparisons:
+                     break
+                 article1, article2 = chunk[i], chunk[j]
+                 comparison = {
+                     "Comparison": f"'{article1.get('Title', 'Article 1')}' vs '{article2.get('Title', 'Article 2')}'",
+                     "Impact": f"{article1.get('Topics', [])} vs {article2.get('Topics', [])}"
+                 }
+                 overall_coverage_differences.append(comparison)
+
+         # Extract topics, keeping only articles with a valid, non-empty list
+         topics = [set(article.get("Topics", [])) for article in chunk if isinstance(article.get("Topics"), list) and article.get("Topics")]
+         all_topics.extend(topics)
+
+     # Debugging output
+     print("All Topics Extracted:", all_topics)
+
+     # Determine common and unique topics
+     if len(all_topics) == 0:
+         common_topics = set()  # No topics found
+     elif len(all_topics) == 1:
+         common_topics = all_topics[0]  # Only one article, take its topics as common
+     else:
+         common_topics = set.intersection(*all_topics)  # Find the intersection normally
+
+     unique_topics = [{"Article": i + 1, "Unique Topics": list(topics - common_topics)}
+                      for i, topics in enumerate(all_topics)]
+
+     # Convert to list for JSON output
+     common_topics = list(common_topics)
+
+     print("Common Topics:", common_topics)
+
+     # Final sentiment summary
+     final_analysis = "The news coverage is mostly "
+     if overall_sentiment_counts["Positive"] > overall_sentiment_counts["Negative"]:
+         final_analysis += "positive, indicating potential growth."
+     elif overall_sentiment_counts["Negative"] > overall_sentiment_counts["Positive"]:
+         final_analysis += "negative, suggesting challenges ahead."
+     else:
+         final_analysis += "balanced, with mixed reactions."
+
+
+     return {
+         "Comparative Sentiment Score": {
+             "Sentiment Distribution": dict(overall_sentiment_counts),
+             "Coverage Differences": overall_coverage_differences,
+             "Topic Overlap": {
+                 "Common Topics": common_topics,
+                 "Unique Topics Per Article": unique_topics
+             }
+         },
+         "Final Sentiment Analysis": final_analysis
+     }
+
approach_library/utils/news_extraction.py ADDED
@@ -0,0 +1,51 @@
+ import nest_asyncio
+ from typing import List, Dict
+ from duckduckgo_search import DDGS
+ from phi.tools.newspaper4k import Newspaper4k
+ import time
+
+ nest_asyncio.apply()
+
+ def extract_news(article_topic: str, num_search_results: int = 15, max_retries: int = 3) -> List[Dict[str, str]]:
+     """
+     Extracts full news articles based on the given topic and number of search results.
+
+     Args:
+         article_topic: The topic to search for.
+         num_search_results: The number of search results to retrieve.
+         max_retries: The maximum number of retries if an article fails to scrape.
+
+     Returns:
+         A list of dictionaries, where each dictionary represents a news article.
+     """
+     news_results = []
+     ddgs = DDGS()
+     newspaper_tools = Newspaper4k()
+
+     results = ddgs.news(keywords=article_topic, max_results=num_search_results)  # Fetch extra results
+
+     for r in results:
+         if "url" in r:
+             retries = 0
+             while retries < max_retries:
+                 try:
+                     article_data = newspaper_tools.get_article_data(r["url"])
+
+                     if article_data and "text" in article_data and len(article_data["text"]) > 100:
+                         news_results.append({
+                             "title": r.get("title", "No Title"),
+                             "text": article_data["text"]  # Full article text
+                         })
+                         break  # Successful extraction, break retry loop
+                     else:
+                         retries += 1
+                         time.sleep(1)  # Wait before retrying
+                 except Exception as e:
+                     retries += 1
+                     time.sleep(1)
+
+         # Stop if we have collected enough articles
+         if len(news_results) >= num_search_results:
+             break
+
+     return news_results
approach_library/utils/news_sentiment.py ADDED
@@ -0,0 +1,54 @@
+ import torch
+ import scipy.special
+ import pandas as pd
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ # Load the sentiment model and tokenizer (Twitter-RoBERTa base sentiment checkpoint)
+ finbert_ckpt = "cardiffnlp/twitter-roberta-base-sentiment"
+ tokenizer = AutoTokenizer.from_pretrained(finbert_ckpt)
+ model_finbert = AutoModelForSequenceClassification.from_pretrained(finbert_ckpt).to("cuda" if torch.cuda.is_available() else "cpu")
+
+ def analyze_sentiment(text_list):
+     """Performs sentiment analysis on a list of texts using the Twitter-RoBERTa sentiment model."""
+     preds = []
+     preds_proba = []
+
+     tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}
+
+     for text in text_list:
+         with torch.no_grad():
+             # Tokenize the input
+             input_sequence = tokenizer(text, return_tensors="pt", **tokenizer_kwargs).to(model_finbert.device)
+             logits = model_finbert(**input_sequence).logits.cpu().numpy().squeeze()
+
+         # Convert logits to probabilities
+         scores = {
+             k: v for k, v in zip(
+                 model_finbert.config.id2label.values(),
+                 scipy.special.softmax(logits)
+             )
+         }
+
+         # Get the most probable sentiment
+         sentiment = max(scores, key=scores.get)
+         probability = max(scores.values())
+
+         # Map the model's label ids to readable sentiment names
+         if sentiment == 'LABEL_2':
+             sentiment = 'positive'
+         elif sentiment == 'LABEL_0':
+             sentiment = 'negative'
+         else:
+             sentiment = 'neutral'
+
+         preds.append(sentiment)
+         preds_proba.append(probability)
+
+     # Return a DataFrame with results
+     df_results = pd.DataFrame({
+         "Text": text_list,
+         "Predicted Sentiment": preds,
+         "Probability": preds_proba
+     })
+
+     return df_results
approach_library/utils/news_summarisation.py ADDED
@@ -0,0 +1,26 @@
+
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Summarization Device: {device}")
+
+ model_ckpt = "google/pegasus-cnn_dailymail"
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
+
+ def summarize_text(text: str) -> str:
+     input_ids = tokenizer.encode(
+         text,
+         return_tensors="pt",
+         max_length=1024,
+         truncation=True,
+     ).to(device)
+     try:
+         summary_ids = model_pegasus.generate(input_ids, max_length=130, min_length=30, do_sample=False)
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         return summary
+     except RuntimeError as e:
+         print(f"Summarization Error: {e}")
+         return "Error: Could not generate summary due to length constraints."
+
approach_library/utils/text_to_speech.py ADDED
@@ -0,0 +1,20 @@
+ from gtts import gTTS
+ from deep_translator import GoogleTranslator
+
+ def text_to_speech(text):
+     """Translates English text to Hindi and converts it to Hindi speech using gTTS (cloud-based TTS)."""
+
+     # ✅ Translate English to Hindi
+     translated_text = GoogleTranslator(source="en", target="hi").translate(text)
+
+     # ✅ Hindi Voice (Using gTTS)
+     hindi_tts = gTTS(text=translated_text, lang="hi")
+     hindi_file = "output_hindi.mp3"
+     hindi_tts.save(hindi_file)
+
+     return hindi_file
+
+ # if __name__ == "__main__":
+ #     text = input("Enter text: ")
+ #     hindi_file = text_to_speech(text)
+ #     print(f"Hindi audio saved to: {hindi_file}")
approach_library/utils/topic_extraction.py ADDED
@@ -0,0 +1,64 @@
+ from gensim import corpora, models
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ import string
+ import nltk
+
+ # Download necessary NLTK resources
+ nltk.download("stopwords")
+ nltk.download("punkt")
+
+ def preprocess_text(text_data):
+     """
+     Preprocesses text data by tokenizing, removing stopwords, punctuation, and non-alphabetic tokens.
+
+     :param text_data: List of raw text documents
+     :return: List of preprocessed tokenized texts
+     """
+     stop_words = set(stopwords.words("english"))
+     processed_texts = [
+         [
+             word for word in word_tokenize(document.lower())
+             if word not in stop_words and word not in string.punctuation and word.isalpha()
+         ]
+         for document in text_data
+     ]
+     return processed_texts
+
+ def train_lda(texts, num_topics=3):
+     """
+     Trains an LDA model on the given preprocessed text data.
+
+     :param texts: List of tokenized texts
+     :param num_topics: Number of topics for the LDA model
+     :return: Trained LDA model and corresponding dictionary
+     """
+     dictionary = corpora.Dictionary(texts)
+     corpus = [dictionary.doc2bow(text) for text in texts]
+
+     ldamodel = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
+
+     return ldamodel, dictionary
+
+ def extract_topic_words(ldamodel, num_topics=3, num_words=3):
+     """
+     Extracts meaningful words from each topic identified by the LDA model.
+
+     :param ldamodel: Trained LDA model
+     :param num_topics: Number of topics to extract
+     :param num_words: Number of words per topic to consider
+     :return: List of top words representing each topic
+     """
+     topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
+     topic_names = []
+
+     for topic in topics:
+         words = topic[1].split(" + ")
+         for word_data in words:
+             word = word_data.split("*")[1].strip('"')  # Extract word
+             if word.isalpha() and len(word) > 2:  # Ensure it's a meaningful word
+                 topic_names.append(word)
+                 break  # Only take the top valid word
+
+     return list(set(topic_names))  # Ensure unique topics
+
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ nest_asyncio
+ duckduckgo-search
+ newspaper4k
+ transformers
+ phidata
+ lxml_html_clean
+ datasets
+ pandas
+ nltk
+ torch
+ tqdm
+ GoogleNews
+ pygooglenews
+ feedparser
+ googlesearch-python
+ soundfile
+ gtts
+ deep_translator
+ fastapi
+ pydantic
+ uvicorn
+ python-magic
+ streamlit
+ # Required by the utils modules
+ requests
+ beautifulsoup4
+ scipy
+ gensim