Add files

- README.md +63 -14
- api.py +15 -0
- app.py +111 -0
- requirements.txt +34 -0
- utils.py +458 -0
README.md
CHANGED
@@ -1,14 +1,63 @@
# Live Company News Analyzer

**A project by Sara Nimje - [Visit Portfolio Website](https://saranimje.github.io/)**

This application fetches live news articles for a given company, analyzes their sentiment, summarizes the content, and converts the summaries into Hindi audio.

## Objective:

I have developed a web-based application that extracts key details from multiple news articles related to a given company. The application performs sentiment analysis, conducts a comparative analysis across articles, and generates text-to-speech (TTS) output in Hindi. Users can input a company name and receive a structured sentiment report along with an audio summary, making the information more accessible and insightful.

# Project Setup

## Installation:

- Clone this repository:
  `git clone https://github.com/saranimje/news-summarizer.git`
- Navigate to the project directory:
  `cd news-summarizer`
- Install dependencies:
  `pip install -r requirements.txt`
- Run the Gradio app:
  `python app.py`
- Run the API (optional):
  `uvicorn api:app --reload`

# Model Details

## Summarization Model

- Uses `transformers` from Hugging Face.
- Model: `google/long-t5-tglobal-base` (a minimal usage sketch is shown below).
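
A condensed sketch of how the summarizer is invoked, mirroring the call in `utils.py` (the length limits and the 50-word cutoff are the values used there; error handling is omitted here):

```python
from transformers import pipeline

# load the long-input T5 summarizer used by this project
summarizer = pipeline("summarization", model="google/long-t5-tglobal-base")

def generate_summary(text: str) -> str:
    # very short texts are returned unchanged; longer ones are summarized
    if len(text.split()) > 50:
        return summarizer(text, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
    return text
```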

## Sentiment Analysis

Uses the default `sentiment-analysis` pipeline from Hugging Face, with a confidence-based neutral zone (see the sketch below).
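
A condensed sketch of the sentiment step, following `analyze_sentiment` in `utils.py` (the 0.7 confidence threshold is the value used there):

```python
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")

def analyze_sentiment(text: str) -> str:
    # empty input or low-confidence predictions are treated as Neutral
    if not text.strip():
        return "Neutral"
    result = sentiment_pipeline(text)[0]
    if result["score"] < 0.7:
        return "Neutral"
    return f"{result['label'].capitalize()} ({round(result['score'], 2)})"
```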

## Topic Modelling

- Uses TF-IDF vectorization with NMF (Non-Negative Matrix Factorization) to extract key topics from news articles.
- Uses cosine similarity to measure relationships between articles (a condensed sketch follows this list).
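
A condensed sketch of the topic step, following the approach in `utils.py` (the `texts` list below is placeholder data; the app itself uses the combined article titles and summaries, five topics, and a custom stop-word list):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

# placeholder article texts
texts = [
    "Tesla reports record quarterly deliveries and rising revenue",
    "Tesla faces regulatory scrutiny over its driver-assistance software",
    "Analysts debate the company valuation after the latest earnings call",
]

vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
tfidf = vectorizer.fit_transform(texts)

nmf = NMF(n_components=2, random_state=42)  # utils.py uses 5 topics
W = nmf.fit_transform(tfidf)                # article-topic weights
H = nmf.components_                         # topic-term weights

# top 5 terms per topic
feature_names = vectorizer.get_feature_names_out()
topics = [", ".join(feature_names[i] for i in topic.argsort()[-5:][::-1]) for topic in H]

similarity_matrix = cosine_similarity(W)    # article-to-article similarity
print(topics)
print(similarity_matrix)
```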

## Text-to-Speech

Uses `gTTS` (Google Text-to-Speech).

## Translation

Uses `GoogleTranslator` from `deep_translator` (source: English, target: Hindi). A combined translation-and-audio sketch is shown below.
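
A minimal sketch of the translation and audio step, as in `utils.py` (both calls need an internet connection; the English text here is a placeholder):

```python
from deep_translator import GoogleTranslator
from gtts import gTTS

english_summary = "Tesla's latest news coverage is mostly positive."  # placeholder text

# translate English -> Hindi
hindi_summary = GoogleTranslator(source="en", target="hi").translate(english_summary)

# render the Hindi text to speech and save it as an MP3 file
gTTS(text=hindi_summary, lang="hi", slow=False).save("hindi_news.mp3")
```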

# API Development

This project includes a **FastAPI-based API** to fetch news articles and analyze them.

## **Endpoints:**

**1. Home**

- `GET /`
- Returns: `{"message": "News Summarization API is running!"}`

**2. Fetch News**

- `GET /news/?company_name=Tesla&article_number=5`
- Returns JSON output containing news articles and analysis.

## Using Postman or Curl:

1. Open **Postman** or any API testing tool.
2. Send a `GET` request to:
   `http://127.0.0.1:8000/news/?company_name=Tesla&article_number=5`
3. View the JSON response with news articles and summaries.
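
Or from Python with `requests` (assuming the API is running locally on port 8000 as started above):

```python
import requests

response = requests.get(
    "http://127.0.0.1:8000/news/",
    params={"company_name": "Tesla", "article_number": 5},
)
print(response.json())
```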

## Third-Party API Usage

- **News Sources**: Google Search (`googlesearch` Python module).
- **Libraries Used**:
  - `requests` for HTTP requests.
  - `gensim`, `deep_translator`, `nltk` for text processing.
  - `googlesearch` to fetch news links.
  - `feedparser` for RSS feeds.

api.py
ADDED
@@ -0,0 +1,15 @@
from fastapi import FastAPI
from utils import fetch_news_data

app = FastAPI()

@app.get("/")
def home():
    return {"message": "News Summarization API is running!"}

@app.get("/news/")
def get_news(company_name: str, article_number: int):
    # fetch_news_data is a generator that yields partial results (DataFrames,
    # file paths, and the structured summary); collect the JSON-serialisable
    # structured summary instead of returning the generator object itself
    summary = {}
    for result in fetch_news_data(company_name, article_number):
        if "json_summary" in result:
            summary = result["json_summary"]
    return {"news": summary}

# run locally with: uvicorn api:app --reload
app.py
ADDED
@@ -0,0 +1,111 @@
import gradio as gr
import pandas as pd
from utils import fetch_news_data


def gradio_interface(company_name, article_number):
    news_df_output = pd.DataFrame(columns=["Title", "Source"])
    json_summary = {}
    english_news_list = []
    hindi_news_list = []
    # hindi_news_text = None
    hindi_news_audio = None
    pie_chart = None
    bar_chart = None

    for result in fetch_news_data(company_name, int(article_number)):
        news_df_output = result.get("news_df_output", news_df_output)
        json_summary = result.get("json_summary", json_summary)
        english_news_list = result.get("english_news_list", english_news_list)
        hindi_news_list = result.get("hindi_news_list", hindi_news_list)
        # hindi_news_text = result.get("hindi_news_text", hindi_news_text)
        hindi_news_audio = result.get("hindi_news_audio", hindi_news_audio)
        pie_chart = result.get("pie_chart", pie_chart)
        bar_chart = result.get("bar_chart", bar_chart)

        # yield after each partial result so the interface updates progressively
        yield news_df_output, json_summary, english_news_list, hindi_news_list, hindi_news_audio, pie_chart, bar_chart

with gr.Blocks(css=".btn-green { background-color: #2E7D32 !important; color: white !important; }") as interface:
    gr.Markdown("# Live Company News Analyzer")
    gr.Markdown("## A Project by Sara Nimje")
    gr.Markdown("Enter a company name to fetch news, sentiment analysis, and more.")

    with gr.Row():
        company_name_input = gr.Textbox(label="Company Name", placeholder="Enter company name")
        article_number_input = gr.Textbox(label="Number of Articles", placeholder="Enter number")

    with gr.Row():
        submit_btn = gr.Button("Submit", elem_classes=["btn-green"])
        clear_btn = gr.Button("Clear")

    with gr.Row():
        news_df_output = gr.Dataframe(label="News Articles", interactive=False)

    with gr.Row():
        json_summary_output = gr.JSON(label="JSON Summary")

    with gr.Row():
        english_news_output = gr.List(label="English News List")
        hindi_news_output = gr.List(label="Hindi News List")

    with gr.Row():
        # hindi_news_text_output = gr.Textbox(label="Hindi News Text", interactive=False)
        hindi_news_audio_output = gr.Audio(label="Hindi News Audio")

    with gr.Row():
        pie_chart_output = gr.Image(label="Sentiment Pie Chart")
        bar_chart_output = gr.Image(label="Sentiment Bar Chart")

    submit_event = submit_btn.click(
        gradio_interface,
        inputs=[company_name_input, article_number_input],
        outputs=[
            news_df_output,
            json_summary_output,
            english_news_output,
            hindi_news_output,
            hindi_news_audio_output,
            pie_chart_output,
            bar_chart_output
        ]
    )

    company_name_input.submit(fn=gradio_interface, inputs=[company_name_input, article_number_input], outputs=[
        news_df_output,
        json_summary_output,
        english_news_output,
        hindi_news_output,
        hindi_news_audio_output,
        pie_chart_output,
        bar_chart_output
    ])

    article_number_input.submit(fn=gradio_interface, inputs=[company_name_input, article_number_input], outputs=[
        news_df_output,
        json_summary_output,
        english_news_output,
        hindi_news_output,
        hindi_news_audio_output,
        pie_chart_output,
        bar_chart_output
    ])

    clear_btn.click(
        # one return value per output component (nine in total)
        lambda: ("", "", pd.DataFrame(), {}, None, None, None, None, None),
        inputs=[],
        outputs=[
            company_name_input,
            article_number_input,
            news_df_output,
            json_summary_output,
            english_news_output,
            hindi_news_output,
            hindi_news_audio_output,
            pie_chart_output,
            bar_chart_output
        ]
    )

# launch app
if __name__ == "__main__":
    interface.launch()
requirements.txt
ADDED
@@ -0,0 +1,34 @@
# Core Dependencies
transformers==4.49.0
torch==2.6.0+cu124
nltk==3.9.1
feedparser==6.0.11
googlesearch-python==1.3.0
scikit-learn==1.6.1
gensim==4.3.3
pandas==2.2.2
numpy>=1.23.2
deep-translator==1.11.4
gtts==2.5.4

# Web Scraping & HTTP Requests
requests
httpx
beautifulsoup4

# Natural Language Processing
spacy

# Data Visualization
seaborn
matplotlib

# Utility & Performance Optimization
tqdm

# Interface
gradio

# API
fastapi
uvicorn
utils.py
ADDED
@@ -0,0 +1,458 @@
# ==========================
# Data Handling & Storage
# ==========================
import json
import ast
import pandas as pd
import numpy as np

# ==========================
# Web Scraping & Data Retrieval
# ==========================
import requests
import httpx
import feedparser
import concurrent.futures
from bs4 import BeautifulSoup
from googlesearch import search
from urllib.parse import urlparse

# ==========================
# Natural Language Processing (NLP)
# ==========================
import nltk
import spacy
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from transformers import pipeline
from deep_translator import GoogleTranslator
from gtts import gTTS  # Text-to-speech

# ==========================
# Machine Learning & Text Analysis
# ==========================
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import RandomizedSearchCV

# ==========================
# Data Visualization
# ==========================
import matplotlib.pyplot as plt
import seaborn as sns

# ==========================
# Utility & Performance Optimization
# ==========================
import re
import os
import io
from collections import Counter
from tqdm import tqdm  # progress bar


def fetch_news_data(company_name: str, article_number: int):
    excluded_domains = ["youtube.com", "en.wikipedia.org", "m.economictimes.com", "www.prnewswire.com", "economictimes.indiatimes.com", "www.moneycontrol.com"]

    def is_valid_news_article(url, company_name):
        try:
            domain = urlparse(url).netloc  # extracts the domain
            if company_name.lower() in domain.lower() or any(excluded_domain in domain for excluded_domain in excluded_domains):
                return False
            return True
        except Exception:
            return False  # handle unexpected errors

    def get_top_articles(company_name, article_number):
        query = f"{company_name} latest news article"
        valid_urls = []

        for url in search(query, num_results=article_number*2):
            if is_valid_news_article(url, company_name):
                valid_urls.append(url)
            if len(valid_urls) > article_number+1:
                break

        return valid_urls

    def extract_article_data(url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }

        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # handle HTTP errors

            soup = BeautifulSoup(response.content, 'html.parser')

            # extract title
            title = soup.title.string.strip() if soup.title else None
            source = url.split('/')[2]  # Extract domain

            # validate data
            if not title:
                return None

            return {"title": title, "link": url, "source": source}

        except (requests.exceptions.RequestException, AttributeError):
            return None  # skip articles with invalid data

    def main(company_name, article_number):
        urls = get_top_articles(company_name, article_number)
        # extract and validate article data
        articles_data = [extract_article_data(url) for url in urls]
        articles_data = [article for article in articles_data if article]  # remove None values

        # create DataFrame only if valid articles exist
        if articles_data:
            df = pd.DataFrame(articles_data)
        else:
            df = pd.DataFrame(columns=["title", "link"])  # empty DataFrame if nothing was found

        return df

    df = main(company_name, article_number+1)
    news_df_output = df[["title", "source"]].rename(columns={"title": "Headline", "source": "Source"})
    news_df_output["Source"] = news_df_output["Source"].str.replace(r"^www\.", "", regex=True).str.split('.').str[0]

    yield {"news_df_output": news_df_output}

    def get_article_text(url):
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")

            # remove unwanted elements
            for unwanted in soup.select("nav, aside, footer, header, .ad, .advertisement, .promo, .sidebar, .related-articles"):
                unwanted.extract()

            # try extracting from known article containers
            article_body = soup.find(['article', 'div', 'section'], class_=['article-body', 'post-body', 'entry-content', 'main-content'])

            if article_body:
                paragraphs = article_body.find_all('p')
                article_text = " ".join([p.get_text() for p in paragraphs]).strip()
                return article_text if article_text else None  # return None if empty

            # fallback to all <p> tags
            paragraphs = soup.find_all('p')
            article_text = " ".join([p.get_text() for p in paragraphs]).strip()

            return article_text if article_text else None  # return None if empty

        except Exception:
            return None  # return None in case of an error

    df['article_text'] = df['link'].apply(get_article_text)

    df = df.reset_index(drop=True)

    block_patterns = [
        # Error messages (with variations)
        r'Oops[!,\.]? something went wrong',
        r'An error has occurred',
        r'This content is not available',
        r'Please enable JavaScript to continue',
        r'Error loading content',
        r'Follow Us',

        # JavaScript patterns
        r'var .*?;',
        r'alert\(.*?\)',
        r'console\.log\(.*?\)',
        r'<script.*?</script>',
        r'<noscript>.*?</noscript>',
        r'<style.*?</style>',

        # Loading or restricted content messages
        r'Loading[\.]*',
        r'You must be logged in to view this content',
        r'This content is restricted',
        r'Access denied',
        r'Please disable your ad blocker',

        # GDPR and cookie consent banners
        r'This site uses cookies',
        r'We use cookies to improve your experience',
        r'By using this site, you agree to our use of cookies',
        r'Accept Cookies',

        # Stories or content teasers with any number
        r'\d+\s*Stories',

        # Miscellaneous
        r'<iframe.*?</iframe>',
        r'<meta.*?>',
        r'<link.*?>',
        r'Refresh the page and try again',
        r'Click here if the page does not load',
        r'© [0-9]{4}.*? All rights reserved',
        r'Unauthorized access',
        r'Terms of Service',
        r'Privacy Policy',
        r'<.*?>',
    ]

    pattern = '|'.join(block_patterns)
    df['article_text'] = df['article_text'].str.replace(pattern, '', regex=True).str.strip()
    df['article_text'] = df['article_text'].str.replace(r'\s+', ' ', regex=True).str.strip()

    custom_stop_words = set(ENGLISH_STOP_WORDS.union({company_name.lower(), 'company', 'ttm', 'rs'}))

    # add numeric values (integer, decimal, comma-separated, monetary)
    numeric_patterns = re.compile(r'\b\d+(?:[\.,]\d+)?(?:,\d+)*\b|\$\d+(?:[\.,]\d+)?')
    numeric_matches = set(re.findall(numeric_patterns, ' '.join(df['article_text'])))
    custom_stop_words.update(numeric_matches)

    # remove unwanted unicode characters (like \u2018, \u2019, etc.)
    unicode_patterns = re.compile(r'[\u2018\u2019\u2020\u2021\u2014]')  # Add more if needed
    df['article_text'] = df['article_text'].apply(lambda x: unicode_patterns.sub('', x))

    custom_stop_words = list(custom_stop_words)

    summarizer = pipeline("summarization", model="google/long-t5-tglobal-base")

    def generate_summary(text):
        try:
            if len(text.split()) > 50:  # skip very short texts
                summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
                return summary
            else:
                return text
        except Exception as e:
            print(f"Error processing text: {e}")
            return None

    # apply summarization to the 'article_text' column
    df['summary'] = df['article_text'].apply(generate_summary)

    # load a pre-trained BERT-based sentiment model from Hugging Face
    sentiment_pipeline = pipeline("sentiment-analysis")

    def analyze_sentiment(text):
        """Analyze sentiment with a confidence-based neutral zone."""
        if not text.strip():
            return "Neutral"

        try:
            result = sentiment_pipeline(text)[0]
            sentiment_label = result["label"]
            confidence = round(result["score"], 2)

            if confidence < 0.7:
                return "Neutral"
            return f"{sentiment_label.capitalize()} ({confidence})"
        except Exception:
            return "Error in sentiment analysis."

    # apply sentiment analysis on the summary column
    df['sentiment'] = df['summary'].apply(analyze_sentiment)

    df['sentiment_label'] = df['sentiment'].str.extract(r'(Positive|Negative|Neutral)')

    sentiment_bars = plt.figure(figsize=(7, 7))
    sns.countplot(x=df['sentiment_label'], palette={'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'})
    plt.title("Sentiment Analysis of Articles")
    plt.xlabel("Sentiment")
    plt.ylabel("Count")

    # save the figure as an image file to use in the gradio interface
    sentiment_bars_file = "sentiment_bars.png"
    sentiment_bars.savefig(sentiment_bars_file)
    plt.close(sentiment_bars)

    sentiment_counts = df['sentiment_label'].value_counts()

    colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'}

    sentiment_pie = plt.figure(figsize=(7, 7))
    plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=[colors[label] for label in sentiment_counts.index])
    plt.title("Sentiment Distribution of Articles")

    sentiment_pie_file = "sentiment_pie.png"
    sentiment_pie.savefig(sentiment_pie_file)
    plt.close(sentiment_pie)

    df['combined_text'] = df['title'] + ' ' + df['summary']  # combine text for analysis

    vectorizer = TfidfVectorizer(max_features=1000, stop_words=custom_stop_words)
    tfidf = vectorizer.fit_transform(df['combined_text'])

    n_topics = 5  # number of topics
    nmf = NMF(n_components=n_topics, random_state=42)
    W = nmf.fit_transform(tfidf)
    H = nmf.components_

    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(H):
        top_words = [feature_names[i] for i in topic.argsort()[-5:]][::-1]  # 5 words per topic
        topics.append(", ".join(top_words))

    def get_top_topics(row):
        topic_indices = W[row].argsort()[-3:][::-1]  # get top 3 topics
        return [topics[i] for i in topic_indices]

    df['top_topics'] = [get_top_topics(i) for i in range(len(df))]
    df['dominant_topic'] = W.argmax(axis=1)
    df['topic_distribution'] = W.tolist()
    similarity_matrix = cosine_similarity(W)

    df['similarity_scores'] = similarity_matrix.mean(axis=1)
    df['most_similar_article'] = similarity_matrix.argsort(axis=1)[:, -2]  # second highest value
    df['least_similar_article'] = similarity_matrix.argsort(axis=1)[:, 0]  # lowest value

    similarity_heatmap = plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_matrix, annot=True, fmt=".2f", cmap="coolwarm", xticklabels=False, yticklabels=False)
    plt.title("Comparative Analysis of News Coverage Across Articles")

    comparisons = []
    for i in range(len(df)):
        # find most similar and least similar articles
        similar_idx = similarity_matrix[i].argsort()[-2]  # most similar (excluding itself)
        least_similar_idx = similarity_matrix[i].argsort()[0]  # least similar

        # build comparison text
        comparison = {
            "Most Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', similar to Article {similar_idx + 1} which also discusses '{topics[df['dominant_topic'][similar_idx]]}'.",
            "Least Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', contrasting with Article {least_similar_idx + 1} which discusses '{topics[df['dominant_topic'][least_similar_idx]]}'."
        }
        comparisons.append(comparison)

    df['coverage_comparison'] = comparisons

    # find common and unique topics
    all_topics = df['dominant_topic'].tolist()
    topic_counter = Counter(all_topics)
    common_topics = [topics[i] for i, count in topic_counter.items() if count > 1]
    unique_topics = [topics[i] for i, count in topic_counter.items() if count == 1]

    topic_overlap = {
        "Common Topics": common_topics,
        "Unique Topics": unique_topics
    }

    sentiment_counts = df['sentiment_label'].value_counts()
    if sentiment_counts.get('Positive', 0) > sentiment_counts.get('Negative', 0):
        sentiment = "Overall sentiment is positive."
    elif sentiment_counts.get('Negative', 0) > sentiment_counts.get('Positive', 0):
        sentiment = "Overall sentiment is negative."
    else:
        sentiment = "Overall sentiment is mixed."

    def extract_relevant_topics(topics):
        if isinstance(topics, str):
            topics = ast.literal_eval(topics)  # convert string to list if needed

        if len(topics) <= 2:
            return topics

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(topics)
        similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # sum similarity scores for each topic
        topic_scores = similarity_matrix.sum(axis=1)

        # get top 2 highest scoring topics
        top_indices = topic_scores.argsort()[-2:][::-1]
        top_topics = [topics[i] for i in top_indices]

        return top_topics

    # ensure 'top_topics' is a list
    df['top_topics'] = df['top_topics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # convert lists to sets for easy comparison
    df['top_topics_set'] = df['top_topics'].apply(lambda x: set(x) if isinstance(x, list) else set())

    # find common topics across all articles
    if len(df) > 1:
        common_topics = set.intersection(*df['top_topics_set'])
    else:
        common_topics = set()  # no common topics if only one article

    # extract unique topics by removing common ones
    df['unique_topics'] = df['top_topics_set'].apply(lambda x: list(x - common_topics) if x else [])

    # drop the temporary 'top_topics_set' column
    df.drop(columns=['top_topics_set'], inplace=True)

    coverage_differences = []
    for _, row in df.iterrows():
        if row['most_similar_article'] in df.index and row['least_similar_article'] in df.index:
            most_similar = df.loc[row['most_similar_article']]
            least_similar = df.loc[row['least_similar_article']]

            # extract most relevant topics
            most_relevant_topics = extract_relevant_topics(row['top_topics'])
            least_relevant_topics = extract_relevant_topics(least_similar['top_topics'])

            if most_relevant_topics and least_relevant_topics:
                comparison = {
                    "Comparison": f"{row['title']} highlights {', '.join(row['top_topics'])}, while {most_similar['title']} discusses {', '.join(most_similar['top_topics'])}.",
                    "Impact": f"The article emphasizes {most_relevant_topics[0]} and {most_relevant_topics[1]}, contrasting with {least_relevant_topics[0]} and {least_relevant_topics[1]} in the least similar article."
                }
                coverage_differences.append(comparison)

    structured_summary = {
        "Company": company_name,
        "Articles": [
            {
                "Title": row['title'],
                "Summary": row['summary'],
                "Sentiment": row['sentiment'],
                "Topics": row['top_topics'],
                "Unique Topics": row['unique_topics']
            }
            for _, row in df.iterrows()
        ],
        "Comparative Sentiment Score": {
            "Sentiment Distribution": df['sentiment'].value_counts().to_dict(),
        },
        "Topic Overlap": {
            "Common Topics": list(common_topics) if common_topics else ["No common topics found"],
            "Unique Topics": [
                {"Title": row['title'], "Unique Topics": row['unique_topics']}
                for _, row in df.iterrows()
            ]
        },
        "Final Sentiment Analysis": f"{company_name}’s latest news coverage is mostly {df['sentiment'].mode()[0].lower()}. Potential market impact expected."
    }

    yield {"json_summary": structured_summary}

    english_news = [f"Name of Company: {company_name}"]

    for i, row in df.iterrows():
        article_entry = f"Article {i + 1}: "
        article_entry += f"{row['title']}; "
        article_entry += f"Summary: {row['summary']} This article has a {row['sentiment_label'].lower()} sentiment."
        english_news.append(article_entry)
    yield {"english_news_list": english_news}

    translator = GoogleTranslator(source='en', target='hi')  # 'hi' = Hindi

    translated_news = []
    for text in tqdm(english_news, desc="Translating"):
        translated_news.append(translator.translate(text))
    yield {"hindi_news_list": translated_news}

    hindi_news = '; '.join(translated_news)
    # yield {"hindi_news_text": hindi_news}

    def text_to_speech(text, language='hi'):
        tts = gTTS(text=text, lang=language, slow=False)
        filename = "hindi_news.mp3"  # save file to path
        tts.save(filename)
        return filename

    print(df)
    news_audio = text_to_speech(hindi_news)
    yield {"hindi_news_audio": news_audio}

    yield {"bar_chart": sentiment_bars_file}

    yield {"pie_chart": sentiment_pie_file}