Update utils.py
utils.py
CHANGED
@@ -1,21 +1,20 @@
 import requests
 from bs4 import BeautifulSoup
-from
-from sumy.nlp.tokenizers import Tokenizer
-from sumy.summarizers.lsa import LsaSummarizer
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 from deep_translator import GoogleTranslator
+from scipy.special import softmax
 import gtts
 import os
 
-
-
-
-
-MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
-tokenizer = AutoTokenizer.from_pretrained(MODEL)
-model = AutoModelForSequenceClassification.from_pretrained(MODEL)
+# Initialize HuggingFace sentiment model
+MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+sentiment_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 
+# Initialize summarizer
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
+# News API details
 NEWS_API_KEY = "7e72763bebb54fd79cb632390738cbb1"
 NEWS_API_URL = "https://newsapi.org/v2/everything"
 
@@ -45,55 +44,43 @@ def fetch_news(company):
         })
     return articles
 
-# Function to scrape full article text
+# Function to scrape full article text from a URL
 def scrape_article_text(url):
     try:
         headers = {"User-Agent": "Mozilla/5.0"}
-        response = requests.get(url, headers=headers)
+        response = requests.get(url, headers=headers, timeout=10)
         soup = BeautifulSoup(response.text, "html.parser")
         paragraphs = soup.find_all("p")
-        full_text = " ".join(
+        full_text = " ".join(p.text for p in paragraphs)
         return full_text
     except Exception:
         return ""
 
-# Function to summarize text
+# Function to summarize text using BART model
 def summarize_text(text, sentences_count=3):
-
-
-
-
-
-
+    if not text.strip():
+        return "No summary available."
+    # Truncate long text to avoid input limit issues
+    text = text[:1024]
+    summary = summarizer(text, max_length=130, min_length=30, do_sample=False)
+    return summary[0]['summary_text']
 
+# Function to analyze sentiment
 def analyze_sentiment(text):
-    # Tokenize input text
     encoded_text = tokenizer(text, return_tensors='pt')
-
-    output = model(**encoded_text)
-    # Extract raw logits and apply softmax
+    output = sentiment_model(**encoded_text)
     scores = output.logits[0].detach().numpy()
     scores = softmax(scores)
 
-
-    scores_dict = {
-        'negative': scores[0],
-        'neutral': scores[1],
-        'positive': scores[2]
-    }
-
-    # Determine sentiment based on the highest score
-    sentiment = max(scores_dict, key=scores_dict.get)
-
-    # Capitalize the first letter
-    return sentiment.capitalize()
-
 
-#
+    sentiment_labels = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
+    sentiment = sentiment_labels[scores.argmax()]
+    return sentiment
+
+# Translate text to Hindi
 def translate_to_hindi(text):
     return GoogleTranslator(source='en', target='hi').translate(text)
 
-#
+# Convert headline to speech in Hindi
 def text_to_speech(text, filename="news_headline.mp3"):
     if not text.strip():
         return None