Deepakraj2006 committed
Commit bf539d4 · verified · 1 Parent(s): 49a053d

Update app.py

Files changed (1): app.py +50 -16
app.py CHANGED
@@ -1,4 +1,4 @@
-import os
+import os
 from dotenv import load_dotenv
 load_dotenv()
 
@@ -15,31 +15,34 @@ from transformers import (
     MarianMTModel, MarianTokenizer,
     BarkModel, AutoProcessor
 )
+import librosa
+import re
 
 # -------------------------
 # Global Setup and Environment Variables
 # -------------------------
 NEWS_API_KEY = os.getenv("NEWS_API_KEY")  # Set this in your .env file
-
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
+device = "cpu"  # Force CPU since no GPU is available in Hugging Face Spaces
 
 # -------------------------
 # News Extraction Functions
 # -------------------------
-def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles.xlsx'):
+def fetch_and_scrape_news(company, api_key, count=5, output_file='news_articles.xlsx'):
+    print("Starting news fetch from NewsAPI...")
     newsapi = NewsApiClient(api_key=api_key)
     all_articles = newsapi.get_everything(q=company, language='en', sort_by='relevancy', page_size=count)
     articles = all_articles.get('articles', [])
 
     scraped_data = []
-    for article in articles:
+    print(f"Found {len(articles)} articles. Starting scraping individual articles...")
+    for i, article in enumerate(articles):
         url = article.get('url')
         if url:
+            print(f"Scraping article {i+1}: {url}")
             scraped_article = scrape_news(url)
             if scraped_article:
                 scraped_article['url'] = url
                 scraped_data.append(scraped_article)
-
     df = pd.DataFrame(scraped_data)
     df.to_excel(output_file, index=False, header=True)
     print(f"News scraping complete. Data saved to {output_file}")
@@ -47,9 +50,11 @@ def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles
 
 def scrape_news(url):
     headers = {"User-Agent": "Mozilla/5.0"}
-    response = requests.get(url, headers=headers)
-    if response.status_code != 200:
-        print(f"Failed to fetch the page: {url}")
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+    except Exception as e:
+        print(f"Failed to fetch the page: {url} ({e})")
         return None
     soup = BeautifulSoup(response.text, "html.parser")
     headline = soup.find("h1").get_text(strip=True) if soup.find("h1") else "No headline found"
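
Note: the new try/raise_for_status pattern folds connection errors, timeouts, and 4xx/5xx statuses into a single failure path. A self-contained sketch of the same pattern (fetch_html is a hypothetical helper, not in app.py):

import requests

def fetch_html(url, timeout=10):
    # raise_for_status() turns 4xx/5xx responses into requests.HTTPError;
    # catching requests.RequestException also covers timeouts and connection errors.
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Fetch failed for {url}: {e}")
        return None
    return resp.text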
@@ -60,19 +65,20 @@ def scrape_news(url):
 # -------------------------
 # Sentiment Analysis Setup
 # -------------------------
+print("Loading sentiment analysis model...")
 sentiment_model_name = "cross-encoder/nli-distilroberta-base"
 sentiment_model = AutoModelForSequenceClassification.from_pretrained(
     sentiment_model_name,
-    torch_dtype=torch.float16,
-    device_map="auto"
+    torch_dtype=torch.float32
 )
 sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
-classifier = pipeline("zero-shot-classification", model=sentiment_model, tokenizer=sentiment_tokenizer)
+classifier = pipeline("zero-shot-classification", model=sentiment_model, tokenizer=sentiment_tokenizer, device=-1)
 labels = ["positive", "negative", "neutral"]
 
 # -------------------------
 # Summarization Setup
 # -------------------------
+print("Loading summarization model (BART)...")
 bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
 bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
 
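Note: with device=-1 the zero-shot pipeline is pinned to CPU, matching the float32 weights. A minimal sanity check (hypothetical input, not in the commit); the pipeline returns candidate labels sorted by score, so labels[0] is the predicted sentiment:

result = classifier("Shares rallied after the company beat earnings estimates.", labels)
print(result["labels"][0], round(result["scores"][0], 3))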
@@ -97,6 +103,7 @@ def split_into_chunks(text, tokenizer, max_tokens=1024):
 # -------------------------
 # Translation Setup (English to Hindi)
 # -------------------------
+print("Loading translation model (MarianMT)...")
 translation_model_name = 'Helsinki-NLP/opus-mt-en-hi'
 trans_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
 trans_model = MarianMTModel.from_pretrained(translation_model_name)
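
Note: translate_text itself is unchanged by this commit (its body sits between this hunk and the next). The standard MarianMT call it presumably wraps looks like this sketch:

batch = trans_tokenizer(["The market closed higher today."], return_tensors="pt", truncation=True)
generated = trans_model.generate(**batch)
print(trans_tokenizer.decode(generated[0], skip_special_tokens=True))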
@@ -109,14 +116,29 @@ def translate_text(text):
 # -------------------------
 # Bark TTS Setup (Hindi)
 # -------------------------
-bark_model = BarkModel.from_pretrained("suno/bark-small").to(device)
+print("Loading Bark TTS model...")
+bark_model = BarkModel.from_pretrained("suno/bark-small")
+bark_model.to(device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 
+# -------------------------
+# Helper Functions for Audio and Text Preprocessing
+# -------------------------
+def normalize_text(text):
+    return re.sub(r"[^\w\s]", "", text.lower()).strip()
+
+def resample_audio(audio_array, orig_sr, target_sr=16000):
+    if orig_sr != target_sr:
+        audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=target_sr)
+    return audio_array
+
 # -------------------------
 # Main Pipeline Function
 # -------------------------
 def process_company(company):
+    print(f"Processing company: {company}")
     # Step 1: Fetch and scrape news
+    print("Fetching and scraping news...")
     fetch_and_scrape_news(company, NEWS_API_KEY)
     df = pd.read_excel('news_articles.xlsx')
     print("Scraped Articles:")
@@ -124,17 +146,20 @@ def process_company(company):
 
     articles_data = []
     for index, row in df.iterrows():
+        print(f"Processing article {index+1}...")
         article_text = row.get("content", "")
         title = row.get("headline", "No title")
         url = row.get("url", "")
         chunks = split_into_chunks(article_text, bart_tokenizer)
         chunk_summaries = []
-        for chunk in chunks:
+        for i, chunk in enumerate(chunks):
+            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
             inputs = bart_tokenizer([chunk], max_length=1024, return_tensors='pt', truncation=True)
-            summary_ids = bart_model.generate(inputs.input_ids, num_beams=4, max_length=130, min_length=30, early_stopping=True)
+            summary_ids = bart_model.generate(inputs.input_ids, num_beams=2, max_length=130, min_length=30, early_stopping=True)
             chunk_summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
             chunk_summaries.append(chunk_summary)
         final_summary = ' '.join(chunk_summaries)
+        print("Performing sentiment analysis...")
         sentiment_result = classifier(final_summary, labels)
         sentiment = sentiment_result["labels"][0]
 
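Note: split_into_chunks is defined elsewhere in app.py and untouched by this commit. A common token-window implementation looks roughly like this (illustrative only, not necessarily the author's version):

def split_into_chunks_sketch(text, tokenizer, max_tokens=1024):
    ids = tokenizer.encode(text, add_special_tokens=False)
    return [tokenizer.decode(ids[i:i + max_tokens]) for i in range(0, len(ids), max_tokens)]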
@@ -150,14 +175,19 @@ def process_company(company):
     for article in articles_data:
         key = article["Sentiment"].capitalize()
         sentiment_distribution[key] += 1
+    print("Sentiment distribution computed.")
 
     # Step 2: Translate summaries and generate Hindi speech
+    print("Translating summaries to Hindi...")
     translated_summaries = [translate_text(article["Summary"]) for article in articles_data]
     final_translated_text = "\n\n".join(translated_summaries)
-    inputs = processor(final_translated_text, return_tensors="pt").to(device)
+
+    print("Generating Hindi speech with Bark TTS...")
+    inputs = processor(final_translated_text, return_tensors="pt")
     speech_output = bark_model.generate(**inputs)
     audio_path = "final_summary.wav"
     sf.write(audio_path, speech_output[0].cpu().numpy(), bark_model.generation_config.sample_rate)
+    print("Audio generated and saved.")
 
     # Build final report
     report = {
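
Note: a quick hypothetical check that the generated WAV is readable and carries Bark's native sample rate (librosa is now imported at the top of the file):

wav, sr = librosa.load("final_summary.wav", sr=None)  # sr=None keeps the file's own rate
print(sr, wav.shape)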
@@ -171,10 +201,14 @@ def process_company(company):
         "Final Sentiment Analysis": "Overall sentiment analysis not fully computed",
         "Audio": audio_path
     }
+    print("Final report prepared.")
     return report, audio_path
 
+# -------------------------
 # Gradio Interface Function
+# -------------------------
 def gradio_interface(company):
+    print(f"Received input: {company}")
     report, audio_path = process_company(company)
     return report, audio_path
 
 
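Note: the diff ends before the Gradio wiring, so the launch block below is assumed rather than shown in this commit. A typical pairing for gradio_interface returns the report dict and the audio file path:

import gradio as gr

demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Company name"),
    outputs=[gr.JSON(label="Report"), gr.Audio(label="Hindi summary", type="filepath")],
)
demo.launch()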