Deepakraj2006 committed on
Commit b14b5f0 · verified · 1 Parent(s): 0b31e1f

Update app.py

Files changed (1):
  1. app.py +55 -76
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-from threading import Thread
 from dotenv import load_dotenv
 load_dotenv()
 
@@ -9,7 +8,7 @@ from newsapi import NewsApiClient
 import pandas as pd
 import torch
 import soundfile as sf
-from flask import Flask, request, jsonify, send_file
+import gradio as gr
 from transformers import (
     AutoModelForSequenceClassification, AutoTokenizer, pipeline,
     BartTokenizer, BartForConditionalGeneration,
@@ -22,17 +21,12 @@ from transformers import (
 # -------------------------
 NEWS_API_KEY = os.getenv("NEWS_API_KEY")  # Set this in your .env file
 
-# Set device for Torch models
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 # -------------------------
-# Part 1: News Scraping Functions
+# News Extraction Functions
 # -------------------------
 def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles.xlsx'):
-    """
-    Fetch news article URLs related to a given company using News API,
-    scrape each for headline and content, and save the results to an Excel file.
-    """
     newsapi = NewsApiClient(api_key=api_key)
     all_articles = newsapi.get_everything(q=company, language='en', sort_by='relevancy', page_size=count)
     articles = all_articles.get('articles', [])
@@ -49,11 +43,9 @@ def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles
     df = pd.DataFrame(scraped_data)
     df.to_excel(output_file, index=False, header=True)
     print(f"News scraping complete. Data saved to {output_file}")
+    return df
 
 def scrape_news(url):
-    """
-    Scrape the news article for headline and content.
-    """
     headers = {"User-Agent": "Mozilla/5.0"}
     response = requests.get(url, headers=headers)
     if response.status_code != 200:
@@ -66,7 +58,7 @@ def scrape_news(url):
     return {"headline": headline, "content": article_text}
 
 # -------------------------
-# Part 2: Sentiment Analysis Setup
+# Sentiment Analysis Setup
 # -------------------------
 sentiment_model_name = "cross-encoder/nli-distilroberta-base"
 sentiment_model = AutoModelForSequenceClassification.from_pretrained(
@@ -79,7 +71,7 @@ classifier = pipeline("zero-shot-classification", model=sentiment_model, tokeniz
 labels = ["positive", "negative", "neutral"]
 
 # -------------------------
-# Part 3: Summarization Setup
+# Summarization Setup
 # -------------------------
 bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
 bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
@@ -103,7 +95,7 @@ def split_into_chunks(text, tokenizer, max_tokens=1024):
     return chunks
 
 # -------------------------
-# Part 4: Translation Setup (English to Hindi)
+# Translation Setup (English to Hindi)
 # -------------------------
 translation_model_name = 'Helsinki-NLP/opus-mt-en-hi'
 trans_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
@@ -115,13 +107,13 @@ def translate_text(text):
     return trans_tokenizer.decode(translated[0], skip_special_tokens=True)
 
 # -------------------------
-# Part 5: Bark TTS Setup (Hindi)
+# Bark TTS Setup (Hindi)
 # -------------------------
 bark_model = BarkModel.from_pretrained("suno/bark-small").to(device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 
 # -------------------------
-# Part 6: Process Company - Main Pipeline Function
+# Main Pipeline Function
 # -------------------------
 def process_company(company):
     # Step 1: Fetch and scrape news
@@ -130,7 +122,7 @@ def process_company(company):
     print("Scraped Articles:")
     print(df)
 
-    titles, summaries, sentiments, urls = [], [], [], []
+    articles_data = []
     for index, row in df.iterrows():
         article_text = row.get("content", "")
         title = row.get("headline", "No title")
@@ -145,73 +137,60 @@
         final_summary = ' '.join(chunk_summaries)
         sentiment_result = classifier(final_summary, labels)
         sentiment = sentiment_result["labels"][0]
-        titles.append(title)
-        summaries.append(final_summary)
-        sentiments.append(sentiment)
-        urls.append(url)
+
+        articles_data.append({
+            "Title": title,
+            "Summary": final_summary,
+            "Sentiment": sentiment,
+            "URL": url
+        })
 
-    final_df = pd.DataFrame({
-        "Title": titles,
-        "Summary": summaries,
-        "Sentiment": sentiments,
-        "URL": urls
-    })
-    final_df["Translated Summary"] = final_df["Summary"].apply(translate_text)
-    final_df.to_excel('translated_news_articles.xlsx', index=False)
-    print("Final processed data with translations:")
-    print(final_df)
+    # Comparative Analysis: Build a simple sentiment distribution
+    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
+    for article in articles_data:
+        key = article["Sentiment"].capitalize()
+        sentiment_distribution[key] += 1
 
-    # Combine all translated summaries into one text prompt
-    final_translated_text = "\n\n".join(final_df["Translated Summary"].tolist())
-    # Generate speech from the combined Hindi text using Bark
+    # Step 2: Translate summaries and generate Hindi speech
+    translated_summaries = [translate_text(article["Summary"]) for article in articles_data]
+    final_translated_text = "\n\n".join(translated_summaries)
     inputs = processor(final_translated_text, return_tensors="pt").to(device)
     speech_output = bark_model.generate(**inputs)
     audio_path = "final_summary.wav"
     sf.write(audio_path, speech_output[0].cpu().numpy(), bark_model.generation_config.sample_rate)
-    return audio_path
-
-# -------------------------
-# Part 7: Flask Backend Setup
-# -------------------------
-app = Flask(__name__)
-
-@app.route("/process", methods=["POST"])
-def process_route():
-    data = request.get_json()
-    company = data.get("company")
-    if not company:
-        return jsonify({"error": "No company provided"}), 400
-    audio_path = process_company(company)
-    # Return the audio file path as JSON (Gradio will load the file)
-    return jsonify({"audio_path": audio_path})
-
-# -------------------------
-# Part 8: Gradio Interface Setup
-# -------------------------
+
+    # Build final report
+    report = {
+        "Company": company,
+        "Articles": articles_data,
+        "Comparative Sentiment Score": {
+            "Sentiment Distribution": sentiment_distribution,
+            "Coverage Differences": "Detailed comparative analysis not implemented",
+            "Topic Overlap": "Topic extraction not implemented"
+        },
+        "Final Sentiment Analysis": "Overall sentiment analysis not fully computed",
+        "Audio": audio_path
+    }
+    return report, audio_path
+
+# Gradio Interface Function
 def gradio_interface(company):
-    # Call the Flask endpoint
-    response = requests.post("http://127.0.0.1:5000/process", json={"company": company})
-    result = response.json()
-    # Return the audio file path; Gradio's audio output type will read the file.
-    return result.get("audio_path")
-
-def launch_gradio():
-    import gradio as gr
-    iface = gr.Interface(
-        fn=gradio_interface,
-        inputs=gr.Textbox(label="Enter Company Name"),
-        outputs=gr.Audio(type="filepath", label="News Summary Audio (Hindi)"),
-        title="News Summarization & TTS",
-        description="Enter a company name to fetch news, generate a Hindi summary, and listen to the audio."
-    )
-    iface.launch()
+    report, audio_path = process_company(company)
+    return report, audio_path
 
 # -------------------------
-# Main: Run Flask and Gradio
+# Gradio UI Setup
 # -------------------------
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.Textbox(label="Enter Company Name"),
+    outputs=[
+        gr.JSON(label="News Sentiment Report"),
+        gr.Audio(type="filepath", label="Hindi Summary Audio")
+    ],
+    title="News Summarization & Text-to-Speech",
+    description="Enter a company name to fetch news articles, perform sentiment analysis, and listen to a Hindi TTS summary."
+)
+
 if __name__ == "__main__":
-    # Run the Flask app in a separate thread.
-    flask_thread = Thread(target=lambda: app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False))
-    flask_thread.start()
-    # Launch the Gradio interface.
-    launch_gradio()
+    iface.launch()
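
For a quick sanity check of the reworked pipeline, the snippet below is a minimal smoke test, not part of the commit: it assumes the updated app.py is importable as `app`, that a valid NEWS_API_KEY is set in .env, and it uses "Tesla" purely as a placeholder query.

# Hypothetical smoke test (not in the commit). Importing app downloads and
# loads every model, so the first run is slow and needs network access.
from pprint import pprint

import app  # assumes the updated app.py is on the import path

report, audio_path = app.process_company("Tesla")  # placeholder company name
pprint(report["Comparative Sentiment Score"]["Sentiment Distribution"])
print("Hindi audio written to:", audio_path)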
 
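One judgment call worth flagging in the new process_company: the tally `sentiment_distribution[key] += 1` assumes every classifier label capitalizes onto one of the three preset keys, which holds as long as `labels` stays `["positive", "negative", "neutral"]`. If that list is ever extended, a Counter-based variant avoids a KeyError; the sketch below is illustrative, with a hypothetical function name.

# Label-agnostic tally (illustrative sketch; equivalent to the committed code
# for the current three-label setup, but safe if labels is ever extended).
from collections import Counter

def build_sentiment_distribution(articles_data):
    counts = Counter(article["Sentiment"].capitalize() for article in articles_data)
    # Keep the three expected keys present even when a class never occurs,
    # and carry through any unexpected labels instead of raising KeyError.
    distribution = {key: counts.pop(key, 0) for key in ("Positive", "Negative", "Neutral")}
    distribution.update(counts)
    return distribution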