Deepakraj2006 committed · Commit fb07ec5 · verified · 1 parent: ee90594

Create app.py

Files changed (1): app.py (+217, −0)
app.py ADDED
@@ -0,0 +1,217 @@
import os
from threading import Thread
from dotenv import load_dotenv
load_dotenv()

import requests
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
import pandas as pd
import torch
import soundfile as sf
from flask import Flask, request, jsonify, send_file
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer, pipeline,
    BartTokenizer, BartForConditionalGeneration,
    MarianMTModel, MarianTokenizer,
    BarkModel, AutoProcessor
)

# -------------------------
# Global Setup and Environment Variables
# -------------------------
NEWS_API_KEY = os.getenv("NEWS_API_KEY")  # Set this in your .env file

# Set device for Torch models
device = "cuda:0" if torch.cuda.is_available() else "cpu"
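`load_dotenv()` pulls `NEWS_API_KEY` from a `.env` file next to `app.py`. A minimal sketch of that file (the key name comes from the code above; the value is a placeholder obtained from newsapi.org):

```
NEWS_API_KEY=your_newsapi_key_here
```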
# -------------------------
# Part 1: News Scraping Functions
# -------------------------
def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles.xlsx'):
    """
    Fetch news article URLs related to a given company using News API,
    scrape each for headline and content, and save the results to an Excel file.
    """
    newsapi = NewsApiClient(api_key=api_key)
    all_articles = newsapi.get_everything(q=company, language='en', sort_by='relevancy', page_size=count)
    articles = all_articles.get('articles', [])

    scraped_data = []
    for article in articles:
        url = article.get('url')
        if url:
            scraped_article = scrape_news(url)
            if scraped_article:
                scraped_article['url'] = url
                scraped_data.append(scraped_article)

    df = pd.DataFrame(scraped_data)
    df.to_excel(output_file, index=False, header=True)
    print(f"News scraping complete. Data saved to {output_file}")

def scrape_news(url):
    """
    Scrape a news article for its headline and body text.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # A timeout keeps one slow host from stalling the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch the page: {url}")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    headline = soup.find("h1").get_text(strip=True) if soup.find("h1") else "No headline found"
    paragraphs = soup.find_all("p")
    article_text = " ".join(p.get_text(strip=True) for p in paragraphs)
    return {"headline": headline, "content": article_text}
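As a quick smoke test of the scraping helpers, something like the following works (a sketch only; the URL is a placeholder, and how much text `scrape_news` recovers depends entirely on each site's markup):

```python
# Hypothetical smoke test for the helpers above.
article = scrape_news("https://example.com/some-news-story")  # placeholder URL
if article:
    print(article["headline"])
    print(article["content"][:200])  # first 200 characters of the body

# Full run for one company (writes news_articles.xlsx):
# fetch_and_scrape_news("Tesla", NEWS_API_KEY, count=5)  # "Tesla" is an arbitrary example
```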
# -------------------------
# Part 2: Sentiment Analysis Setup
# -------------------------
sentiment_model_name = "cross-encoder/nli-distilroberta-base"
# NOTE: device_map="auto" requires the accelerate package; on a CPU-only host,
# consider dropping torch_dtype=torch.float16, which is intended for GPUs.
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    sentiment_model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
classifier = pipeline("zero-shot-classification", model=sentiment_model, tokenizer=sentiment_tokenizer)
labels = ["positive", "negative", "neutral"]
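The zero-shot pipeline returns a dict whose `labels` list is sorted by descending score, which is why Part 6 below simply takes `result["labels"][0]`. An illustration (the scores are made-up numbers):

```python
result = classifier("The company beat earnings expectations.", labels)
# result has the shape:
# {"sequence": "The company beat earnings expectations.",
#  "labels": ["positive", "neutral", "negative"],   # sorted by score
#  "scores": [0.91, 0.06, 0.03]}                    # illustrative values
print(result["labels"][0])  # top label, e.g. "positive"
```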
# -------------------------
# Part 3: Summarization Setup
# -------------------------
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

def split_into_chunks(text, tokenizer, max_tokens=1024):
    """Split text into word-aligned chunks of at most max_tokens BART tokens each."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        tokenized_word = tokenizer.encode(word, add_special_tokens=False)
        if current_length + len(tokenized_word) <= max_tokens:
            current_chunk.append(word)
            current_length += len(tokenized_word)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(tokenized_word)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
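`split_into_chunks` keeps each chunk within BART's 1024-token input window by counting tokens word by word. A toy limit makes the behavior easy to see (a sketch; exact token counts vary by word):

```python
# Illustrative only: a tiny max_tokens forces several chunks.
for chunk in split_into_chunks("one two three four five six", bart_tokenizer, max_tokens=3):
    print(repr(chunk))
# Each printed chunk encodes to at most 3 BART tokens.
```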
# -------------------------
# Part 4: Translation Setup (English to Hindi)
# -------------------------
translation_model_name = 'Helsinki-NLP/opus-mt-en-hi'
trans_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
trans_model = MarianMTModel.from_pretrained(translation_model_name)

def translate_text(text):
    """Translate an English string to Hindi with the MarianMT model."""
    # truncation=True guards against summaries longer than the model's input limit.
    tokens = trans_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = trans_model.generate(**tokens)
    return trans_tokenizer.decode(translated[0], skip_special_tokens=True)
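A one-line usage sketch of the translator (the exact Hindi wording depends on the model):

```python
print(translate_text("The market closed higher today."))
# -> a Hindi rendering of the sentence, e.g. "आज बाजार बढ़त के साथ बंद हुआ।" (illustrative)
```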
# -------------------------
# Part 5: Bark TTS Setup (Hindi)
# -------------------------
bark_model = BarkModel.from_pretrained("suno/bark-small").to(device)
# Use the processor that matches the bark-small checkpoint.
processor = AutoProcessor.from_pretrained("suno/bark-small")
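The Bark call pattern used in Part 6 looks like this in isolation (a sketch; note that Bark produces a limited duration of audio per `generate` call, so very long prompts may come out truncated):

```python
# Minimal Bark usage sketch, mirroring Part 6 below.
sample = processor("नमस्ते, यह एक परीक्षण है।", return_tensors="pt").to(device)
audio = bark_model.generate(**sample)
sf.write("test.wav", audio[0].cpu().numpy(), bark_model.generation_config.sample_rate)
```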
# -------------------------
# Part 6: Process Company - Main Pipeline Function
# -------------------------
def process_company(company):
    # Step 1: Fetch and scrape news
    fetch_and_scrape_news(company, NEWS_API_KEY)
    df = pd.read_excel('news_articles.xlsx')
    print("Scraped Articles:")
    print(df)

    # Step 2: Summarize each article chunk by chunk, then classify sentiment
    titles, summaries, sentiments, urls = [], [], [], []
    for index, row in df.iterrows():
        article_text = row.get("content", "")
        title = row.get("headline", "No title")
        url = row.get("url", "")
        chunks = split_into_chunks(article_text, bart_tokenizer)
        chunk_summaries = []
        for chunk in chunks:
            inputs = bart_tokenizer([chunk], max_length=1024, return_tensors='pt', truncation=True)
            summary_ids = bart_model.generate(inputs.input_ids, num_beams=4, max_length=130, min_length=30, early_stopping=True)
            chunk_summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            chunk_summaries.append(chunk_summary)
        final_summary = ' '.join(chunk_summaries)
        sentiment_result = classifier(final_summary, labels)
        sentiment = sentiment_result["labels"][0]  # highest-scoring label
        titles.append(title)
        summaries.append(final_summary)
        sentiments.append(sentiment)
        urls.append(url)

    # Step 3: Translate the summaries and persist the results
    final_df = pd.DataFrame({
        "Title": titles,
        "Summary": summaries,
        "Sentiment": sentiments,
        "URL": urls
    })
    final_df["Translated Summary"] = final_df["Summary"].apply(translate_text)
    final_df.to_excel('translated_news_articles.xlsx', index=False)
    print("Final processed data with translations:")
    print(final_df)

    # Step 4: Combine all translated summaries into one text prompt
    final_translated_text = "\n\n".join(final_df["Translated Summary"].tolist())
    # Generate speech from the combined Hindi text using Bark
    inputs = processor(final_translated_text, return_tensors="pt").to(device)
    speech_output = bark_model.generate(**inputs)
    audio_path = "final_summary.wav"
    sf.write(audio_path, speech_output[0].cpu().numpy(), bark_model.generation_config.sample_rate)
    return audio_path
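End to end, `process_company` leaves three artifacts on disk: `news_articles.xlsx` (raw scrapes), `translated_news_articles.xlsx` (summaries, sentiment, and translations), and `final_summary.wav` (the Bark audio). It can also be exercised directly, without Flask (assuming `NEWS_API_KEY` is set; this downloads and runs every model, so it is slow on CPU):

```python
wav = process_company("Tesla")  # "Tesla" is an arbitrary example company
print(wav)  # -> final_summary.wav
```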
# -------------------------
# Part 7: Flask Backend Setup
# -------------------------
app = Flask(__name__)

@app.route("/process", methods=["POST"])
def process_route():
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    data = request.get_json(silent=True) or {}
    company = data.get("company")
    if not company:
        return jsonify({"error": "No company provided"}), 400
    audio_path = process_company(company)
    # Return the audio file path as JSON (Gradio will load the file)
    return jsonify({"audio_path": audio_path})
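The endpoint takes a JSON body and replies with the audio path. A request sketch with `requests`, equivalent to what `gradio_interface` below does:

```python
import requests

resp = requests.post("http://127.0.0.1:5000/process", json={"company": "Tesla"})
print(resp.status_code)  # 200 on success, 400 if "company" is missing
print(resp.json())       # -> {"audio_path": "final_summary.wav"}
```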
# -------------------------
# Part 8: Gradio Interface Setup
# -------------------------
def gradio_interface(company):
    # Call the Flask endpoint
    response = requests.post("http://127.0.0.1:5000/process", json={"company": company})
    result = response.json()
    # Return the audio file path; Gradio's audio output type will read the file.
    return result.get("audio_path")

def launch_gradio():
    import gradio as gr
    iface = gr.Interface(
        fn=gradio_interface,
        inputs=gr.Textbox(label="Enter Company Name"),
        outputs=gr.Audio(type="filepath", label="News Summary Audio (Hindi)"),
        title="News Summarization & TTS",
        description="Enter a company name to fetch news, generate a Hindi summary, and listen to the audio."
    )
    iface.launch()
# -------------------------
# Main: Run Flask and Gradio
# -------------------------
if __name__ == "__main__":
    # Run the Flask app in a separate daemon thread so the process
    # can exit cleanly when the Gradio interface is closed.
    flask_thread = Thread(
        target=lambda: app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False),
        daemon=True,
    )
    flask_thread.start()
    # Launch the Gradio interface.
    launch_gradio()
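For completeness, the imports imply roughly the following dependencies (a hypothetical requirements sketch, not part of the commit; openpyxl backs pandas' Excel I/O and accelerate backs device_map="auto"):

```
python-dotenv
requests
beautifulsoup4
newsapi-python
pandas
openpyxl
torch
soundfile
transformers
accelerate
flask
gradio
```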