Spaces:

Deepakraj2006
/

News_Scraper_TTS

Running

App Files Community

Deepakraj2006 committed on Mar 18

Commit

b14b5f0

verified ·

1 Parent(s): 0b31e1f

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -76

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
-from threading import Thread
 from dotenv import load_dotenv
 load_dotenv()
@@ -9,7 +8,7 @@ from newsapi import NewsApiClient
 import pandas as pd
 import torch
 import soundfile as sf
-from flask import Flask, request, jsonify, send_file
 from transformers import (
     AutoModelForSequenceClassification, AutoTokenizer, pipeline,
     BartTokenizer, BartForConditionalGeneration,
@@ -22,17 +21,12 @@ from transformers import (
 # -------------------------
 NEWS_API_KEY = os.getenv("NEWS_API_KEY")  # Set this in your .env file
-# Set device for Torch models
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # -------------------------
-# Part 1: News Scraping Functions
 # -------------------------
 def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles.xlsx'):
-    """
-    Fetch news article URLs related to a given company using News API,
-    scrape each for headline and content, and save the results to an Excel file.
-    """
     newsapi = NewsApiClient(api_key=api_key)
     all_articles = newsapi.get_everything(q=company, language='en', sort_by='relevancy', page_size=count)
     articles = all_articles.get('articles', [])
@@ -49,11 +43,9 @@ def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles
     df = pd.DataFrame(scraped_data)
     df.to_excel(output_file, index=False, header=True)
     print(f"News scraping complete. Data saved to {output_file}")
 def scrape_news(url):
-    """
-    Scrape the news article for headline and content.
-    """
     headers = {"User-Agent": "Mozilla/5.0"}
     response = requests.get(url, headers=headers)
     if response.status_code != 200:
@@ -66,7 +58,7 @@ def scrape_news(url):
     return {"headline": headline, "content": article_text}
 # -------------------------
-# Part 2: Sentiment Analysis Setup
 # -------------------------
 sentiment_model_name = "cross-encoder/nli-distilroberta-base"
 sentiment_model = AutoModelForSequenceClassification.from_pretrained(
@@ -79,7 +71,7 @@ classifier = pipeline("zero-shot-classification", model=sentiment_model, tokeniz
 labels = ["positive", "negative", "neutral"]
 # -------------------------
-# Part 3: Summarization Setup
 # -------------------------
 bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
 bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
@@ -103,7 +95,7 @@ def split_into_chunks(text, tokenizer, max_tokens=1024):
     return chunks
 # -------------------------
-# Part 4: Translation Setup (English to Hindi)
 # -------------------------
 translation_model_name = 'Helsinki-NLP/opus-mt-en-hi'
 trans_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
@@ -115,13 +107,13 @@ def translate_text(text):
     return trans_tokenizer.decode(translated[0], skip_special_tokens=True)
 # -------------------------
-# Part 5: Bark TTS Setup (Hindi)
 # -------------------------
 bark_model = BarkModel.from_pretrained("suno/bark-small").to(device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 # -------------------------
-# Part 6: Process Company - Main Pipeline Function
 # -------------------------
 def process_company(company):
     # Step 1: Fetch and scrape news
@@ -130,7 +122,7 @@ def process_company(company):
     print("Scraped Articles:")
     print(df)
-    titles, summaries, sentiments, urls = [], [], [], []
     for index, row in df.iterrows():
         article_text = row.get("content", "")
         title = row.get("headline", "No title")
@@ -145,73 +137,60 @@ def process_company(company):
         final_summary = ' '.join(chunk_summaries)
         sentiment_result = classifier(final_summary, labels)
         sentiment = sentiment_result["labels"][0]
-        titles.append(title)
-        summaries.append(final_summary)
-        sentiments.append(sentiment)
-        urls.append(url)
-    final_df = pd.DataFrame({
-        "Title": titles,
-        "Summary": summaries,
-        "Sentiment": sentiments,
-        "URL": urls
-    })
-    final_df["Translated Summary"] = final_df["Summary"].apply(translate_text)
-    final_df.to_excel('translated_news_articles.xlsx', index=False)
-    print("Final processed data with translations:")
-    print(final_df)
-    # Combine all translated summaries into one text prompt
-    final_translated_text = "\n\n".join(final_df["Translated Summary"].tolist())
-    # Generate speech from the combined Hindi text using Bark
     inputs = processor(final_translated_text, return_tensors="pt").to(device)
     speech_output = bark_model.generate(**inputs)
     audio_path = "final_summary.wav"
     sf.write(audio_path, speech_output[0].cpu().numpy(), bark_model.generation_config.sample_rate)
-    return audio_path
-# -------------------------
-# Part 7: Flask Backend Setup
-# -------------------------
-app = Flask(__name__)
-@app.route("/process", methods=["POST"])
-def process_route():
-    data = request.get_json()
-    company = data.get("company")
-    if not company:
-        return jsonify({"error": "No company provided"}), 400
-    audio_path = process_company(company)
-    # Return the audio file path as JSON (Gradio will load the file)
-    return jsonify({"audio_path": audio_path})
-# -------------------------
-# Part 8: Gradio Interface Setup
-# -------------------------
 def gradio_interface(company):
-    # Call the Flask endpoint
-    response = requests.post("http://127.0.0.1:5000/process", json={"company": company})
-    result = response.json()
-    # Return the audio file path; Gradio's audio output type will read the file.
-    return result.get("audio_path")
-def launch_gradio():
-    import gradio as gr
-    iface = gr.Interface(
-        fn=gradio_interface,
-        inputs=gr.Textbox(label="Enter Company Name"),
-        outputs=gr.Audio(type="filepath", label="News Summary Audio (Hindi)"),
-        title="News Summarization & TTS",
-        description="Enter a company name to fetch news, generate a Hindi summary, and listen to the audio."
-    )
-    iface.launch()
 # -------------------------
-# Main: Run Flask and Gradio
 # -------------------------
 if __name__ == "__main__":
-    # Run the Flask app in a separate thread.
-    flask_thread = Thread(target=lambda: app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False))
-    flask_thread.start()
-    # Launch the Gradio interface.
-    launch_gradio()

 import os
 from dotenv import load_dotenv
 load_dotenv()
 import pandas as pd
 import torch
 import soundfile as sf
+import gradio as gr
 from transformers import (
     AutoModelForSequenceClassification, AutoTokenizer, pipeline,
     BartTokenizer, BartForConditionalGeneration,
 # -------------------------
 NEWS_API_KEY = os.getenv("NEWS_API_KEY")  # Set this in your .env file
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # -------------------------
+# News Extraction Functions
 # -------------------------
 def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles.xlsx'):
     newsapi = NewsApiClient(api_key=api_key)
     all_articles = newsapi.get_everything(q=company, language='en', sort_by='relevancy', page_size=count)
     articles = all_articles.get('articles', [])
     df = pd.DataFrame(scraped_data)
     df.to_excel(output_file, index=False, header=True)
     print(f"News scraping complete. Data saved to {output_file}")
+    return df
 def scrape_news(url):
     headers = {"User-Agent": "Mozilla/5.0"}
     response = requests.get(url, headers=headers)
     if response.status_code != 200:
     return {"headline": headline, "content": article_text}
 # -------------------------
+# Sentiment Analysis Setup
 # -------------------------
 sentiment_model_name = "cross-encoder/nli-distilroberta-base"
 sentiment_model = AutoModelForSequenceClassification.from_pretrained(
 labels = ["positive", "negative", "neutral"]
 # -------------------------
+# Summarization Setup
 # -------------------------
 bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
 bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
     return chunks
 # -------------------------
+# Translation Setup (English to Hindi)
 # -------------------------
 translation_model_name = 'Helsinki-NLP/opus-mt-en-hi'
 trans_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
     return trans_tokenizer.decode(translated[0], skip_special_tokens=True)
 # -------------------------
+# Bark TTS Setup (Hindi)
 # -------------------------
 bark_model = BarkModel.from_pretrained("suno/bark-small").to(device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 # -------------------------
+# Main Pipeline Function
 # -------------------------
 def process_company(company):
     # Step 1: Fetch and scrape news
     print("Scraped Articles:")
     print(df)
+    articles_data = []
     for index, row in df.iterrows():
         article_text = row.get("content", "")
         title = row.get("headline", "No title")
         final_summary = ' '.join(chunk_summaries)
         sentiment_result = classifier(final_summary, labels)
         sentiment = sentiment_result["labels"][0]
+        articles_data.append({
+            "Title": title,
+            "Summary": final_summary,
+            "Sentiment": sentiment,
+            "URL": url
+        })
+    # Comparative Analysis: Build a simple sentiment distribution
+    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
+    for article in articles_data:
+        key = article["Sentiment"].capitalize()
+        sentiment_distribution[key] += 1
+    # Step 2: Translate summaries and generate Hindi speech
+    translated_summaries = [translate_text(article["Summary"]) for article in articles_data]
+    final_translated_text = "\n\n".join(translated_summaries)
     inputs = processor(final_translated_text, return_tensors="pt").to(device)
     speech_output = bark_model.generate(**inputs)
     audio_path = "final_summary.wav"
     sf.write(audio_path, speech_output[0].cpu().numpy(), bark_model.generation_config.sample_rate)
+    # Build final report
+    report = {
+        "Company": company,
+        "Articles": articles_data,
+        "Comparative Sentiment Score": {
+            "Sentiment Distribution": sentiment_distribution,
+            "Coverage Differences": "Detailed comparative analysis not implemented",
+            "Topic Overlap": "Topic extraction not implemented"
+        },
+        "Final Sentiment Analysis": "Overall sentiment analysis not fully computed",
+        "Audio": audio_path
+    }
+    return report, audio_path
+# Gradio Interface Function
 def gradio_interface(company):
+    report, audio_path = process_company(company)
+    return report, audio_path
 # -------------------------
+# Gradio UI Setup
 # -------------------------
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.Textbox(label="Enter Company Name"),
+    outputs=[
+        gr.JSON(label="News Sentiment Report"),
+        gr.Audio(type="filepath", label="Hindi Summary Audio")
+    ],
+    title="News Summarization & Text-to-Speech",
+    description="Enter a company name to fetch news articles, perform sentiment analysis, and listen to a Hindi TTS summary."
+)
 if __name__ == "__main__":
+    iface.launch()