Deepakraj2006 committed · Commit fb07ec5 · verified · 1 parent: ee90594

Create app.py

Files changed (1): app.py (+217, −0)
app.py ADDED
@@ -0,0 +1,217 @@
import os
from threading import Thread
from dotenv import load_dotenv
load_dotenv()

import requests
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
import pandas as pd
import torch
import soundfile as sf
from flask import Flask, request, jsonify, send_file
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer, pipeline,
    BartTokenizer, BartForConditionalGeneration,
    MarianMTModel, MarianTokenizer,
    BarkModel, AutoProcessor
)

# -------------------------
# Global Setup and Environment Variables
# -------------------------
NEWS_API_KEY = os.getenv("NEWS_API_KEY")  # Set this in your .env file

# Set device for Torch models
device = "cuda:0" if torch.cuda.is_available() else "cpu"
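`load_dotenv()` pulls `NEWS_API_KEY` from a `.env` file next to `app.py`. A minimal sketch of that file (the key name comes from the code above; the value is a placeholder obtained from newsapi.org):

```
NEWS_API_KEY=your_newsapi_key_here
```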
# -------------------------
# Part 1: News Scraping Functions
# -------------------------
def fetch_and_scrape_news(company, api_key, count=11, output_file='news_articles.xlsx'):
    """
    Fetch news article URLs related to a given company using News API,
    scrape each for headline and content, and save the results to an Excel file.
    """
    newsapi = NewsApiClient(api_key=api_key)
    all_articles = newsapi.get_everything(q=company, language='en', sort_by='relevancy', page_size=count)
    articles = all_articles.get('articles', [])

    scraped_data = []
    for article in articles:
        url = article.get('url')
        if url:
            scraped_article = scrape_news(url)
            if scraped_article:
                scraped_article['url'] = url
                scraped_data.append(scraped_article)

    df = pd.DataFrame(scraped_data)
    df.to_excel(output_file, index=False, header=True)
    print(f"News scraping complete. Data saved to {output_file}")

def scrape_news(url):
    """
    Scrape a news article for its headline and body text.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # A timeout keeps one slow host from stalling the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch the page: {url}")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    headline = soup.find("h1").get_text(strip=True) if soup.find("h1") else "No headline found"
    paragraphs = soup.find_all("p")
    article_text = " ".join(p.get_text(strip=True) for p in paragraphs)
    return {"headline": headline, "content": article_text}
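As a quick smoke test of the scraping helpers, something like the following works (a sketch only; the URL is a placeholder, and how much text `scrape_news` recovers depends entirely on each site's markup):

```python
# Hypothetical smoke test for the helpers above.
article = scrape_news("https://example.com/some-news-story")  # placeholder URL
if article:
    print(article["headline"])
    print(article["content"][:200])  # first 200 characters of the body

# Full run for one company (writes news_articles.xlsx):
# fetch_and_scrape_news("Tesla", NEWS_API_KEY, count=5)  # "Tesla" is an arbitrary example
```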
# -------------------------
# Part 2: Sentiment Analysis Setup
# -------------------------
sentiment_model_name = "cross-encoder/nli-distilroberta-base"
# NOTE: device_map="auto" requires the accelerate package; on a CPU-only host,
# consider dropping torch_dtype=torch.float16, which is intended for GPUs.
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    sentiment_model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
classifier = pipeline("zero-shot-classification", model=sentiment_model, tokenizer=sentiment_tokenizer)
labels = ["positive", "negative", "neutral"]
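The zero-shot pipeline returns a dict whose `labels` list is sorted by descending score, which is why Part 6 below simply takes `result["labels"][0]`. An illustration (the scores are made-up numbers):

```python
result = classifier("The company beat earnings expectations.", labels)
# result has the shape:
# {"sequence": "The company beat earnings expectations.",
#  "labels": ["positive", "neutral", "negative"],   # sorted by score
#  "scores": [0.91, 0.06, 0.03]}                    # illustrative values
print(result["labels"][0])  # top label, e.g. "positive"
```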
# -------------------------
# Part 3: Summarization Setup
# -------------------------
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

def split_into_chunks(text, tokenizer, max_tokens=1024):
    """Split text into word-aligned chunks of at most max_tokens BART tokens each."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        tokenized_word = tokenizer.encode(word, add_special_tokens=False)
        if current_length + len(tokenized_word) <= max_tokens:
            current_chunk.append(word)
            current_length += len(tokenized_word)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(tokenized_word)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
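`split_into_chunks` keeps each chunk within BART's 1024-token input window by counting tokens word by word. A toy limit makes the behavior easy to see (a sketch; exact token counts vary by word):

```python
# Illustrative only: a tiny max_tokens forces several chunks.
for chunk in split_into_chunks("one two three four five six", bart_tokenizer, max_tokens=3):
    print(repr(chunk))
# Each printed chunk encodes to at most 3 BART tokens.
```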
# -------------------------
# Part 4: Translation Setup (English to Hindi)
# -------------------------
translation_model_name = 'Helsinki-NLP/opus-mt-en-hi'
trans_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
trans_model = MarianMTModel.from_pretrained(translation_model_name)

def translate_text(text):
    """Translate an English string to Hindi with the MarianMT model."""
    # truncation=True guards against summaries longer than the model's input limit.
    tokens = trans_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = trans_model.generate(**tokens)
    return trans_tokenizer.decode(translated[0], skip_special_tokens=True)
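A one-line usage sketch of the translator (the exact Hindi wording depends on the model):

```python
print(translate_text("The market closed higher today."))
# -> a Hindi rendering of the sentence, e.g. "आज बाजार बढ़त के साथ बंद हुआ।" (illustrative)
```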
# -------------------------
# Part 5: Bark TTS Setup (Hindi)
# -------------------------
bark_model = BarkModel.from_pretrained("suno/bark-small").to(device)
# Use the processor that matches the bark-small checkpoint.
processor = AutoProcessor.from_pretrained("suno/bark-small")
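The Bark call pattern used in Part 6 looks like this in isolation (a sketch; note that Bark produces a limited duration of audio per `generate` call, so very long prompts may come out truncated):

```python
# Minimal Bark usage sketch, mirroring Part 6 below.
sample = processor("नमस्ते, यह एक परीक्षण है।", return_tensors="pt").to(device)
audio = bark_model.generate(**sample)
sf.write("test.wav", audio[0].cpu().numpy(), bark_model.generation_config.sample_rate)
```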
# -------------------------
# Part 6: Process Company - Main Pipeline Function
# -------------------------
def process_company(company):
    # Step 1: Fetch and scrape news
    fetch_and_scrape_news(company, NEWS_API_KEY)
    df = pd.read_excel('news_articles.xlsx')
    print("Scraped Articles:")
    print(df)

    # Step 2: Summarize each article chunk by chunk, then classify sentiment
    titles, summaries, sentiments, urls = [], [], [], []
    for index, row in df.iterrows():
        article_text = row.get("content", "")
        title = row.get("headline", "No title")
        url = row.get("url", "")
        chunks = split_into_chunks(article_text, bart_tokenizer)
        chunk_summaries = []
        for chunk in chunks:
            inputs = bart_tokenizer([chunk], max_length=1024, return_tensors='pt', truncation=True)
            summary_ids = bart_model.generate(inputs.input_ids, num_beams=4, max_length=130, min_length=30, early_stopping=True)
            chunk_summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            chunk_summaries.append(chunk_summary)
        final_summary = ' '.join(chunk_summaries)
        sentiment_result = classifier(final_summary, labels)
        sentiment = sentiment_result["labels"][0]  # highest-scoring label
        titles.append(title)
        summaries.append(final_summary)
        sentiments.append(sentiment)
        urls.append(url)

    # Step 3: Translate the summaries and persist the results
    final_df = pd.DataFrame({
        "Title": titles,
        "Summary": summaries,
        "Sentiment": sentiments,
        "URL": urls
    })
    final_df["Translated Summary"] = final_df["Summary"].apply(translate_text)
    final_df.to_excel('translated_news_articles.xlsx', index=False)
    print("Final processed data with translations:")
    print(final_df)

    # Step 4: Combine all translated summaries into one text prompt
    final_translated_text = "\n\n".join(final_df["Translated Summary"].tolist())
    # Generate speech from the combined Hindi text using Bark
    inputs = processor(final_translated_text, return_tensors="pt").to(device)
    speech_output = bark_model.generate(**inputs)
    audio_path = "final_summary.wav"
    sf.write(audio_path, speech_output[0].cpu().numpy(), bark_model.generation_config.sample_rate)
    return audio_path
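End to end, `process_company` leaves three artifacts on disk: `news_articles.xlsx` (raw scrapes), `translated_news_articles.xlsx` (summaries, sentiment, and translations), and `final_summary.wav` (the Bark audio). It can also be exercised directly, without Flask (assuming `NEWS_API_KEY` is set; this downloads and runs every model, so it is slow on CPU):

```python
wav = process_company("Tesla")  # "Tesla" is an arbitrary example company
print(wav)  # -> final_summary.wav
```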
# -------------------------
# Part 7: Flask Backend Setup
# -------------------------
app = Flask(__name__)

@app.route("/process", methods=["POST"])
def process_route():
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    data = request.get_json(silent=True) or {}
    company = data.get("company")
    if not company:
        return jsonify({"error": "No company provided"}), 400
    audio_path = process_company(company)
    # Return the audio file path as JSON (Gradio will load the file)
    return jsonify({"audio_path": audio_path})
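The endpoint takes a JSON body and replies with the audio path. A request sketch with `requests`, equivalent to what `gradio_interface` below does:

```python
import requests

resp = requests.post("http://127.0.0.1:5000/process", json={"company": "Tesla"})
print(resp.status_code)  # 200 on success, 400 if "company" is missing
print(resp.json())       # -> {"audio_path": "final_summary.wav"}
```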
# -------------------------
# Part 8: Gradio Interface Setup
# -------------------------
def gradio_interface(company):
    # Call the Flask endpoint
    response = requests.post("http://127.0.0.1:5000/process", json={"company": company})
    result = response.json()
    # Return the audio file path; Gradio's audio output type will read the file.
    return result.get("audio_path")

def launch_gradio():
    import gradio as gr
    iface = gr.Interface(
        fn=gradio_interface,
        inputs=gr.Textbox(label="Enter Company Name"),
        outputs=gr.Audio(type="filepath", label="News Summary Audio (Hindi)"),
        title="News Summarization & TTS",
        description="Enter a company name to fetch news, generate a Hindi summary, and listen to the audio."
    )
    iface.launch()
# -------------------------
# Main: Run Flask and Gradio
# -------------------------
if __name__ == "__main__":
    # Run the Flask app in a separate daemon thread so the process
    # can exit cleanly when the Gradio interface is closed.
    flask_thread = Thread(
        target=lambda: app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False),
        daemon=True,
    )
    flask_thread.start()
    # Launch the Gradio interface.
    launch_gradio()
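For completeness, the imports imply roughly the following dependencies (a hypothetical requirements sketch, not part of the commit; openpyxl backs pandas' Excel I/O and accelerate backs device_map="auto"):

```
python-dotenv
requests
beautifulsoup4
newsapi-python
pandas
openpyxl
torch
soundfile
transformers
accelerate
flask
gradio
```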