File size: 5,355 Bytes
f98185e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16fad89
f98185e
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Gradio Application Interface

import gradio as gr
from transformers import pipeline
from bs4 import BeautifulSoup
import requests
import pandas as pd
import gensim
import re 
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import os

def summarizer_func():
    """Build and return the transformers pipeline used to summarise articles.

    The pipeline is created from the fine-tuned checkpoint
    ``Majon911/pegasus_multi_news_ep1`` together with the
    ``google/pegasus-xsum`` tokenizer. No task name is passed explicitly.
    Generated text is constrained to 100-200 tokens and over-long inputs
    are truncated.
    """
    pipeline_options = {
        "model": "Majon911/pegasus_multi_news_ep1",
        "tokenizer": "google/pegasus-xsum",
        "min_length": 100,
        "max_length": 200,
        "truncation": True,
    }
    return pipeline(**pipeline_options)

def sentiment_func():
    """Build and return the text-classification pipeline used for sentiment.

    Uses the ``kbaumgartner/DeBERTa_Finetuned_Financial_News`` checkpoint
    with the ``microsoft/deberta-v3-base`` tokenizer.
    """
    checkpoint = "kbaumgartner/DeBERTa_Finetuned_Financial_News"
    base_tokenizer = "microsoft/deberta-v3-base"
    return pipeline("text-classification", model=checkpoint, tokenizer=base_tokenizer)

def source_outlet(choise, timeout=10):
    """Scrape the latest headlines for a news source.

    Args:
        choise: Name of the outlet. Only ``'CNBC'`` is implemented; any
            other value (including the placeholder ``"Reuters"``) yields an
            empty table instead of failing on an undefined variable.
        timeout: Seconds to wait for the outlet's landing page before
            ``requests`` gives up.

    Returns:
        A DataFrame with at most five rows and the columns ``headline``,
        ``url``, ``text``, ``summary``, ``sentiment`` and ``topic``. The last
        four columns are initialised to empty strings for the caller to fill.

    Raises:
        requests.RequestException: If the landing page cannot be fetched or
            the server answers with an HTTP error status.
    """
    # Bound up front so that unsupported sources fall through to an empty frame.
    headlines = {}

    if choise == 'CNBC':
        url = "https://www.cnbc.com/finance/"
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for headline_element in soup.find_all('a', class_='Card-title'):
            href = headline_element.get('href')
            # Skip anchors without a target; they cannot be scraped later.
            if href:
                headlines[headline_element.text.strip()] = href

    df = pd.DataFrame({'headline': list(headlines.keys()),
                       'url': list(headlines.values())})

    # DataFrame.head() defaults to the first five rows.
    return df.head().assign(text='', summary='', sentiment='', topic='')

def sentiment_translation(curr_sentiment):
    """Translate a raw classifier label into a readable sentiment name.

    Args:
        curr_sentiment: Label string as emitted by the sentiment pipeline
            (``"LABEL_0"``, ``"LABEL_1"`` or ``"LABEL_2"``).

    Returns:
        ``"NEGATIVE"``, ``"NEUTRAL"`` or ``"POSITIVE"`` for the three known
        labels. Any other value is returned unchanged, so an unexpected label
        is still shown rather than raising ``UnboundLocalError``.
    """
    label_names = {
        "LABEL_0": "NEGATIVE",
        "LABEL_1": "NEUTRAL",
        "LABEL_2": "POSITIVE",
    }
    return label_names.get(curr_sentiment, curr_sentiment)

def preprocess(text):
    """Turn raw text into a list of lemmatised tokens.

    The text is lower-cased, every run of digits and non-word characters is
    collapsed into a single space, and the result is split on whitespace.
    English stop words and tokens of three characters or fewer are dropped;
    the remaining tokens are lemmatised with ``WordNetLemmatizer``.
    """
    normalized = re.sub("(\\d|\\W)+", " ", text.lower())
    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    kept = []
    for token in normalized.split():
        # Discard short tokens and stop words before lemmatising.
        if len(token) <= 3 or token in english_stops:
            continue
        kept.append(wnl.lemmatize(token))
    return kept

def lda_topic_modeling(text):
    """Return the most probable LDA topic for ``text``.

    The saved gensim model and dictionary are loaded from the relative
    ``lda_gensim_5t/`` directory, the text is tokenised with ``preprocess``
    and converted to a bag of words, and the topic with the highest
    probability is looked up in a fixed table of readable names.

    Returns:
        A ``(topic_name, probability)`` tuple. ``("No Topic Found", 0.0)``
        is returned when the model yields no topics; a topic id missing from
        the table is reported as ``"Unknown Topic"``.
    """
    model = gensim.models.LdaModel.load("lda_gensim_5t/lda_model5.gensim")
    vocab = gensim.corpora.Dictionary.load("lda_gensim_5t/dictionary5.gensim")

    readable_names = {
        '0': "Corporate Valuation & Performance",
        '1': "Quarterly Financial Reports",
        '2': "Stock Market & Investment Funds",
        '3': "Corporate Affairs & Products",
        '4': "Investment Research"
    }

    bag_of_words = vocab.doc2bow(preprocess(text))
    scored_topics = model.get_document_topics(bag_of_words, minimum_probability=0.0)

    # Guard clause: nothing to rank.
    if not scored_topics:
        return ("No Topic Found", 0.0)

    # max() keeps the first of any tied entries, matching a stable descending sort.
    best_id, best_prob = max(scored_topics, key=lambda pair: pair[1])
    return (readable_names.get(str(best_id), "Unknown Topic"), best_prob)

# Pipelines are expensive to construct, so they are built once on first use
# and then shared by every subsequent request.
_PIPELINE_CACHE = {}


def _get_pipelines():
    """Return ``(summarizer, sentiment)`` pipelines, building them on first use."""
    if not _PIPELINE_CACHE:
        # Build both before storing so a failure never leaves a half-filled cache.
        summarizer = summarizer_func()
        sentiment = sentiment_func()
        _PIPELINE_CACHE.update(summarizer=summarizer, sentiment=sentiment)
    return _PIPELINE_CACHE["summarizer"], _PIPELINE_CACHE["sentiment"]


def gradio_stocknews(source_ch, art_number):
    """Summarise and classify one of the latest articles of a news source.

    Args:
        source_ch: Outlet name passed on to ``source_outlet``.
        art_number: 1-based position of the article among the scraped
            headlines.

    Returns:
        A 5-tuple ``(headline, url, summary, sentiment, topic)`` of strings,
        in the order of the Gradio output components.

    Raises:
        gr.Error: If no article number was selected, the requested article
            does not exist, or the article page has no recognisable body.
        requests.RequestException: If the article page cannot be fetched.
    """
    # Identifying the articles
    articles = source_outlet(source_ch)

    try:
        row = int(art_number) - 1
    except (TypeError, ValueError):
        raise gr.Error("Please select an article number.") from None
    if row not in articles.index:
        raise gr.Error(
            f"Article {art_number} is not available for source {source_ch!r}."
        )

    headline = articles.loc[row, 'headline']
    article_url = articles.loc[row, 'url']

    # Scraping text for the chosen article
    response = requests.get(article_url, timeout=10)
    response.raise_for_status()
    sub_soup = BeautifulSoup(response.content, 'html.parser')
    article_body_element = sub_soup.find('div', class_='ArticleBody-articleBody')
    if article_body_element is None:
        # Some pages have no such container; fail with a readable message.
        raise gr.Error("Could not extract the article text from the selected page.")
    article_text = article_body_element.get_text()  # Extracting only the text

    # Defining the summarizer and the sentiment analysis (cached after first call)
    summarizer, pipe_sentiment = _get_pipelines()

    summary = summarizer(article_text)[0]['generated_text']
    label_sentiment = pipe_sentiment(article_text)[0]['label']
    sentiment = sentiment_translation(label_sentiment)

    # Get the human-readable topic name using the topic names mapping
    topic = lda_topic_modeling(article_text)[0]

    return headline, article_url, summary, sentiment, topic

def main():
    """Prepare runtime resources and launch the Gradio dashboard.

    The working directory is switched to this file's directory because the
    LDA model and dictionary are loaded from relative paths. The NLTK corpora
    needed by ``preprocess`` are downloaded before the interface starts.
    """
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # preprocess() uses both the stop-word list and WordNetLemmatizer; the
    # latter needs the 'wordnet' corpus and raises LookupError without it.
    nltk.download('stopwords')
    nltk.download('wordnet')

    iface = gr.Interface(
        fn=gradio_stocknews,
        inputs=[
            gr.Dropdown(choices=["CNBC"], label="Select Source"),
            gr.Dropdown(choices=[1, 2, 3, 4, 5], label="Select Article Number"),
        ],
        outputs=[
            gr.Textbox(lines=1, label="Article Title"),
            gr.Textbox(lines=1, label="Article Link"),
            gr.Textbox(lines=1, label="Article Summary"),
            gr.Textbox(lines=1, label="Article Sentiment"),
            gr.Textbox(lines=1, label="Article Topic"),
        ],
        title="Latest 5 Stock News Dashboard",
        description="Click the button to refresh the news summary.",
    )

    iface.launch()

# Start the dashboard only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()