Pranav-K commited on
Commit
08722d4
1 Parent(s): a31ae5b

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +826 -0
  2. gen-data.csv +0 -0
  3. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Survey_Analysis_v_3.2.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1UtAdINgLRkpdKGCzhEIPR8ZgK1u_dMtD
8
+ """
9
+
10
+ #1 - https://www.kaggle.com/code/ramjasmaurya/financial-sentiment-analysis
11
+ #2 - https://www.kaggle.com/code/adarshbiradar/sentiment-analysis-using-bert
12
+
13
# NOTE(review): bare `pip install ...` lines are Colab shell commands, not
# Python — they raise SyntaxError when this file runs as a script, so the
# module never even imports. Dependencies belong in requirements.txt; the
# original commands are kept below as comments for reference.
# pip install streamlit
# pip install pygal
# !pip install squarify

import streamlit
20
+
21
+ # Commented out IPython magic to ensure Python compatibility.
22
+ import numpy as np
23
+ import pandas as pd
24
+ import seaborn as sns
25
+ import matplotlib.pyplot as plt
26
+ import plotly.express as px
27
+ import plotly.graph_objects as go
28
+
29
+
30
+ import pygal as py
31
+ import squarify as sq
32
+ import matplotlib
33
+ plt.rcParams["figure.figsize"] = (20,15)
34
+ matplotlib.rc('xtick', labelsize=7)
35
+ matplotlib.rc('ytick', labelsize=7)
36
+
37
+ font = {'family' : 'normal',
38
+ 'weight' : 'bold',
39
+ 'size' : 5}
40
+
41
+ matplotlib.rc('font', **font)
42
+ from sklearn.feature_extraction.text import CountVectorizer
43
+ import warnings
44
+ warnings.filterwarnings("ignore", category=FutureWarning)
45
+ # %matplotlib inline
46
+
47
# Load the survey CSV. The file has no header row, so pandas parsed the first
# data row as the column names; recover those values and re-append them as a
# regular row, then give the columns their real names.
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1")

col1 = df.keys()[0]
col2 = df.keys()[1]

# The swallowed header row becomes one extra data row.
df2 = pd.DataFrame([[col1, col2]], columns=[col1, col2], index=[4845])

# BUG FIX: DataFrame.append and set_axis's `inplace=` argument were removed in
# pandas 2.0 — pd.concat is the supported equivalent with identical output.
df = pd.concat([df, df2], ignore_index=True).set_axis(['sentiment', 'news'], axis=1)

# NOTE(review): replacing "neutral" with "neutral" is a no-op; the original
# intent (normalising some variant spelling?) is unclear — confirm and fix.
df = df.replace("neutral", "neutral")
61
+
62
+ sns.countplot(y="sentiment",data=df)
63
+
64
+ df.isnull().sum()
65
+
66
+ from textblob import TextBlob
67
+
68
def preprocess(ReviewText):
    """Strip HTML artefacts from a pandas Series of news strings.

    Removes <br/> tags, anchor tags, bare HTML entity fragments
    (&amp/&gt/&lt) and non-breaking spaces.

    Parameters
    ----------
    ReviewText : pd.Series of str

    Returns
    -------
    pd.Series of cleaned strings.
    """
    # BUG FIX: these patterns are regexes. pandas 2.0 changed the default of
    # Series.str.replace to literal matching, which would silently stop every
    # one of these from matching — pass regex=True explicitly.
    ReviewText = ReviewText.str.replace(r"(<br/>)", "", regex=True)
    ReviewText = ReviewText.str.replace(r'(<a).*(>).*(</a>)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&amp)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&gt)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&lt)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(\xa0)', ' ', regex=True)
    return ReviewText
76
+ df['Review Text'] = preprocess(df['news'])
77
+
78
+ df['polarity'] = df['news'].map(lambda text: TextBlob(text).sentiment.polarity)
79
+ df['news_len'] = df['news'].astype(str).apply(len)
80
+ df['word_count'] = df['news'].apply(lambda x: len(str(x).split()))
81
+
82
+ df
83
+
84
+ print('top 4 random reviews with the highest positive sentiment polarity: \n')
85
+
86
+ df1=df.drop_duplicates(subset=['Review Text'])
87
+
88
+ cl = df1.loc[df1.polarity == 1, ['Review Text']].sample(4).values
89
+ for c in cl:
90
+ print(c[0])
91
+
92
+ print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
93
+ cl1 = df.loc[df.polarity == 0, ['Review Text']].sample(5).values
94
+ for c in cl1:
95
+ print(c[0])
96
+
97
+ print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
98
+ cl3 = df.loc[df.polarity <= -0.80, ['Review Text']].sample(5).values
99
+ for c in cl3:
100
+ print(c[0])
101
+
102
+ sns.boxplot(df["polarity"],palette="rainbow",data=df)
103
+
104
+ df['polarity'].plot(
105
+ kind='hist',
106
+ bins=50,
107
+ color="peru",
108
+ title='Sentiment Polarity Distribution');plt.show()
109
+
110
+ p_s=df[df["polarity"]>0].count()["sentiment"]
111
+ neu_s=df[df["polarity"]==0].count()["sentiment"]
112
+ neg_s=df[df["polarity"]<0].count()["sentiment"]
113
+
114
+ # Setting labels for items in Chart
115
+ sentiment = ['positive_sentiment',"neutral_sentiment","negative_sentiment"]
116
+
117
+ # Setting size in Chart based on
118
+ # given values
119
+ values = [p_s,neu_s,neg_s]
120
+
121
+ # colors
122
+ colors = ['#FF0000', 'olive', '#FFFF00']
123
+ # explosion
124
+ explode = (0.05, 0.05, 0.05)
125
+
126
+ # Pie Chart
127
+ plt.pie(values, colors=colors, labels=sentiment,
128
+ autopct='%1.1f%%', pctdistance=0.85,
129
+ explode=explode)
130
+
131
+ # draw circle
132
+ centre_circle = plt.Circle((0, 0), 0.70, fc='white')
133
+ fig = plt.gcf()
134
+
135
+ # Adding Circle in Pie chart
136
+ fig.gca().add_artist(centre_circle)
137
+
138
+ # Adding Title of chart
139
+ plt.title('count of polarity as per sentiment')
140
+
141
+ # Displaing Chart
142
+ plt.show()
143
+
144
+ df.plot.box(y=["word_count"],color="hotpink")
145
+
146
+ df['word_count'].plot(
147
+ kind='hist',
148
+ bins=100,
149
+ color="orange",
150
+ title='Review Text Word Count Distribution');plt.show()
151
+
152
+ sns.boxenplot(x="news_len",data=df)
153
+ plt.show()
154
+
155
+ df['news_len'].plot(
156
+ kind='hist',
157
+ bins=50,
158
+ color="lightblue",
159
+ title='Review Text Word Count Distribution');plt.show()
160
+
161
+ fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
162
+ marginal_x="box", marginal_y="violin",
163
+ title="Click on the legend items!")
164
+ fig.show()
165
+
166
def get_top_n_words(corpus, n=None):
    """Return the n most frequent unigrams in *corpus* as (word, count) pairs."""
    vectorizer = CountVectorizer().fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
173
+ common_words = get_top_n_words(df['Review Text'], 20)
174
+ for word, freq in common_words:
175
+ print(word, freq)
176
+ df1 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
177
+ df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
178
+ kind='bar',title='Top 20 words in review before removing stop words')
179
+ df1
180
+
181
def get_top_n_words(corpus, n=None):
    """Return the n most frequent unigrams in *corpus*, English stop words excluded."""
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
188
+ common_words = get_top_n_words(df['Review Text'], 20)
189
+ for word, freq in common_words:
190
+ print(word, freq)
191
+ df2 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
192
+ df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(kind='bar', title='Top 20 words in review after removing stop words')
193
+
194
def get_top_n_bigram(corpus, n=None):
    """Return the n most frequent bigrams in *corpus* as (bigram, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
201
+ common_words = get_top_n_bigram(df['Review Text'], 20)
202
+ for word, freq in common_words:
203
+ print(word, freq)
204
+ df3 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
205
+ df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
206
+ kind='bar',title='Top 20 bigrams in review before removing stop words')
207
+
208
def get_top_n_bigram(corpus, n=None):
    """Return the n most frequent bigrams in *corpus*, English stop words excluded."""
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
215
+ common_words = get_top_n_bigram(df['Review Text'], 20)
216
+ for word, freq in common_words:
217
+ print(word, freq)
218
+ df4 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
219
+ df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
220
+ kind='bar', title='Top 20 bigrams in review after removing stop words')
221
+
222
def get_top_n_trigram(corpus, n=None):
    """Return the n most frequent trigrams in *corpus* as (trigram, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
229
+ common_words = get_top_n_trigram(df['Review Text'], 20)
230
+ for word, freq in common_words:
231
+ print(word, freq)
232
+ df5 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
233
+ df5.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
234
+ kind='bar', title='Top 20 trigrams in review before removing stop words')
235
+
236
def get_top_n_trigram(corpus, n=None):
    """Return the n most frequent trigrams in *corpus*, English stop words excluded."""
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
243
+ common_words = get_top_n_trigram(df['Review Text'], 20)
244
+ for word, freq in common_words:
245
+ print(word, freq)
246
+ df6 = pd.DataFrame(common_words, columns = ['ReviewText' ,'count'])
247
+ df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
248
+ kind='bar', title='Top 20 trigrams in review after removing stop words')
249
+
250
+ import nltk
251
+ nltk.download('punkt')
252
+ nltk.download('wordnet')
253
+ nltk.download('omw-1.4')
254
+ nltk.download('averaged_perceptron_tagger')
255
+
256
+ #import nltk
257
+ blob = TextBlob(str(df['Review Text']))
258
+ pos_df = pd.DataFrame(blob.tags, columns = ['word' , 'pos'])
259
+ pos_df = pos_df.pos.value_counts()[:20]
260
+ pos_df.plot(
261
+ kind='bar',
262
+ title='Top 20 Part-of-speech tagging for review corpus')
263
+
264
+ y0 = df.loc[df['sentiment'] == 'positive']['polarity']
265
+ y1 = df.loc[df['sentiment'] == 'negative']['polarity']
266
+ y2 = df.loc[df['sentiment'] == 'neutral']['polarity']
267
+
268
+ trace0 = go.Box(
269
+ y=y0,
270
+ name = 'positive',
271
+ marker = dict(
272
+ color = 'rgb(214, 12, 140)',
273
+ )
274
+ )
275
+ trace1 = go.Box(
276
+ y=y1,
277
+ name = 'negative',
278
+ marker = dict(
279
+ color = 'rgb(0, 128, 128)',
280
+ )
281
+ )
282
+ trace2 = go.Box(
283
+ y=y2,
284
+ name = 'neutral',
285
+ marker = dict(
286
+ color = 'rgb(10, 140, 208)',
287
+ )
288
+ )
289
+ data = [trace0, trace1, trace2]
290
+ layout = go.Layout(
291
+ title = "Polarity Boxplot according to sentiment"
292
+ )
293
+
294
+ go.Figure(data=data,layout=layout)
295
+
296
+ y0 = df.loc[df['sentiment'] == 'positive']['news_len']
297
+ y1 = df.loc[df['sentiment'] == 'negative']['news_len']
298
+ y2 = df.loc[df['sentiment'] == 'neutral']['news_len']
299
+
300
+
301
+ trace0 = go.Box(
302
+ y=y0,
303
+ name = 'positive',
304
+ marker = dict(
305
+ color = 'rgb(214, 12, 140)',
306
+ )
307
+ )
308
+ trace1 = go.Box(
309
+ y=y1,
310
+ name = 'negative',
311
+ marker = dict(
312
+ color = 'rgb(0, 128, 128)',
313
+ )
314
+ )
315
+ trace2 = go.Box(
316
+ y=y2,
317
+ name = 'neutral',
318
+ marker = dict(
319
+ color = 'rgb(10, 140, 208)',
320
+ )
321
+ )
322
+ data = [trace0, trace1, trace2]
323
+ layout = go.Layout(
324
+ title = "news length Boxplot by sentiment"
325
+ )
326
+ go.Figure(data=data,layout=layout)
327
+
328
+ xp = df.loc[df['sentiment'] == "positive", 'polarity']
329
+ xneu = df.loc[df['sentiment'] == "neutral", 'polarity']
330
+ xneg= df.loc[df['sentiment'] == "negative", 'polarity']
331
+
332
+ trace1 = go.Histogram(
333
+ x=xp, name='positive',
334
+ opacity=0.75
335
+ )
336
+ trace2 = go.Histogram(
337
+ x=xneu, name = 'neutral',
338
+ opacity=0.75
339
+ )
340
+ trace3 = go.Histogram(
341
+ x=xneg, name = 'negative',
342
+ opacity=0.75
343
+ )
344
+ data = [trace1, trace2,trace3]
345
+ layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
346
+ go.Figure(data=data, layout=layout)
347
+
348
+ trace1 = go.Scatter(
349
+ x=df['polarity'], y=df['news_len'], mode='markers', name='points',
350
+ marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
351
+ )
352
+ trace2 = go.Histogram2dContour(
353
+ x=df['polarity'], y=df['news_len'], name='density', ncontours=50,
354
+ colorscale='Hot', reversescale=True, showscale=False
355
+ )
356
+ trace3 = go.Histogram(
357
+ x=df['polarity'], name='Sentiment polarity density',
358
+ marker=dict(color='rgb(102,0,0)'),
359
+ yaxis='y2'
360
+ )
361
+ trace4 = go.Histogram(
362
+ y=df['news_len'], name='news length density', marker=dict(color='rgb(102,0,0)'),
363
+ xaxis='x2'
364
+ )
365
+ data = [trace1, trace2, trace3, trace4]
366
+
367
+ layout = go.Layout(
368
+ showlegend=False,
369
+ autosize=False,
370
+ width=600,
371
+ height=550,
372
+ xaxis=dict(
373
+ domain=[0, 0.85],
374
+ showgrid=False,
375
+ zeroline=False
376
+ ),
377
+ yaxis=dict(
378
+ domain=[0, 0.85],
379
+ showgrid=False,
380
+ zeroline=False
381
+ ),
382
+ margin=dict(
383
+ t=50
384
+ ),
385
+ hovermode='x unified',
386
+ bargap=0,
387
+ xaxis2=dict(
388
+ domain=[0.85, 1],
389
+ showgrid=False,
390
+ zeroline=False
391
+ ),
392
+ yaxis2=dict(
393
+ domain=[0.85, 1],
394
+ showgrid=False,
395
+ zeroline=False
396
+ )
397
+ )
398
+
399
+ go.Figure(data=data, layout=layout)
400
+
401
+ trace1 = go.Scatter(
402
+ x=df['polarity'], y=df['word_count'], mode='markers', name='points',
403
+ marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
404
+ )
405
+ trace2 = go.Histogram2dContour(
406
+ x=df['polarity'], y=df['word_count'], name='density', ncontours=20,
407
+ colorscale='Hot', reversescale=True, showscale=False
408
+ )
409
+ trace3 = go.Histogram(
410
+ x=df['polarity'], name='Sentiment polarity density',
411
+ marker=dict(color='rgb(102,0,0)'),
412
+ yaxis='y2'
413
+ )
414
+ trace4 = go.Histogram(
415
+ y=df['word_count'], name='word count density', marker=dict(color='rgb(112,0,0)'),
416
+ xaxis='x2'
417
+ )
418
+ data = [trace1, trace2, trace3, trace4]
419
+
420
+ layout = go.Layout(
421
+ showlegend=False,
422
+ autosize=False,
423
+ width=600,
424
+ height=550,
425
+ xaxis=dict(
426
+ domain=[0, 0.85],
427
+ showgrid=False,
428
+ zeroline=False
429
+ ),
430
+ yaxis=dict(
431
+ domain=[0, 0.85],
432
+ showgrid=False,
433
+ zeroline=False
434
+ ),
435
+ margin=dict(
436
+ t=50
437
+ ),
438
+ hovermode='closest',
439
+ bargap=0,
440
+ xaxis2=dict(
441
+ domain=[0.85, 1],
442
+ showgrid=False,
443
+ zeroline=False
444
+ ),
445
+ yaxis2=dict(
446
+ domain=[0.85, 1],
447
+ showgrid=False,
448
+ zeroline=False
449
+ )
450
+ )
451
+
452
+ go.Figure(data=data, layout=layout)
453
+
454
# NOTE(review): bare `pip install ...` is shell syntax, not Python — it is a
# SyntaxError outside a notebook cell. Install via requirements.txt instead.
# pip install scattertext
# pip install spacy
457
+
458
+ import scattertext as st
459
+ import spacy
460
+ nlp = spacy.blank("en")
461
+ nlp.add_pipe('sentencizer')
462
+ #nlp.add_pipe(nlp.create_pipe('sentencizer'))
463
+ corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()
464
+ print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))
465
+
466
+ term_freq_df = corpus.get_term_freq_df()
467
+ term_freq_df['positive_sentiment'] = corpus.get_scaled_f_scores('positive')
468
+ list(term_freq_df.sort_values(by='positive_sentiment', ascending=False).index[:20])
469
+
470
+ term_freq_df['neutral_sentiment'] = corpus.get_scaled_f_scores('neutral')
471
+ list(term_freq_df.sort_values(by='neutral_sentiment', ascending=False).index[:20])
472
+
473
+ term_freq_df['negative_sentiment'] = corpus.get_scaled_f_scores('negative')
474
+ list(term_freq_df.sort_values(by='negative_sentiment', ascending=False).index[:20])
475
+
476
+ from sklearn.feature_extraction.text import TfidfVectorizer
477
+ from sklearn.decomposition import TruncatedSVD
478
+ from collections import Counter
479
+
480
+ tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
481
+ reindexed_data = df['Review Text'].values
482
+ document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)
483
+ n_topics = 10
484
+ lsa_model = TruncatedSVD(n_components=n_topics)
485
+ lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
486
+
487
def get_keys(topic_matrix):
    """Map each row of *topic_matrix* to the index of its dominant topic.

    Returns a plain list with one integer topic id per document.
    """
    return topic_matrix.argmax(axis=1).tolist()
494
+
495
def keys_to_counts(keys):
    """Tally topic assignments.

    Returns a tuple (categories, counts): the distinct topic ids in
    first-seen order and how many documents were assigned to each.
    """
    tally = Counter(keys)
    categories = list(tally.keys())
    counts = list(tally.values())
    return (categories, counts)
504
+
505
+ lsa_keys = get_keys(lsa_topic_matrix)
506
+ lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
507
+
508
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
    """Return one string per topic holding its *n* highest-weight words.

    Parameters
    ----------
    n : int — number of words per topic.
    keys : list[int] — predicted topic id for each document (see get_keys).
    document_term_matrix : sparse tf-idf matrix, one row per document.
    tfidf_vectorizer : the fitted vectorizer, used to map columns to words.

    NOTE(review): iterates over the module-level global `n_topics` rather
    than a parameter — confirm it is always defined before this is called.
    """
    top_word_indices = []
    for topic in range(n_topics):
        # Sum the tf-idf rows of every document assigned to this topic.
        temp_vector_sum = 0
        for i in range(len(keys)):
            if keys[i] == topic:
                temp_vector_sum += document_term_matrix[i]
        if isinstance(temp_vector_sum, int):
            # BUG FIX: the original crashed here (int has no .toarray()) when
            # a topic had no documents at all; use an all-zero row instead.
            temp_vector_sum = np.zeros((1, document_term_matrix.shape[1]))
        else:
            temp_vector_sum = temp_vector_sum.toarray()
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:], 0)
        top_word_indices.append(top_n_word_indices)
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            # Recover the vocabulary term for column `index` by feeding a
            # one-hot row back through the vectorizer.
            temp_word_vector = np.zeros((1, document_term_matrix.shape[1]))
            temp_word_vector[:, index] = 1
            the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0]
            # NOTE(review): .encode('ascii') raises on non-ASCII vocabulary —
            # presumably the corpus is ASCII-only; confirm.
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        top_words.append(" ".join(topic_words))
    return top_words
532
+
533
+ top_lsa=get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
534
+
535
+ for i in range(len(top_lsa)):
536
+ print("Topic {}: ".format(i+1), top_lsa[i])
537
+
538
+ top_3_words = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
539
+ labels = ['Topic {}: \n'.format(i+1) + top_3_words[i] for i in lsa_categories]
540
+ fig, ax = plt.subplots(figsize=(16,8))
541
+ ax.bar(lsa_categories, lsa_counts,color="skyblue");
542
+ ax.set_xticks(lsa_categories,);
543
+ ax.set_xticklabels(labels, rotation=45, rotation_mode='default',color="olive");
544
+ ax.set_ylabel('Number of review text on topics');
545
+ ax.set_title('Count of LSA topics');
546
+ plt.show();
547
+
548
+ """#---2----"""
549
+
550
+ df['sentiment'].value_counts()
551
+
552
+ from sklearn.model_selection import train_test_split
553
+ train,eva = train_test_split(df,test_size = 0.2)
554
+
555
# !pip install simpletransformers  # notebook magic — not valid in a .py script
556
+
557
+ from simpletransformers.classification import ClassificationModel
558
+
559
+ # Create a Transformer Model BERT
560
+ model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False)
561
+
562
# Label encoding used for training: positive -> 0, negative -> 1, neutral -> 2.
def making_label(st):
    """Encode a sentiment string as the integer class id the model expects.

    Any string other than 'positive' or 'neutral' is treated as negative.
    """
    if st == 'positive':
        return 0
    if st == 'neutral':
        return 2
    return 1
570
+
571
+ train['label'] = train['sentiment'].apply(making_label)
572
+ eva['label'] = eva['sentiment'].apply(making_label)
573
+ print(train.shape)
574
+
575
+ train_df = pd.DataFrame({
576
+ 'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
577
+ 'label': train['label'][:1500]
578
+ })
579
+
580
+ eval_df = pd.DataFrame({
581
+ 'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
582
+ 'label': eva['label'][-400:]
583
+ })
584
+
585
+ model.train_model(train_df)
586
+
587
+ result, model_outputs, wrong_predictions = model.eval_model(eval_df)
588
+
589
+ result
590
+
591
+ model_outputs
592
+
593
+ len(wrong_predictions)
594
+
595
+ lst = []
596
+ for arr in model_outputs:
597
+ lst.append(np.argmax(arr))
598
+
599
+ true = eval_df['label'].tolist()
600
+ predicted = lst
601
+
602
+ import sklearn
603
+ mat = sklearn.metrics.confusion_matrix(true , predicted)
604
+ mat
605
+
606
+ df_cm = pd.DataFrame(mat, range(3), range(3))
607
+
608
+ sns.heatmap(df_cm, annot=True)
609
+ plt.show()
610
+
611
+ print(sklearn.metrics.classification_report(true,predicted,target_names=['positive','neutral','negative']))
612
+
613
+ sklearn.metrics.accuracy_score(true,predicted)
614
+
615
+ #Give your statement
616
def get_result(statement):
    """Classify *statement* with the trained BERT model and print the label.

    Returns the predicted sentiment string ('positive'/'negative'/'neutral').
    The original returned None, so returning the label is purely additive
    and backward compatible for existing callers.
    """
    result = model.predict([statement])
    # result[1][0] holds the raw per-class outputs; take the argmax index.
    pos = np.where(result[1][0] == np.amax(result[1][0]))
    pos = int(pos[0])
    # Must be the inverse of making_label(): positive->0, negative->1, neutral->2.
    sentiment_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    print(sentiment_dict[pos])
    return sentiment_dict[pos]
623
+
624
+ ## neutral statement
625
+ get_result("According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .")
626
+
627
+ ## positive statement
628
+ get_result("According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .")
629
+
630
+ ## negative statement
631
+ get_result('Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .')
632
+
633
+ get_result("This company is growing like anything with 23% profit every year")
634
+
635
+ get_result("This company is not able to make any profit but make very less profit in last quarter")
636
+
637
+ get_result("The doctor treated well and the patient was very healthy")
638
+
639
+ get_result("the act of politicians is to serve and help needy and not to create ruck suck")
640
+
641
+ get_result("American burger is too good. Can't resisit to go and have one")
642
+
643
+ get_result("GDP per capita increased to double in India from 2013")
644
+
645
+ get_result("Indian economy is doing very good and will become super power one day.")
646
+
647
+ get_result("Indian economy is doing very good and will create millions of jobs in coming years")
648
+
649
+ get_result("Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years")
650
+
651
+ get_result("Indian economy is doing very good.Indian economy is not doing very good ")
652
+
653
+ get_result("Indian economy is not doing very good. Indian economy will bounce back to become leading economy")
654
+
655
+ get_result("Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export")
656
+
657
+ get_result("The stock market of Indian economy is dangling too much")
658
+
659
+ """#VADER"""
660
+
661
# !pip install vaderSentiment  # notebook magic — not valid in a .py script
662
+
663
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
664
+
665
+ obj = SentimentIntensityAnalyzer()
666
+
667
+ sentence = "Ram is really good "
668
+ sentiment_dict = obj.polarity_scores(sentence)
669
+ print(sentiment_dict)
670
+
671
+ #check this
672
+ sentence = "Ram is better "
673
+ sentiment_dict = obj.polarity_scores(sentence)
674
+ print(sentiment_dict)
675
+
676
+ sentence = "Rahul is really bad"
677
+ sentiment_dict = obj.polarity_scores(sentence)
678
+ print(sentiment_dict)
679
+
680
+ #punctuation
681
+ print(obj.polarity_scores('Ram is good boy'))
682
+ print(obj.polarity_scores('Ram is good boy!'))
683
+ print(obj.polarity_scores('Ram is good boy!!'))
684
+
685
+ #capitalization
686
+ print(obj.polarity_scores('Ram is good'))
687
+ print(obj.polarity_scores('Ram is GOOD'))
688
+
689
+ #degree
690
+ print(obj.polarity_scores('Ram is good'))
691
+ print(obj.polarity_scores('Ram is better'))
692
+ print(obj.polarity_scores('Ram is best'))
693
+
694
+ print(obj.polarity_scores('Ram is bad'))
695
+ print(obj.polarity_scores('Ram is worse'))
696
+ print(obj.polarity_scores('Ram is worst'))
697
+
698
+ #conjuction
699
+ print(obj.polarity_scores('Ram is good'))
700
+ print(obj.polarity_scores('Ram is good, but he is also naughty sometimes'))
701
+
702
+ #slang
703
+ print(obj.polarity_scores("That Hotel"))
704
+ print(obj.polarity_scores("That Hotel SUX"))
705
+ print(obj.polarity_scores("That Hotel SUCKS"))
706
+
707
+ #emoticons
708
+ print(obj.polarity_scores("Your :) is the most beautiful thing I have ever seen"))
709
+ print(obj.polarity_scores("Your smile is the most beautiful thing I have ever seen"))
710
+
711
+ print(obj.polarity_scores("Your :( is the worst thing I have ever seen"))
712
+ print(obj.polarity_scores("Your smile is the worst thing I have ever seen"))
713
+
714
+ #https://360digitmg.com/blog/bert-variants-and-their-differences
715
+ #https://simpletransformers.ai/docs/classification-specifics/#supported-model-types Official reference
716
+
717
+ """#3.a Using FINBERT Model"""
718
+
719
+ #PPT
720
+ #https://medium.com/@benjamin_joesy/finbert-financial-sentiment-analysis-with-bert-acf695b64ac6
721
+
722
+ from transformers import BertTokenizer, BertForSequenceClassification, pipeline
723
+
724
+ # tested in transformers==4.18.0
725
+ import transformers
726
+ transformers.__version__
727
+
728
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
729
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
730
+
731
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
732
+ results = nlp(['growth is strong and we have plenty of liquidity.',
733
+ 'there is a shortage of capital, and we need extra financing.',
734
+ 'formulation patents might protect Vasotec to a limited extent.'])
735
+
736
+ results
737
+
738
+ """#FINBERT ESG"""
739
+
740
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
741
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
742
+
743
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
744
+ results = nlp(['Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
745
+ 'Rhonda has been volunteering for several years for a variety of charitable community programs.',
746
+ 'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.',
747
+ 'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.'])
748
+
749
+ results
750
+
751
+ """#FINBERT Classification"""
752
+
753
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)
754
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
755
+
756
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
757
+ results = nlp(['we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
758
+ 'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
759
+ 'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in'])
760
+
761
+ results
762
+
763
+ X = df['Review Text'].to_list()
764
+ y = df['sentiment'].to_list()
765
+
766
+ from transformers import BertTokenizer, BertForSequenceClassification
767
+
768
+ finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
769
+ tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
770
+
771
+ labels = {0:'neutral', 1:'positive',2:'negative'}
772
+
773
+ sent_val = list()
774
+ for x in X:
775
+ inputs = tokenizer_whole(x, return_tensors="pt", padding=True)
776
+ outputs = finbert_whole(**inputs)[0]
777
+
778
+ val = labels[np.argmax(outputs.detach().numpy())]
779
+ print(x, '---->', val)
780
+ print('#######################################################')
781
+ sent_val.append(val)
782
+
783
+ from sklearn.metrics import accuracy_score
784
+ print(accuracy_score(y, sent_val))
785
+
786
+ """#Using DISTILBERT"""
787
+
788
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# NOTE(review): the base distilbert checkpoint ships an untrained
# classification head, so mapping its outputs to these three sentiment
# labels is essentially arbitrary — confirm a fine-tuned sentiment
# checkpoint was intended here.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val_bert = list()
for x in X:
    inputs = tokenizer_distilbert(x, return_tensors="pt", padding=True)
    outputs = model_distilbert(**inputs)[0]

    val = labels[np.argmax(outputs.detach().numpy())]
    print(x, '---->', val)
    print('#######################################################')
    sent_val_bert.append(val)

from sklearn.metrics import accuracy_score
# BUG FIX: the original scored `sent_val` (the FinBERT predictions) here,
# re-reporting FinBERT's accuracy instead of DistilBERT's.
print(accuracy_score(y, sent_val_bert))
807
+
808
+ """#Bert"""
809
+
810
# BUG FIX: bert-base-uncased is a BERT checkpoint, so it must be loaded with
# the BERT classes (imported earlier in this file), not the DistilBert ones —
# DistilBertForSequenceClassification cannot load BERT weights correctly.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# NOTE(review): as with the DistilBERT cell, the base checkpoint's
# classification head is untrained — confirm which fine-tuned model was meant.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val_bert1 = list()
for x in X:
    inputs = tokenizer_bert(x, return_tensors="pt", padding=True)
    outputs = model_bert(**inputs)[0]

    val = labels[np.argmax(outputs.detach().numpy())]
    print(x, '---->', val)
    print('#######################################################')
    sent_val_bert1.append(val)

from sklearn.metrics import accuracy_score
# BUG FIX: score this section's own predictions (`sent_val_bert1`), not the
# FinBERT predictions (`sent_val`) as the original did.
print(accuracy_score(y, sent_val_bert1))
gen-data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
Binary file (8.87 kB). View file