Commit · e0c55bb
Parent(s): 26e930a
Upload 28 files
- .gitattributes +1 -0
- app.py +69 -0
- conclusion.py +23 -0
- eda.py +190 -0
- functions/__init__.py +0 -0
- functions/text_preprocessed.py +46 -0
- images/Boxplot_of_Ratings_by_Category.png +0 -0
- images/Category_Popularity.png +0 -0
- images/Distribution_of_Sentiment_Polarity.png +0 -0
- images/Distribution_of_Sentiment_Subjectivity.png +0 -0
- images/Distribution_of_Text_Length_Character_by_Sentiment.png +0 -0
- images/Model_Evaluatio_GRU.png +0 -0
- images/Model_Evaluation_CNN.png +0 -0
- images/Model_Evaluation_LSTM.png +0 -0
- images/Negative_Sentiment_Words.png +0 -0
- images/Neutral_Sentiment_Words.png +0 -0
- images/Positive_Sentiment_Words.png +0 -0
- images/Price_Distribution_Among_Paid_Apps.png +0 -0
- images/Rating_vs_Reviews.png +0 -0
- images/Sentiment_Distribution.png +0 -0
- images/rating_distribution.png +0 -0
- images/reviews_distribution.png +0 -0
- lstm/fingerprint.pb +3 -0
- lstm/keras_metadata.pb +3 -0
- lstm/saved_model.pb +3 -0
- lstm/variables/variables.data-00000-of-00001 +3 -0
- lstm/variables/variables.index +0 -0
- model.py +101 -0
- requirements.txt +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+lstm/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,69 @@
import streamlit as st
import eda
import model
import conclusion

# Sidebar
st.sidebar.header("Choose Here!")
options = ['Home Page', 'Exploratory Data Analysis', 'Test our Model!', 'Conclusion']
page = st.sidebar.selectbox(label='Select Page:', options=options)

# Home Page
if page == 'Home Page':
    st.header('Feedback to Foresight: Simplifying App Review Sentiment Analysis')
    st.caption("This project was carried out as the final collaborative project of Hacktiv8's Data Science programme.")
    st.caption('Please check our GitHub repository [here!](https://github.com/devinlee14/FTDS-009-HCK-group-002)')
    st.markdown('---')

    st.markdown('''
#### Group members:
* Devin Yaung Lee — Data Analyst
* Fernaldy Aristo Wirjowerdojo — Data Engineer
* Muhammad Furqon Pakpahan — Data Engineer
* Sifra Hilda Juliana Siregar — Data Scientist
    ''')
    st.write('')

    st.caption('Please select another page in the `Select Page` box on the left side of your screen to get started!')
    st.write('')

    with st.expander("Project Overview"):
        st.caption('''
This project focuses on performing sentiment analysis on Google Play Store app reviews.
Utilizing Natural Language Processing (NLP), the goal is to analyse user feedback
to gain insights into satisfaction and app perception.
        ''')

    with st.expander("Problem Statement"):
        st.caption('''
In the competitive landscape of mobile applications, user feedback in app reviews is a
goldmine of insights that can inform product development and marketing strategies.
However, these reviews are often unstructured, making it challenging to efficiently extract,
categorize, and analyze sentiments and opinions. There is a need for an automated system
that can process this feedback to provide actionable insights, identify trends in user sentiment,
and highlight areas for improvement. This project aims to address the lack of structured
analysis of user-generated content in app reviews on the Google Play Store, which,
if leveraged correctly, can significantly enhance user satisfaction and app performance in the market.
        ''')

    with st.expander("Objectives"):
        st.caption('''
* **Develop an Automated Sentiment Analysis Model**
Build and train a TensorFlow model to classify app reviews into positive, negative, and neutral sentiments with high accuracy.

* **Understand User Feedback in Depth**
Utilize the sentiment analysis model to delve into the nuances of user feedback on the Google Play Store.
        ''')

# EDA
elif page == 'Exploratory Data Analysis':
    eda.run()

# Model
elif page == 'Test our Model!':
    model.run()

# Conclusion
else:
    conclusion.run()
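Note: app.py only dispatches pages; it assumes that eda.py, model.py, and conclusion.py each expose a module-level run() function that draws the page when called. A minimal sketch of that contract (the page content below is illustrative only, not from the repository):

import streamlit as st

def run():
    # Every page module follows this shape: app.py imports the module
    # and calls run() when its entry is selected in the sidebar.
    st.header('Example Page')
    st.caption('Illustrative placeholder content.')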
conclusion.py
ADDED
@@ -0,0 +1,23 @@
import streamlit as st

# Fill with project conclusion
def run():
    st.header('Conclusion')
    st.markdown('---')
    # -------------------------------------------------------------------------
    st.markdown('''
**Model Deployment**: Integrating the sentiment analysis model into Google's app review system can be leveraged to achieve several outcomes:
* Quality Control: Real-time sentiment scoring can be used to flag apps with consistently poor sentiment for quality review, ensuring that the apps offered on the Play Store maintain a high standard
* Trend Detection: Google can monitor sentiment trends for early detection of issues like bugs in recent app updates, or to identify apps that are suddenly gaining positive attention, which could then be featured or recommended
    ''')
    st.markdown('---')
    st.markdown('''
**Strategic Actions**: Providing sentiment analysis feedback to app developers can be expanded upon for further strategic initiatives:
* Automated Category Insights: An automated system could provide developers with real-time analytics on how their app's sentiment compares to the average within its category, including highlighting specific aspects like customer service, usability, or functionality that may need attention
* Benchmarking and Best Practices: Developers can receive benchmark reports comparing their apps with top-performing ones in the same category, offering insights into best practices and areas for improvement
* Predictive Analytics for Developers: By analyzing sentiment trends, Google can offer predictive insights to developers, helping them anticipate user needs and expectations, and guiding them on when to release updates or introduce new features
* Content Moderation Strategies: Using sentiment analysis to prioritize the review of content can help:
    * Improve Moderation Efficiency: Focus human moderators' efforts on the most critical content first, improving the efficiency of the moderation process
    * Enhance App Safety: Quickly address apps with negative sentiments that might be related to safety or compliance issues, maintaining a safe environment for all users
    * Refine Automated Systems: Feed sentiment analysis data into automated content moderation systems to improve their accuracy and responsiveness
    ''')
eda.py
ADDED
@@ -0,0 +1,190 @@
import streamlit as st

def run():
    st.title("Exploratory Data Analysis")
    st.markdown('---')

    # -----------------------------------------------------------------------------------------
    # Rating Distribution
    st.markdown('### Rating Distribution')
    st.image('images/rating_distribution.png', caption='Figure 1')
    with st.expander('Explanation'):
        st.caption('From this histogram of the rating distribution, we know:')
        st.caption('''
* The distribution is skewed towards higher ratings, with most apps receiving ratings between 4.0 and 4.7
* The highest frequency of ratings is at 4.4, followed closely by 4.3 and 4.5, indicating that a large number of apps are rated favorably
* Very few apps have ratings lower than 3.0, suggesting either a selection of generally well-received apps or a tendency for users to rate apps more favorably
        ''')
        st.caption('''
The drop in frequency for ratings 4.8 and above could indicate a standard where few apps
are rated as near-perfect. Overall, this distribution indicates a trend where users rate apps positively,
with few instances of very low ratings.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Review Distribution
    st.markdown('### Review Distribution')
    st.image('images/reviews_distribution.png', caption='Figure 2')
    with st.expander('Explanation'):
        st.caption('''
The distribution is highly right-skewed, indicating that a large number of apps have only a small
number of reviews, which could be because they are new or simply unpopular, while only a few apps have a
very high number of reviews. This pattern suggests that a small subset of apps receives the majority
of user attention in terms of reviews.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Price distribution among paid apps
    st.markdown('### Price Distribution among Paid Apps')
    st.image('images/Price_Distribution_Among_Paid_Apps.png', caption='Figure 3')
    with st.expander('Explanation'):
        st.caption('''
Most paid apps are priced below $10, with peaks at around the $2 and $4 price points.
There are fewer apps at higher price points, indicating that lower-priced apps are
more common and potentially more popular among users.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Ratings vs Reviews
    st.markdown('### Ratings vs Reviews')
    st.image('images/Rating_vs_Reviews.png', caption='Figure 4')
    with st.expander('Explanation'):
        st.caption('''
There is a concentration of apps with high ratings and a moderate number of reviews,
very few apps have low ratings, and apps with a near-perfect rating are relatively rare.
This may indicate that well-rated apps tend to receive a good number of reviews, but not all popular apps
(in terms of the number of reviews) are necessarily high-rated.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Category popularity
    st.markdown('### Category Popularity')
    st.image('images/Category_Popularity.png', caption='Figure 5')
    with st.expander('Explanation'):
        st.caption('''
The 'Game' category is the most popular, followed by 'Family' and 'Health & Fitness',
suggesting these are the most common types of apps. Less populated categories like
'Events' and 'Comics' may represent more niche markets.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Ratings by Category
    st.markdown('### Ratings by Category')
    st.image('images/Boxplot_of_Ratings_by_Category.png', caption='Figure 6')
    with st.expander('Explanation'):
        st.caption('''
* Most categories have median ratings above 4.0, indicating generally positive reception of apps across all categories
* Some categories show a wide range of ratings (evidenced by longer boxes), indicating more variability in how users rate these apps
* Categories with tight boxes, where Q1 and Q3 are close together, indicate more consistency in ratings
* Outliers are present in many categories, both on the high and low ends, suggesting that there are a few apps that are rated significantly differently from the majority in their category
        ''')
        st.caption('''
Overall, this plot provides a comprehensive view of how apps are rated within each category,
showing general user satisfaction and highlighting categories with more diverse user opinions.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Sentiment Distribution
    st.markdown('### Sentiment Distribution')
    st.image('images/Sentiment_Distribution.png', caption='Figure 7')
    with st.expander('Explanation'):
        st.caption('''
* The 'positive' category has the highest count, exceeding 20,000 items
* The 'negative' category has a lower count, roughly around 7,500 items
* The 'neutral' category has the least, with just under 5,000 items
        ''')
        st.caption('This suggests that positive sentiment predominates significantly over negative and neutral sentiments among the reviews analyzed.')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Sentiment Polarity Distribution
    st.markdown('### Sentiment Polarity Distribution')
    st.image('images/Distribution_of_Sentiment_Polarity.png', caption='Figure 8')
    with st.expander('Explanation'):
        st.caption('''
The chart shows a large concentration of scores around 0, indicating a high frequency of neutral sentiments.
There is a notable spike at exactly 0, which is significantly higher than any other value, suggesting a
large number of entries with a perfectly neutral sentiment. The distribution is somewhat bimodal, with smaller peaks
in the positive range (around 0.5) and negative range (around -0.25 to -0.5), implying clusters of positive and
negative sentiments as well. However, the positive sentiments appear to have a slightly wider spread
with multiple smaller peaks, while negative sentiments are more concentrated around their peak.
Overall, this suggests that the data contains a high volume of neutral sentiments, with a presence of
both positive and negative sentiments, and a broader diversity of positive sentiment intensities.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Sentiment Subjectivity Distribution
    st.markdown('### Sentiment Subjectivity Distribution')
    st.image('images/Distribution_of_Sentiment_Subjectivity.png', caption='Figure 9')
    with st.expander('Explanation'):
        st.caption('Sentiment subjectivity measures how subjective a review is, where 0 indicates a fully objective text and 1 indicates a highly subjective one.')
        st.caption('''
* A high peak at 0, suggesting a significant number of texts are classified with no subjectivity, meaning they are likely to be factual or objective
* Several moderate peaks throughout, especially noticeable around 0.2, 0.5, 0.6, and towards the higher end at 1.0
* The peaks at 0.5 and higher indicate a considerable number of texts contain subjective opinions
        ''')
        st.caption('''
The distribution is somewhat uneven, suggesting varying levels of opinion across the dataset,
with a notable amount of completely objective (or detected as such) texts and others
expressing different degrees of subjectivity. The presence of multiple peaks indicates that
texts do not conform to a single level of subjectivity but vary widely, which might be typical in
datasets containing both factual information and personal opinions.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Text Length by Sentiment
    st.markdown('### Text Length by Sentiment')
    st.image('images/Distribution_of_Text_Length_Character_by_Sentiment.png', caption='Figure 10')
    with st.expander('Explanation'):
        st.caption('''
The distribution of text length for reviews shows that neutral sentiment texts are generally shorter,
with a mean length of around 7 words and a median of 5 words. Positive sentiment texts are longer,
with a mean of approximately 19 words and a median of 17 words, while negative sentiment texts have a mean
length close to 17 words and a median of 14 words. This could indicate that users tend to be more
verbose when expressing positive or negative sentiments, while neutral comments are more concise.
        ''')
    st.write('')
    st.write('')

    # -----------------------------------------------------------------------------------------
    # Wordclouds by Sentiment
    st.markdown('## Wordcloud by Sentiment')

    st.markdown('### Positive')
    st.image('images/Positive_Sentiment_Words.png', caption='Figure 11')
    st.write('')

    st.markdown('### Negative')
    st.image('images/Negative_Sentiment_Words.png', caption='Figure 12')
    st.write('')

    st.markdown('### Neutral')
    st.image('images/Neutral_Sentiment_Words.png', caption='Figure 13')
    with st.expander('Explanation'):
        st.caption('''
The word clouds for positive, negative, and neutral sentiments highlight the most frequently used
words in each category.
* Positive sentiment texts are dominated by words like "love", "great", "good" and "best", reflecting strong satisfaction
* Negative sentiment texts frequently include words like "bad", "problem", "worst" and "annoying", pointing to dissatisfaction
* Neutral sentiment texts feature words like "update", "phone" and "app", which may relate to more factual or inquiry-based content rather than opinion.
        ''')
functions/__init__.py
ADDED
File without changes
functions/text_preprocessed.py
ADDED
@@ -0,0 +1,46 @@
"""
// text_preprocessed.py //
This module stores the text preprocessing function used throughout this project.
"""


import re
from nltk.tokenize import word_tokenize

# Create a function for text preprocessing
def text_preprocessing(text, lemmatizer, sw):
    # Case folding
    text = text.lower()

    # Mention removal
    text = re.sub("@[A-Za-z0-9_]+", " ", text)

    # Hashtag removal
    text = re.sub("#[A-Za-z0-9_]+", " ", text)

    # Newline removal (\n)
    text = re.sub(r"\\n", " ", text)

    # Whitespace removal
    text = text.strip()

    # URL removal
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)

    # Non-letter removal (emoticons, symbols such as μ, $, 兀, etc.)
    text = re.sub(r"[^A-Za-z\s']", " ", text)

    # Tokenization
    tokens = word_tokenize(text)

    # Stopword removal
    tokens = [word for word in tokens if word not in sw]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Combine tokens back into a single string
    text = ' '.join(tokens)

    return text
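Note: text_preprocessing() expects the caller to supply the lemmatizer and stopword set, which is how model.py uses it. A minimal usage sketch, assuming the NLTK 'punkt', 'stopwords', and 'wordnet' resources are downloaded as in model.py; the sample review string is hypothetical:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from functions.text_preprocessed import text_preprocessing

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Hypothetical review used only to illustrate the cleaning steps
raw_review = "Loving the new update!! check https://example.com @devteam #useful"
print(text_preprocessing(raw_review, lemmatizer, stop_words))
# Prints a lowercased, lemmatized string with mentions, hashtags,
# URLs, and stopwords removed, e.g. "loving new update check"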
images/Boxplot_of_Ratings_by_Category.png
ADDED
images/Category_Popularity.png
ADDED
images/Distribution_of_Sentiment_Polarity.png
ADDED
images/Distribution_of_Sentiment_Subjectivity.png
ADDED
images/Distribution_of_Text_Length_Character_by_Sentiment.png
ADDED
images/Model_Evaluatio_GRU.png
ADDED
images/Model_Evaluation_CNN.png
ADDED
images/Model_Evaluation_LSTM.png
ADDED
images/Negative_Sentiment_Words.png
ADDED
images/Neutral_Sentiment_Words.png
ADDED
images/Positive_Sentiment_Words.png
ADDED
images/Price_Distribution_Among_Paid_Apps.png
ADDED
images/Rating_vs_Reviews.png
ADDED
images/Sentiment_Distribution.png
ADDED
images/rating_distribution.png
ADDED
images/reviews_distribution.png
ADDED
lstm/fingerprint.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:899dd0b94437c31a4a06dbace1fed8937a202069f7ce55f7f16759c4d657ad85
size 58
lstm/keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e5483175b5876789d632cc61e8bae4301be7fdd7c456ab687e3a007150e10333
size 39507
lstm/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b7f7cb37ce193381331ccb0f964861920d081dd67e1461f8acd57b712b2669e
size 6416295
lstm/variables/variables.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44f4a0db29f0487af5f79c98389dc6d7225069c328f699908b4006ffcc490354
size 42873689
lstm/variables/variables.index
ADDED
Binary file (4.33 kB)
model.py
ADDED
@@ -0,0 +1,101 @@
import streamlit as st
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from functions.text_preprocessed import text_preprocessing
from tensorflow.keras.models import load_model

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def run():
    st.title('Predict')
    st.write('You can use our model by inputting your text (review) here:')

    # -------------------------------------------------------------------------
    # Dataframe
    data = pd.DataFrame()

    # -------------------------------------------------------------------------
    # App name
    data['app'] = [st.text_input('Application name')]

    # -------------------------------------------------------------------------
    # Category
    data['category'] = [st.text_input('Application category')]

    # -------------------------------------------------------------------------
    # Rating
    data['rating'] = [round(st.slider('Rating', min_value=0.0, max_value=5.0), 1)]

    # -------------------------------------------------------------------------
    # Reviews
    data['reviews'] = [st.number_input('Total review count', min_value=0)]

    # -------------------------------------------------------------------------
    # Size
    data['size'] = [st.text_input('File size')]

    # -------------------------------------------------------------------------
    # Installs
    data['installs'] = [st.number_input('Total installs', min_value=0)]

    # -------------------------------------------------------------------------
    # Type
    data['type'] = [st.text_input('Paid / Free')]

    # -------------------------------------------------------------------------
    # Price
    data['price'] = [st.number_input('Application price', min_value=0.00)]

    # -------------------------------------------------------------------------
    # Content rating
    data['content_rating'] = [st.text_input('Age rating')]

    # -------------------------------------------------------------------------
    # Genres
    data['genres'] = [st.text_input("Genres").split(',')]
    st.caption("Separate by ',' if multiple genres")

    # -------------------------------------------------------------------------
    # Last updated
    data['last_updated'] = [st.date_input('Last updated')]

    # -------------------------------------------------------------------------
    # Current version
    data['current_ver'] = [st.text_input('Current version')]

    # -------------------------------------------------------------------------
    # Android version
    data['android_ver'] = [st.text_input('Android version')]

    # -------------------------------------------------------------------------
    # Review
    review = st.text_input('Application review (in English)')
    ## Stop words
    stop_words = set(stopwords.words('english'))
    ## Lemmatizer
    lemmatizer = WordNetLemmatizer()
    ## Processed text
    text_processed = text_preprocessing(review, lemmatizer, stop_words)

    data['translated_review'] = [review]
    data['text_processed'] = [text_processed]

    # -------------------------------------------------------------------------
    # User data
    st.dataframe(data.T, width=800, height=565)

    # -------------------------------------------------------------------------
    # Prediction
    if st.button('Predict'):
        model = load_model('lstm')
        sentiment_pred = model.predict(data['text_processed'])
        # st.write(sentiment_pred)
        if sentiment_pred > 1.5:
            st.write('Positive Review')
        elif (sentiment_pred < 1.5) & (sentiment_pred >= 1.0):
            st.write('Negative Review')
        else:
            st.write('Neutral Review')
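Note: the Predict button maps the model's raw output to a label with the 1.0 and 1.5 thresholds above, which implies the saved LSTM returns a single continuous score per review. If that assumption holds, the mapping could be isolated in a small helper for reuse and testing; a minimal sketch (the function name and the output-shape assumption are ours, not from the repository):

import numpy as np

def score_to_label(pred, low=1.0, high=1.5):
    # `pred` is assumed to be the array returned by model.predict(...)
    # for a single review, e.g. shape (1, 1); thresholds mirror model.py.
    score = float(np.asarray(pred).ravel()[0])
    if score > high:
        return 'Positive Review'
    elif low <= score < high:
        return 'Negative Review'
    return 'Neutral Review'

# Example use inside run(): st.write(score_to_label(model.predict(data['text_processed'])))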
requirements.txt
ADDED
@@ -0,0 +1,3 @@
nltk
tensorflow==2.14.0
pandas