Spaces:

Ahtisham1583
/

Social_media_senti

Runtime error

App Files Files Community

Social_media_senti / app.py

Ahtisham1583

Update app.py

cdfa715 verified about 1 year ago

raw

history blame contribute delete

3.39 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import nltk
	from sklearn.feature_extraction.text import CountVectorizer
	import re
	from nltk.corpus import stopwords
	from nltk.stem import SnowballStemmer

	# Download the NLTK resources that preprocess_text relies on.
	# `nltk.word_tokenize` loads the 'punkt' model on older NLTK releases and
	# the 'punkt_tab' tables on recent ones (NLTK 3.9+), so both are requested
	# to keep tokenization working regardless of the installed version.
	nltk.download('punkt')
	nltk.download('punkt_tab')
	nltk.download('stopwords')

	# Lazily-filled cache so the stop-word list and stemmer are built once
	# instead of once per processed row.
	_ENGLISH_RESOURCES = {}


	def _english_resources():
	    """Return the (stop-word set, stemmer) pair, building it on first use."""
	    if not _ENGLISH_RESOURCES:
	        _ENGLISH_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
	        _ENGLISH_RESOURCES['stemmer'] = SnowballStemmer('english')
	    return _ENGLISH_RESOURCES['stop_words'], _ENGLISH_RESOURCES['stemmer']


	# Function to preprocess text
	def preprocess_text(text):
	    """Normalize one piece of text into a space-separated string of stems.

	    Steps, in order: lowercase, strip URLs (``http...``), ``@`` mentions and
	    ``#`` hashtags, replace every non-alphabetic character with a space,
	    tokenize with NLTK, drop English stop words, and apply the English
	    Snowball stemmer.

	    Args:
	        text: The raw text. ``None`` and NaN (how pandas represents a
	            missing cell) are treated as empty text; any other non-string
	            value is converted with ``str()`` first.

	    Returns:
	        The processed text as a single string; ``''`` when nothing remains.
	    """
	    if not isinstance(text, str):
	        # NaN is the only float that is not equal to itself.
	        if text is None or (isinstance(text, float) and text != text):
	            return ''
	        text = str(text)

	    # Convert text to lowercase
	    text = text.lower()
	    # Remove URLs
	    text = re.sub(r'http\S+', '', text)
	    # Remove @ mentions
	    text = re.sub(r'@\S+', '', text)
	    # Remove hashtags
	    text = re.sub(r'#\S+', '', text)
	    # Remove non-alphabetic characters
	    text = re.sub(r'[^a-zA-Z]', ' ', text)
	    # Tokenize text
	    tokens = nltk.word_tokenize(text)
	    # Remove stopwords, stem what is left, and join into a single string
	    stop_words, stemmer = _english_resources()
	    return ' '.join(
	        stemmer.stem(word) for word in tokens if word not in stop_words
	    )

	# Function to load and preprocess DataFrame
	def load_and_preprocess_dataframe(file_path):
	    """Read the CSV at *file_path* and add a ``processed_text`` column.

	    The new column holds the result of running ``preprocess_text`` over
	    every value of the ``text`` column. Returns the resulting DataFrame.
	    """
	    frame = pd.read_csv(file_path)
	    cleaned = frame['text'].apply(preprocess_text)
	    frame['processed_text'] = cleaned
	    return frame

	# Function to create Document-Term Matrix (DTM)
	def create_dtm(dataframe):
	    """Build a document-term matrix from ``dataframe['processed_text']``.

	    A fresh ``CountVectorizer`` is fitted on the column and used to
	    transform it in one step.

	    Returns:
	        A ``(dtm, vectorizer)`` tuple: the matrix produced by
	        ``fit_transform`` and the fitted vectorizer.
	    """
	    documents = dataframe['processed_text']
	    count_vectorizer = CountVectorizer()
	    term_matrix = count_vectorizer.fit_transform(documents)
	    return term_matrix, count_vectorizer

	# Function to plot word frequency
	def plot_word_frequency(dtm, vectorizer):
	    """Render a bar chart of the 20 most frequent terms in the Streamlit app.

	    Args:
	        dtm: Document-term matrix whose columns line up with the
	            vectorizer's feature names (as returned by ``create_dtm``).
	        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``.

	    The chart is drawn on an explicit figure that is handed to
	    ``st.pyplot`` and then closed, rather than relying on pyplot's global
	    "current figure", so figures do not pile up across Streamlit reruns.
	    """
	    # Summing over axis 0 gives the total count of each vocabulary term.
	    totals = np.ravel(dtm.sum(axis=0))
	    words = vectorizer.get_feature_names_out()
	    word_freq_df = pd.DataFrame({'word': words, 'frequency': totals})
	    top_words = word_freq_df.sort_values(by='frequency', ascending=False).head(20)

	    fig, ax = plt.subplots(figsize=(10, 6))
	    try:
	        sns.barplot(x='word', y='frequency', data=top_words, ax=ax)
	        ax.set_title('Top 20 Words Frequency')
	        ax.set_xlabel('Words')
	        ax.set_ylabel('Frequency')
	        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
	        fig.tight_layout()
	        st.pyplot(fig)
	    finally:
	        # Release the figure even if plotting or rendering fails.
	        plt.close(fig)

	# Streamlit app
	def main():
	    """Run the Streamlit page: upload a CSV, optionally preprocess and plot.

	    Without an uploaded file only the title and the sidebar uploader are
	    shown. With a file, the sidebar offers a "Preprocess DataFrame"
	    checkbox:

	    * unchecked - the first rows of the CSV are displayed as-is;
	    * checked - the ``text`` column is cleaned into ``processed_text``,
	      then the document-term matrix and the word-frequency chart are shown.

	    A CSV without a ``text`` column, or one whose text yields no terms, is
	    reported in the page instead of raising an exception.
	    """
	    st.title("Tweet Data Analysis")
	    st.sidebar.header("Upload CSV file")

	    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
	    if uploaded_file is None:
	        return

	    st.sidebar.success('File uploaded successfully!')
	    st.sidebar.markdown("### Preprocessing Options")
	    preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")

	    # Load DataFrame
	    df = pd.read_csv(uploaded_file)

	    if not preprocess_checkbox:
	        st.subheader("Original DataFrame")
	        st.write(df.head())
	        return

	    # Preprocessing reads df['text']; fail with a message, not a KeyError.
	    if 'text' not in df.columns:
	        st.error("The uploaded CSV must contain a 'text' column to preprocess.")
	        return

	    # Preprocess DataFrame
	    df['processed_text'] = df['text'].apply(preprocess_text)
	    st.subheader("Preprocessed DataFrame")
	    st.write(df.head())

	    # Create DTM
	    st.subheader("Document-Term Matrix (DTM)")
	    try:
	        dtm, vectorizer = create_dtm(df)
	    except ValueError as err:
	        # CountVectorizer raises ValueError when no terms are left,
	        # e.g. an empty file or rows made only of stop words.
	        st.warning(f"Could not build the document-term matrix: {err}")
	        return
	    st.write(dtm)

	    # Plot word frequency
	    st.subheader("Word Frequency Plot")
	    plot_word_frequency(dtm, vectorizer)

	# Start the app only when this file is executed as the main script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()