Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from wordcloud import WordCloud, STOPWORDS
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import PyPDF2
|
7 |
+
from docx import Document
|
8 |
+
import plotly.express as px
|
9 |
+
import base64
|
10 |
+
from io import BytesIO
|
11 |
+
|
12 |
+
# Functions for file reading
|
13 |
+
def read_txt(file):
    """Return the contents of an uploaded plain-text file as a string.

    Args:
        file: An object exposing ``getvalue()`` that returns the raw bytes
            of the upload.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped, and
        bytes that are not valid UTF-8 become U+FFFD instead of raising
        ``UnicodeDecodeError``.
    """
    raw_bytes = file.getvalue()
    # "utf-8-sig" strips a BOM if one is present; errors="replace" keeps a
    # single stray byte from aborting the whole run.
    return raw_bytes.decode("utf-8-sig", errors="replace")
|
15 |
+
|
16 |
+
def read_docx(file):
    """Return the text of a .docx upload as one space-joined string.

    Args:
        file: Whatever ``Document`` accepts as its source; passed through
            unchanged.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with single spaces.
    """
    paragraphs = Document(file).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)
|
19 |
+
|
20 |
+
def read_pdf(file):
    """Return the extracted text of a PDF upload as one space-joined string.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source; passed
            through unchanged.

    Returns:
        The result of ``extract_text()`` for every page, in page order,
        joined with single spaces.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(page_texts)
|
23 |
+
|
24 |
+
# Function to filter out stopwords
|
25 |
+
def filter_stopwords(text, additional_stopwords=None, *, include_standard=True):
    """Remove stopwords from ``text`` and return the remaining words.

    Matching is case-insensitive: each word of ``text`` and each entry of
    ``additional_stopwords`` is lower-cased before comparison, so choosing
    "The" as a stopword also removes "the" and "THE".

    Args:
        text: Input string; it is split on whitespace.
        additional_stopwords: Optional iterable of extra words to drop.
            ``None`` (the default) means no extra words.
        include_standard: Keyword-only. When true (the default) the
            module-level ``STOPWORDS`` set is applied as well; pass
            ``False`` to filter on ``additional_stopwords`` alone.

    Returns:
        The surviving words joined by single spaces (original whitespace
        is not preserved).
    """
    # Build a fresh set on every call so neither the caller's collection
    # nor the shared STOPWORDS set is ever mutated.
    blocked = {word.lower() for word in (additional_stopwords or ())}
    if include_standard:
        blocked |= STOPWORDS
    return " ".join(word for word in text.split() if word.lower() not in blocked)
|
30 |
+
|
31 |
+
# Function to create download link for plot
|
32 |
+
def get_image_download_link(buffered, format_):
    """Build an HTML anchor that downloads the rendered plot via a data URI.

    Args:
        buffered: An object exposing ``getvalue()`` that returns the
            encoded image bytes (for example a ``BytesIO``).
        format_: File format / extension of the image, e.g. ``"png"``.

    Returns:
        An ``<a>`` tag whose ``href`` embeds the base64-encoded bytes and
        whose ``download`` attribute is ``wordcloud.<format_>``.
    """
    # The media type is not always "image/<extension>": SVG and PDF have
    # their own registered types. Unknown formats keep the old fallback.
    media_types = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "svg": "image/svg+xml",
        "pdf": "application/pdf",
    }
    media_type = media_types.get(format_.lower(), f"image/{format_}")
    image_base64 = base64.b64encode(buffered.getvalue()).decode("ascii")
    return (
        f'<a href="data:{media_type};base64,{image_base64}" '
        f'download="wordcloud.{format_}">Download Plot as {format_}</a>'
    )
|
35 |
+
|
36 |
+
# Function to generate a download link for a DataFrame
|
37 |
+
def get_table_download_link(df, filename, file_label):
    """Build an HTML anchor that downloads ``df`` as a CSV via a data URI.

    Args:
        df: DataFrame to export; it is written without the index column.
        filename: Name placed in the anchor's ``download`` attribute.
        file_label: Visible link text.

    Returns:
        An ``<a>`` tag whose ``href`` embeds the base64-encoded CSV.
    """
    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode()).decode("ascii")
    # "text/csv" is the registered media type for CSV data.
    return f'<a href="data:text/csv;base64,{b64}" download="{filename}">{file_label}</a>'
|
41 |
+
|
42 |
+
# Streamlit code
|
43 |
+
# Page header.
st.title("Word Cloud Generator")
st.subheader("📁 Upload a pdf, docx or text file to generate a word cloud")

# Upload widget, limited to the three formats this script has readers for.
uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
# NOTE: the 'deprecation.showPyplotGlobalUse' option is deliberately not set.
# This script always hands an explicit figure to st.pyplot, so the warning
# that option silenced cannot fire. Newer Streamlit releases appear to
# reject the option as unrecognised -- verify against the pinned version.
|
48 |
+
|
49 |
+
# Main flow: read the upload, let the user tune stopwords and size, render
# the word cloud, then offer the plot and the word-count table for download.
if uploaded_file:
    # Echo basic metadata so the user can confirm the right file was picked.
    file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
    st.write(file_details)

    # Choose a reader from the MIME type reported for the upload.
    if uploaded_file.type == "text/plain":
        text = read_txt(uploaded_file)
    elif uploaded_file.type == "application/pdf":
        text = read_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        text = read_docx(uploaded_file)
    else:
        st.error("File type not supported. Please upload a txt, pdf or docx file.")
        st.stop()

    # Count the unfiltered words first: the 50 most frequent ones populate
    # the "additional stopwords" picker below.
    words = text.split()
    word_count = (
        pd.DataFrame({'Word': words})
        .groupby('Word')
        .size()
        .reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )

    # Sidebar: stopword controls.
    use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
    top_words = word_count['Word'].head(50).tolist()
    additional_stopwords = st.sidebar.multiselect("Additional stopwords:", sorted(top_words))

    # Words are compared lower-cased, so the selected stopwords must be
    # lower-cased too or a capitalised pick such as "The" would never match.
    extra_stopwords = {word.lower() for word in additional_stopwords}
    if use_standard_stopwords:
        all_stopwords = STOPWORDS.union(extra_stopwords)
        text = filter_stopwords(text, extra_stopwords)
    else:
        # filter_stopwords always applies STOPWORDS, so filter inline here to
        # honour the unchecked box.
        all_stopwords = extra_stopwords
        text = " ".join(word for word in text.split() if word.lower() not in all_stopwords)

    if text:
        # Word cloud dimensions, in pixels.
        width = st.sidebar.slider("Select Word Cloud Width", 400, 2000, 1200, 50)
        height = st.sidebar.slider("Select Word Cloud Height", 200, 2000, 800, 50)

        # Generate the word cloud.
        st.subheader("Generated Word Cloud")
        fig, ax = plt.subplots(figsize=(width / 100, height / 100))  # pixels -> inches
        # Pass the stopword set explicitly so WordCloud does not fall back to
        # its own default list when the checkbox is off.
        wordcloud_img = WordCloud(
            width=width,
            height=height,
            background_color='white',
            max_words=200,
            contour_width=3,
            contour_color='steelblue',
            stopwords=all_stopwords,
        ).generate(text)
        ax.imshow(wordcloud_img, interpolation='bilinear')
        ax.axis('off')

        # Export options for the plot.
        format_ = st.selectbox("Select file format to save the plot", ["png", "jpeg", "svg", "pdf"])
        resolution = st.slider("Select Resolution", 100, 500, 300, 50)

        st.pyplot(fig)
        if st.button(f"Save as {format_}"):
            buffered = BytesIO()
            # Save this figure explicitly instead of relying on pyplot's
            # implicit "current figure".
            fig.savefig(buffered, format=format_, dpi=resolution)
            st.markdown(get_image_download_link(buffered, format_), unsafe_allow_html=True)
        # Release the figure once rendered; the script is re-executed on every
        # interaction and would otherwise keep accumulating open figures.
        plt.close(fig)

        # Word count table (after stopword filtering), shown once at the end.
        st.subheader("Word Count Table")
        words = text.split()
        word_count = (
            pd.DataFrame({'Word': words})
            .groupby('Word')
            .size()
            .reset_index(name='Count')
            .sort_values('Count', ascending=False)
        )
        st.write(word_count)
        # Provide download link for table.
        if st.button('Download Word Count Table as CSV'):
            st.markdown(get_table_download_link(word_count, "word_count.csv", "Click Here to Download"), unsafe_allow_html=True)
    else:
        st.warning("No words left to plot after removing stopwords.")

# Sidebar footer: promotional video and author credits.
st.sidebar.markdown("---")
st.sidebar.subheader("Learn Advanced AI| Hope to Skill with Irfan Malik")
st.sidebar.video("https://www.youtube.com/watch?v=vCKF1vYVNTs&list=PLxf3-FrL8GzRALeq_9BtdQclN6SF4bTCG")
st.sidebar.markdown("---")
st.sidebar.markdown("Created by: [Virtual Student](https://github.com/dashboard)")
st.sidebar.markdown("website:(https://xevenskills.com)")
|
125 |
+
|