Update app.py
Browse files
app.py
CHANGED
@@ -1,125 +1,29 @@
|
|
1 |
-
import base64
import html
from io import BytesIO

import matplotlib.pyplot as plt
import pandas as pd
import PyPDF2
import streamlit as st
from docx import Document
from wordcloud import WordCloud, STOPWORDS
|
11 |
-
|
12 |
-
# Functions for file reading
|
13 |
-
def read_txt(file):
    """Return the contents of an uploaded plain-text file as a string.

    Args:
        file: Object exposing ``getvalue()`` that returns the raw bytes of
            the upload.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped so it
        does not stick to the first word, and bytes that are not valid UTF-8
        become U+FFFD instead of raising ``UnicodeDecodeError``.
    """
    return file.getvalue().decode("utf-8-sig", errors="replace")
|
15 |
-
|
16 |
-
def read_docx(file):
    """Return the text of every paragraph in a .docx file, joined by single spaces."""
    document = Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)
|
19 |
-
|
20 |
-
def read_pdf(file):
    """Return the extracted text of every page of a PDF, joined by single spaces."""
    reader = PyPDF2.PdfReader(file)
    page_texts = []
    for page in reader.pages:
        page_texts.append(page.extract_text())
    return " ".join(page_texts)
|
23 |
-
|
24 |
-
# Function to filter out stopwords
|
25 |
-
def filter_stopwords(text, additional_stopwords=None):
    """Remove stopwords from whitespace-separated text.

    Args:
        text: Input string; it is split on whitespace.
        additional_stopwords: Optional iterable of extra words to drop on top
            of the module-level ``STOPWORDS`` set, which is always applied.

    Returns:
        The remaining words joined by single spaces, in their original order
        and original casing.
    """
    # Words are compared in lowercase, so the extra stopwords must be
    # lowercased too; otherwise an entry such as "The" could never match.
    extra = (word.lower() for word in additional_stopwords or ())
    all_stopwords = STOPWORDS.union(extra)
    return " ".join(
        word for word in text.split() if word.lower() not in all_stopwords
    )
|
30 |
-
|
31 |
-
# Function to create download link for plot
|
32 |
-
def get_image_download_link(buffered, format_):
    """Build an HTML anchor that downloads the rendered plot as a data URI.

    Args:
        buffered: Object exposing ``getvalue()`` that returns the encoded
            image bytes (for example an ``io.BytesIO``).
        format_: File format / extension of the image, e.g. ``"png"``.

    Returns:
        An ``<a>`` tag whose ``href`` embeds the image base64-encoded and
        whose ``download`` attribute is ``wordcloud.<format_>``.
    """
    # "image/<format>" is only a valid MIME type for some formats; SVG and
    # PDF (both offered by the caller's format selector) need explicit types.
    mime_overrides = {
        "svg": "image/svg+xml",
        "pdf": "application/pdf",
        "jpg": "image/jpeg",
    }
    mime_type = mime_overrides.get(format_.lower(), f"image/{format_}")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    return f'<a href="data:{mime_type};base64,{image_base64}" download="wordcloud.{format_}">Download Plot as {format_}</a>'
|
35 |
-
|
36 |
-
# Function to generate a download link for a DataFrame
|
37 |
-
def get_table_download_link(df, filename, file_label):
    """Build an HTML anchor that downloads a DataFrame as a CSV data URI.

    Args:
        df: DataFrame to serialise; it is written without the index column.
        filename: Name suggested to the browser for the downloaded file.
        file_label: Link text, inserted into the anchor as given.

    Returns:
        An ``<a>`` tag whose ``href`` embeds the CSV base64-encoded.
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # The filename sits inside a double-quoted attribute, so escape it to
    # keep a stray quote from terminating the attribute early.
    safe_filename = html.escape(filename)
    # text/csv is the registered media type for CSV (RFC 4180).
    return f'<a href="data:text/csv;base64,{b64}" download="{safe_filename}">{file_label}</a>'
|
41 |
-
|
42 |
-
# Streamlit code
|
43 |
-
# Streamlit page: upload a txt/pdf/docx file, choose stopwords in the sidebar,
# then show a word-count table and a word cloud with download links.
#
# NOTE(review): the indentation of this script was reconstructed from name
# dependencies (fig and word_count must be defined where they are used).
# The nesting level of the sidebar footer and the CSV button is a best
# guess — confirm against the original file.


def _count_words(text):
    """Return a Word/Count DataFrame for the whitespace-separated words of *text*, highest count first."""
    return (
        pd.DataFrame({'Word': text.split()})
        .groupby('Word')
        .size()
        .reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )


st.title("Word Cloud Generator")
st.subheader("📁 Upload a pdf, docx or text file to generate a word cloud")

uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
st.set_option('deprecation.showPyplotGlobalUse', False)

if uploaded_file:
    file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
    st.write(file_details)

    # Check the file type and read the file
    if uploaded_file.type == "text/plain":
        text = read_txt(uploaded_file)
    elif uploaded_file.type == "application/pdf":
        text = read_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        text = read_docx(uploaded_file)
    else:
        st.error("File type not supported. Please upload a txt, pdf or docx file.")
        st.stop()

    # Counts over the unfiltered text supply the sidebar's stopword suggestions.
    word_count = _count_words(text)

    # Sidebar: Checkbox and Multiselect box for stopwords
    use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
    top_words = word_count['Word'].head(50).tolist()
    additional_stopwords = st.sidebar.multiselect("Additional stopwords:", sorted(top_words))

    # Words are matched in lowercase below, so lowercase the selected
    # stopwords as well; a selection such as "The" would otherwise never match.
    all_stopwords = {word.lower() for word in additional_stopwords}
    if use_standard_stopwords:
        all_stopwords |= STOPWORDS

    # Filter here with exactly the chosen set, so that unticking the checkbox
    # really disables the standard stopwords.
    text = " ".join(word for word in text.split() if word.lower() not in all_stopwords)

    if text:
        # Word Cloud dimensions
        width = st.sidebar.slider("Select Word Cloud Width", 400, 2000, 1200, 50)
        height = st.sidebar.slider("Select Word Cloud Height", 200, 2000, 800, 50)

        # Generate wordcloud
        st.subheader("Generated Word Cloud")
        fig, ax = plt.subplots(figsize=(width/100, height/100))  # Convert pixels to inches for figsize
        # Pass the stopword set explicitly: with stopwords=None WordCloud
        # falls back to its built-in list regardless of the checkbox.
        wordcloud_img = WordCloud(
            width=width,
            height=height,
            background_color='white',
            max_words=200,
            contour_width=3,
            contour_color='steelblue',
            stopwords=all_stopwords,
        ).generate(text)
        ax.imshow(wordcloud_img, interpolation='bilinear')
        ax.axis('off')

        # Save plot functionality
        format_ = st.selectbox("Select file format to save the plot", ["png", "jpeg", "svg", "pdf"])
        resolution = st.slider("Select Resolution", 100, 500, 300, 50)

        # Word count table of the filtered text
        st.subheader("Word Count Table")
        word_count = _count_words(text)
        st.write(word_count)
        st.pyplot(fig)

        if st.button(f"Save as {format_}"):
            buffered = BytesIO()
            # Save this figure explicitly rather than pyplot's implicit
            # "current figure".
            fig.savefig(buffered, format=format_, dpi=resolution)
            st.markdown(get_image_download_link(buffered, format_), unsafe_allow_html=True)

        # The script reruns on every interaction; close the figure so pyplot
        # does not keep one alive per rerun.
        plt.close(fig)

        st.sidebar.markdown("---")
        st.sidebar.subheader("Learn Advanced AI| Hope to Skill with Irfan Malik")
        # add a youtube video
        st.sidebar.video("https://www.youtube.com/watch?v=vCKF1vYVNTs&list=PLxf3-FrL8GzRALeq_9BtdQclN6SF4bTCG")
        st.sidebar.markdown("---")
        # add author name and info
        st.sidebar.markdown("Created by: [Virtual Student](https://github.com/dashboard)")
        st.sidebar.markdown("website:(https://xevenskills.com)")

        if st.button('Download Word Count Table as CSV'):
            st.markdown(get_table_download_link(word_count, "word_count.csv", "Click Here to Download"), unsafe_allow_html=True)
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, request, jsonify
|
2 |
+
from io import BytesIO
|
3 |
+
from PIL import Image
|
|
|
|
|
4 |
import PyPDF2
|
5 |
from docx import Document
|
6 |
+
from wordcloud import WordCloud, STOPWORDS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
+
# Flask application object; the route decorator below registers its handler on it.
app = Flask(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
# File extensions (lowercase, without the dot) that allowed_file() accepts.
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx'}
|
11 |
|
12 |
+
def allowed_file(filename):
    """Report whether *filename* ends in one of the ALLOWED_EXTENSIONS.

    The text after the last dot is lowercased before the comparison; a name
    that contains no dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
|
15 |
|
16 |
+
@app.route('/api/generate-wordcloud', methods=['POST'])
|
17 |
+
def generate_wordcloud():
|
18 |
+
if 'file' not in request.files:
|
19 |
+
return jsonify({'error': 'Missing file upload'}), 400
|
20 |
|
21 |
+
file = request.files['file']
|
22 |
+
if file.filename == '':
|
23 |
+
return jsonify({'error': 'No selected file'}), 400
|
|
|
|
|
24 |
|
25 |
+
if file and allowed_file(file.filename):
|
26 |
+
text = ''
|
27 |
+
if file.filename.endswith('.txt'):
|
28 |
+
text = file.read().decode('utf-8')
|
29 |
+
elif file.filename.
|