# Spaces:
# Runtime error
# Runtime error
import matplotlib.pyplot as plt
import numpy as np
import pdfplumber
import seaborn as sns
import sentence_transformers
import streamlit as st
from keybert import KeyBERT
from pandas import DataFrame

# Must remain the first Streamlit command executed by the script.
st.set_page_config(f'SDSN x GIZ Policy Tracing', layout="wide")
##@st.cache(allow_output_mutation=True) | |
def load_model():
    """Build a KeyBERT model with its default settings and return it."""
    model = KeyBERT()
    return model
def read_(file):
    """Extract the text of a PDF and display basic document information.

    Args:
        file: Anything ``pdfplumber.open`` accepts (in this script, the value
            returned by ``st.file_uploader``), or ``None`` when no document
            has been provided.

    Returns:
        The text of all pages joined with single spaces, or ``None`` when
        ``file`` is ``None``.
    """
    if file is None:
        return None

    # The context manager closes the PDF; no explicit close() is needed.
    with pdfplumber.open(file) as pdf:
        # extract_text() can yield None for a page without a text layer;
        # substitute an empty string so the join below cannot fail.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        st.write('Document:', pdf.metadata)
        st.write('Number of pages:', len(pdf.pages))

    return ' '.join(page_texts)
# Sidebar: logo, PDF upload widget and the chapter selector.
with st.sidebar:
    st.image(
        "https://github.com/gizdatalab/policy_tracing/blob/main/img/sdsn.png?raw=true",
        use_column_width=True,
    )
    st.markdown("## π Step One: Upload document ")
    file = st.file_uploader('Upload PDF File', type=['pdf'])
    st.title("Options:")
    st.markdown(
        "You can freely browse the different chapters - ie example prompts from different people - and see the results."
    )
    selected_date = st.selectbox(
        "Please select the chapter you want to read:",
        ['c1', 'c2'],
    )
# Page title followed by a short description of the app.
with st.container():
    st.markdown("<h1 style='text-align: center; color: black;'> SDSN X GIZ - Policy Action Tracking</h1>", unsafe_allow_html=True)
    st.write(' ')
    st.write(' ')

with st.expander("ℹ️ - About this app", expanded=True):
    # The text lines start at column 0 on purpose: markdown renders lines
    # indented by four or more spaces as a code block.
    st.write(
        """
The *Policy Action Tracker* app is an easy-to-use interface built with Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solutions Network.
It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
"""
    )
    st.markdown("")

st.markdown("")
# Read the uploaded document. Everything below depends on its text, so the
# script run ends here until there is something to analyse.
with st.container():
    st.markdown("## π Step One: Upload document ")
    if file is None:
        st.info("Please upload a PDF document in the sidebar to start the analysis.")
        st.stop()
    text_str = read_(file)
    if not text_str or not text_str.strip():
        # E.g. a scanned PDF without a text layer.
        st.warning("No text could be extracted from this PDF.")
        st.stop()
import seaborn as sns | |
import pdfplumber | |
from pandas import DataFrame | |
from keybert import KeyBERT | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import streamlit as st | |
def load_model():
    """Return a freshly constructed KeyBERT model (default arguments)."""
    keybert_model = KeyBERT()
    return keybert_model
# Extract the ten highest-scoring keywords/keyphrases (one or two words
# each) from the document text.
kw_model = load_model()
keywords = kw_model.extract_keywords(
    text_str,
    keyphrase_ngram_range=(1, 2),
    use_mmr=True,
    stop_words="english",
    top_n=10,
    diversity=0.7,
)

st.markdown("## π What is my document about?")

# One row per keyword, highest score first, numbered from 1.
df = DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
df = df.sort_values(by="Relevancy", ascending=False)
df = df.reset_index(drop=True)
df.index += 1

# Add styling: green gradient on the score column, shown as a percentage.
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
format_dictionary = {"Relevancy": "{:.1%}"}
df = df.style.background_gradient(cmap=cmGreen, subset=["Relevancy"])
df = df.format(format_dictionary)

# Render the table in the wide middle column.
c1, c2, c3 = st.columns([1, 3, 1])
with c2:
    st.table(df)
######## SDG!
# Split the document into fixed-size word chunks, classify each chunk with
# the SDG text-classification model and keep the confident predictions.
from transformers import pipeline

finetuned_checkpoint = "jonas/sdg_classifier_osdg"
classifier = pipeline("text-classification", model=finetuned_checkpoint)

word_list = text_str.split()
len_word_list = len(word_list)
par_len = 130
# Step through the words in strides of par_len so the final, shorter chunk
# is kept as well; documents shorter than par_len still yield one chunk.
par_list = [
    ' '.join(word_list[start:start + par_len])
    for start in range(0, len_word_list, par_len)
]

# Skip the model call entirely when there is nothing to classify.
labels = classifier(par_list) if par_list else []
labels_ = [(l['label'], l['score']) for l in labels]

df = DataFrame(labels_, columns=["SDG", "Relevancy"])
df['text'] = ['... ' + par + ' ...' for par in par_list]
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
df.index += 1
# Keep only predictions with a score above 0.9.
df = df[df['Relevancy'] > .9]
# Number of retained chunks per SDG label.
x = df['SDG'].value_counts()
# Pie chart of the SDG label counts next to the table of matching chunks.
plt.rcParams['font.size'] = 25
colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))

# plot
fig, ax = plt.subplots()
ax.pie(x, colors=colors, radius=2, center=(4, 4),
       wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))

st.markdown("## π Anything related to SDGs?")
c4, c5, c6 = st.columns([5, 7, 1])

# Add styling
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=[
        "Relevancy",
    ],
)
format_dictionary = {
    "Relevancy": "{:.1%}",
}
df = df.format(format_dictionary)

with c4:
    if x.empty:
        # Nothing passed the score filter; a pie with no wedges would be blank.
        st.info("No part of the document was matched to an SDG with a score above 90%.")
    else:
        st.pyplot(fig)
with c5:
    st.table(df)

# Figures created through pyplot stay registered until closed; release this
# one so figures do not pile up across script reruns.
plt.close(fig)