Spaces:
Runtime error
Runtime error
File size: 5,097 Bytes
41460de d162bf7 41460de 3e0a87b 402e9be ef80c58 de5cd4d 11cb408 6c308dc fa6710b eac4597 de5cd4d 402e9be d4a926a 402e9be fa6710b abdb7b6 de5cd4d abdb7b6 0bd3e3f 0110fa1 0bd3e3f 3e0a87b c5118ce 0110fa1 c5118ce 41460de c5118ce 41460de c5118ce 68739a8 41460de c5118ce 41460de 3e0a87b c5118ce d162bf7 c6338e6 fa6710b 6c308dc de5cd4d d162bf7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
import streamlit as st

# Streamlit expects set_page_config to be the first Streamlit command of the
# script, so it stays directly after the streamlit import.
st.set_page_config('SDSN x GIZ Policy Tracing', layout="wide")
import seaborn as sns
import pdfplumber
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
# The PyPI package is named "sentence-transformers", but the importable
# module uses an underscore; a hyphen is a SyntaxError in an import statement.
import sentence_transformers
# Uncached variant; the script later redefines load_model with st.cache,
# and that later definition is the one in effect when the model is loaded.
def load_model():
    """Create and return a new KeyBERT keyword-extraction model.

    KeyBERT is imported inside the function because the module-level
    ``from keybert import KeyBERT`` only appears further down the script,
    so the name is not yet bound where this function is defined.

    Returns:
        A freshly constructed ``KeyBERT`` instance.
    """
    from keybert import KeyBERT

    return KeyBERT()
def read_(file):
    """Extract the text of a PDF and display basic document information.

    Writes the PDF metadata and the page count to the Streamlit page as a
    side effect.

    Args:
        file: Whatever ``pdfplumber.open`` accepts (in this script the
            object returned by ``st.file_uploader``), or ``None`` when
            nothing has been uploaded.

    Returns:
        The text of all pages joined by single spaces, or ``None`` when
        ``file`` is ``None``.
    """
    if file is None:
        return None

    # The context manager closes the PDF on exit, so no explicit close() call
    # is needed afterwards.
    with pdfplumber.open(file) as pdf:
        # extract_text() may return None for a page without extractable text
        # (e.g. a scanned image); fall back to "" so str.join cannot raise.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        st.write('Document:', pdf.metadata)
        st.write('Number of pages:', len(pdf.pages))

    return ' '.join(page_texts)
# Sidebar: logo, PDF upload (step one) and the chapter selector.
with st.sidebar:
    st.image(
        "https://github.com/gizdatalab/policy_tracing/blob/main/img/sdsn.png?raw=true",
        use_column_width=True,
    )
    st.markdown("## π Step One: Upload document ")

    # `file` is read later by the main page to extract the document text.
    file = st.file_uploader('Upload PDF File', type=['pdf'])

    st.title("Options:")
    st.markdown(
        "You can freely browse the different chapters - ie example prompts from different people - and see the results."
    )
    selected_date = st.selectbox(
        "Please select the chapter you want to read:",
        ['c1', 'c2'],
    )
# Page header: centered title followed by two blank spacer lines.
with st.container():
    st.markdown("<h1 style='text-align: center; color: black;'> SDSN X GIZ - Policy Action Tracking</h1>", unsafe_allow_html=True)
    st.write(' ')
    st.write(' ')

# Short description of the app, expanded by default.
with st.expander("ℹ️ - About this app", expanded=True):
    st.write(
        "\n"
        "The *Policy Action Tracker* app is an easy-to-use interface built with Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solution Network.\n"
        # No space between "]" and "(": Markdown only renders a link when the
        # URL directly follows the link text.
        "It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.\n"
    )

# Empty markdown elements used as vertical spacing.
st.markdown("")
st.markdown("")
# The uploader widget itself lives in the sidebar; this section shows the
# step heading and reads the uploaded PDF into text_str.
with st.container():
    st.markdown("## π Step One: Upload document ")
    text_str = read_(file)

# read_ returns None until a file has been uploaded, and a PDF without a text
# layer yields only whitespace. Everything below needs real text, so end this
# script run with a hint instead of crashing further down.
if not text_str or not text_str.strip():
    st.info("Upload a PDF with extractable text in the sidebar to start the analysis.")
    st.stop()
import seaborn as sns
import pdfplumber
from pandas import DataFrame
from keybert import KeyBERT
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
@st.cache(allow_output_mutation=True)
def load_model():
    """Construct the KeyBERT model; wrapped in st.cache so it can be reused."""
    model = KeyBERT()
    return model
# --- Keyword extraction -----------------------------------------------------
kw_model = load_model()

# Options forwarded unchanged to KeyBERT.extract_keywords.
extraction_options = {
    "keyphrase_ngram_range": (1, 2),
    "use_mmr": True,
    "stop_words": "english",
    "top_n": 10,
    "diversity": 0.7,
}
keywords = kw_model.extract_keywords(text_str, **extraction_options)

st.markdown("## π What is my document about?")

# One row per keyword, highest relevancy first, numbered from 1.
df = DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
df = df.sort_values(by="Relevancy", ascending=False)
df = df.reset_index(drop=True)
df.index += 1

# Styling: green gradient on the relevancy column, shown as a percentage.
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
format_dictionary = {"Relevancy": "{:.1%}"}
df = df.style.background_gradient(cmap=cmGreen, subset=["Relevancy"]).format(
    format_dictionary
)

# Render the table in the wide middle column.
c1, c2, c3 = st.columns([1, 3, 1])
with c2:
    st.table(df)
######## SDG classification
# Splits the document into fixed-size word chunks, classifies each chunk with
# the SDG text-classification model, and shows the confident hits as a pie
# chart plus a table.
from transformers import pipeline

finetuned_checkpoint = "jonas/sdg_classifier_osdg"
classifier = pipeline("text-classification", model=finetuned_checkpoint)

word_list = (text_str or '').split()
len_word_list = len(word_list)
par_len = 130

# Step through the words in strides of par_len so the trailing partial chunk
# is classified too; counting only full chunks would drop the end of the
# document and yield nothing at all for texts shorter than par_len words.
par_list = [
    ' '.join(word_list[start:start + par_len])
    for start in range(0, len_word_list, par_len)
]

# Skip the model call when there is nothing to classify.
labels = classifier(par_list) if par_list else []
labels_ = [(label['label'], label['score']) for label in labels]

df = DataFrame(labels_, columns=["SDG", "Relevancy"])
df['text'] = ['... ' + par + ' ...' for par in par_list]
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
df.index += 1

# Keep only chunks the classifier scored above 0.9.
df = df[df['Relevancy'] > .9]
has_hits = not df.empty

# Number of confident chunks per SDG label.
x = df['SDG'].value_counts()

plt.rcParams['font.size'] = 25
colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))

# plot
fig, ax = plt.subplots()
ax.pie(x, colors=colors, radius=2, center=(4, 4),
       wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False,
       labels=list(x.index))

st.markdown("## π Anything related to SDGs?")
c4, c5, c6 = st.columns([5, 7, 1])

# Add styling
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
format_dictionary = {
    "Relevancy": "{:.1%}",
}
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=[
        "Relevancy",
    ],
).format(format_dictionary)

if has_hits:
    with c4:
        st.pyplot(fig)
    with c5:
        st.table(df)
else:
    # An empty pie chart and table carry no information; say so explicitly.
    st.info("No part of the document was classified with a relevancy above 90%.")