Spaces:
Runtime error
Runtime error
# set path | |
import glob, os, sys; sys.path.append('../udfPreprocess') | |
#import helper | |
import udfPreprocess.docPreprocessing as pre | |
import udfPreprocess.cleaning as clean | |
#import needed libraries | |
import seaborn as sns | |
from pandas import DataFrame | |
from sentence_transformers import SentenceTransformer, CrossEncoder, util | |
from sklearn.metrics.pairwise import cosine_similarity | |
# from keybert import KeyBERT | |
from transformers import pipeline | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import streamlit as st | |
import pandas as pd | |
from rank_bm25 import BM25Okapi | |
from sklearn.feature_extraction import _stop_words | |
import string | |
from tqdm.autonotebook import tqdm | |
import numpy as np | |
import urllib.request | |
import ast | |
import tempfile | |
import sqlite3 | |
import json | |
import urllib.request | |
import ast | |
import docx | |
from docx.shared import Inches | |
from docx.shared import Pt | |
from docx.enum.style import WD_STYLE_TYPE | |
def app():
    """Render the *Check Coherence* Streamlit page.

    The page lets the user pick a country in the sidebar, then either upload
    a policy document or choose one of two bundled examples. Once a document
    has been loaded and split into paragraphs, the coherence analysis and the
    downloadable Word report are rendered by ``_render_coherence``.
    """
    # Sidebar
    st.sidebar.title('Check Coherence')
    st.sidebar.write(' ')

    # countryList.txt holds a Python dict literal mapping country name -> code.
    with open('ndcs/countryList.txt') as dfile:
        countryList = ast.literal_eval(dfile.read())
    countrynames = list(countryList.keys())
    option = st.sidebar.selectbox('Select Country', (countrynames))
    countryCode = countryList[option]

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(
            """
            The *Check Coherence* app is an easy-to-use interface built in Streamlit for doing analysis of policy document and finding the coherence between NDCs/New-Updated NDCs- developed by GIZ Data and the Sustainable Development Solution Network.
            """
        )
    st.markdown("")
    st.markdown("")
    st.markdown("## 📌 Step One: Upload document of the country selected ")

    with st.container():
        docs = None
        paraList = None
        file_name = None

        # asking user for either upload or select existing doc
        choice = st.radio(label='Select the Document',
                          help='You can upload the document '
                               'or else you can try a example document.',
                          options=('Upload Document', 'Try Example'),
                          horizontal=True)

        if choice == 'Upload Document':
            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is not None:
                with tempfile.NamedTemporaryFile(mode="wb") as temp:
                    temp.write(uploaded_file.getvalue())
                    # The loader re-opens the file by path, so the buffered
                    # bytes must reach the disk before it is called.
                    temp.flush()
                    st.write("Uploaded Filename: ", uploaded_file.name)
                    file_name = uploaded_file.name
                    file_path = temp.name
                    docs = pre.load_document(file_path, file_name)
                    _, _, _, paraList = clean.preprocessing(docs)
        else:
            # listing the options; kept in its own variable so the sidebar
            # country selection is not overwritten
            example = st.selectbox('Select the example document',
                                   ('South Africa:Low Emission strategy',
                                    'Ethiopia: 10 Year Development Plan'))
            # Compare by value: identity of string objects is an
            # implementation detail and must not drive the branch.
            if example == 'South Africa:Low Emission strategy':
                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
                countryCode = countryList['South Africa']
            else:
                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
                countryCode = countryList['Ethiopia']
            st.write("Selected document:", file_name.split('/')[1])

            if example is not None:
                docs = pre.load_document(file_path, file_name)
                _, _, _, paraList = clean.preprocessing(docs)

        if docs is not None:
            _render_coherence(paraList, file_name, countryCode)


def get_document(countryCode: str):
    """Download the NDC open dataset and return one country's classifications.

    Args:
        countryCode: key of the country inside the dataset's ``'NDCs'`` section.

    Returns:
        A dict ``{category: {subcategory: classification}}`` built from the
        dataset's ``categories`` and ``subcategories`` tables, or ``None``
        when the dataset has no ``'NDCs'`` section or no entry for
        ``countryCode``.
    """
    link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
    with urllib.request.urlopen(link) as urlfile:
        data = json.loads(urlfile.read())

    documentType = 'NDCs'
    if documentType not in data or countryCode not in data[documentType]:
        return None

    # The three descriptive fields are copied verbatim; every other field is
    # reduced to its 'classification' entry.
    get_dict = {}
    for key, value in data[documentType][countryCode].items():
        if key not in ['country_name', 'region_id', 'region_name']:
            get_dict[key] = value['classification']
        else:
            get_dict[key] = value

    country = {key: {} for key in data['categories']}
    for key, value in data['subcategories'].items():
        country[value['category']][key] = get_dict[key]
    return country


def _select_country_sentences(sentences, threshold, countryCode, category, doc=None):
    """Pick, per subcategory, the sentence matching the country's classification id.

    ``doc`` is the structure returned by ``get_document``; it is fetched for
    ``countryCode`` when omitted. An empty dict is returned when no data is
    available for the country.
    """
    if doc is None:
        doc = get_document(countryCode)
    if doc is None:
        return {}

    selected = {}
    for key, value in sentences.items():
        id_ = doc[category][key]['id']
        if id_ > threshold:
            selected[key] = value['id'][id_]
    return selected


def countrySpecificCCA(cca_sent, threshold, countryCode, doc=None):
    """Return the climate change adaptation sentences that apply to a country.

    Args:
        cca_sent: mapping ``subcategory -> {'id': <sentences indexable by id>}``.
        threshold: only classification ids strictly greater than this are kept.
        countryCode: country key used to fetch the NDC data when ``doc`` is None.
        doc: optional pre-fetched result of ``get_document`` (avoids a second
            download when both categories are needed).

    Returns:
        ``{subcategory: sentence}``; empty when the country has no NDC data.
    """
    return _select_country_sentences(
        cca_sent, threshold, countryCode, 'climate change adaptation', doc)


def countrySpecificCCM(ccm_sent, threshold, countryCode, doc=None):
    """Return the climate change mitigation sentences that apply to a country.

    Same contract as ``countrySpecificCCA`` but reads the
    ``'climate change mitigation'`` category.
    """
    return _select_country_sentences(
        ccm_sent, threshold, countryCode, 'climate change mitigation', doc)


def load_sentenceTransformer(name):
    """Instantiate the sentence-transformers model called ``name``."""
    return SentenceTransformer(name)


def _xml_safe(text):
    """Return ``text`` as a string without characters that XML 1.0 forbids.

    Extracted document text can contain control characters; dropping them
    keeps the paragraph writable into the Word report.
    """
    return ''.join(
        ch for ch in str(text)
        if ch in '\t\n\r'
        or (ord(ch) >= 0x20
            and not 0xD800 <= ord(ch) <= 0xDFFF
            and ord(ch) not in (0xFFFE, 0xFFFF))
    )


def _render_coherence(paraList, file_name, countryCode):
    """Match document paragraphs against the country's NDC sentences and report.

    Shows every (paragraph, NDC label) pair whose cosine similarity exceeds
    0.55 on the page, writes the same pairs into a Word document, and offers
    that document for download.
    """
    import io  # only needed here, for the in-memory report buffer

    # Both files hold Python dict literals of reference sentences.
    with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
        cca_sent = ast.literal_eval(dfile.read())
    with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
        ccm_sent = ast.literal_eval(dfile.read())

    genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))

    # One download serves whichever category is selected.
    ndc_doc = get_document(countryCode)
    if ndc_doc is None:
        st.warning("No NDC data is available for country code {}.".format(countryCode))
        return

    if genre == 'Climate Change Adaptation':
        sent_dict = countrySpecificCCA(cca_sent, 1, countryCode, doc=ndc_doc)
    else:
        sent_dict = countrySpecificCCM(ccm_sent, 1, countryCode, doc=ndc_doc)

    # cosine_similarity cannot work on empty inputs, so stop early with a
    # message instead of failing inside the library call.
    # len() rather than truthiness: the container type of paraList is
    # decided by the preprocessing module.
    if not sent_dict or len(paraList) == 0:
        st.info("Nothing to compare: the document has no paragraphs or the "
                "country has no NDC entries for this category.")
        return

    labels = list(sent_dict.keys())
    sent_labels = list(sent_dict.values())

    model = load_sentenceTransformer('all-MiniLM-L6-v2')
    document_embeddings = model.encode(paraList, show_progress_bar=True)
    label_embeddings = model.encode(sent_labels, show_progress_bar=True)

    similarity_high_threshold = 0.55
    similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
    label_indices, paragraph_indices = np.where(similarity_matrix > similarity_high_threshold)
    positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))

    document = docx.Document()
    document.add_heading('Document name:{}'.format(file_name), 2)

    # Footer: reuse the paragraph already present in the first section's
    # footer and give the credit line a small character style.
    footer_para = document.sections[0].footer.paragraphs[0]
    font_charstyle = document.styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
    font_charstyle.font.size = Pt(7)
    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')

    document.add_paragraph("Country Code for which NDC is carried out {}".format(countryCode))

    for _label_idx, _paragraph_idx in positive_indices:
        paragraph = paraList[_paragraph_idx]
        relevance = f"Is relevant to: \n {labels[_label_idx]}"

        st.write("This paragraph: \n")
        document.add_paragraph("This paragraph: \n")
        st.write(paragraph)
        # The report must carry the paragraph itself, not only its heading.
        document.add_paragraph(_xml_safe(paragraph))
        st.write(relevance)
        document.add_paragraph(relevance)
        st.write('-'*10)
        document.add_paragraph('-'*10)

    # Build the file in memory so concurrent sessions never share a
    # 'demo.docx' on disk.
    buffer = io.BytesIO()
    document.save(buffer)
    st.download_button(
        label="Download file",
        data=buffer.getvalue(),
        file_name="demo.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )