ppsingh commited on
Commit
031e5e2
·
0 Parent(s):

Duplicate from ppsingh/cpu-demo

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ docStore/sample/South[[:space:]]Africa_s[[:space:]]Low[[:space:]]Emission[[:space:]]Development[[:space:]]Strategy.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
.vscode/launch.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Streamlit",
9
+ "type": "python",
10
+ "request": "launch",
11
+ "program": ".venv/bin/streamlit",
12
+ "args": [
13
+ "run",
14
+ "app.py"
15
+ ],
16
+ "console": "integratedTerminal",
17
+ "justMyCode": false
18
+ }
19
+ ]
20
+ }
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Cpu Demo
3
+ emoji: 🦀
4
+ colorFrom: blue
5
+ colorTo: pink
6
+ sdk: streamlit
7
+ sdk_version: 1.19.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: ppsingh/cpu-demo
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import appStore.target as target_extraction
2
+ import appStore.netzero as netzero
3
+ import appStore.sector as sector
4
+ import appStore.adapmit as adapmit
5
+ # import appStore.info as info
6
+ from appStore.multiapp import MultiApp
7
+ import streamlit as st
8
+
9
+ st.set_page_config(page_title = 'Climate Policy Intelligence',
10
+ initial_sidebar_state='expanded', layout="wide")
11
+
12
+ app = MultiApp()
13
+
14
+ # app.add_app("About","house", info.app)
15
+ app.add_app("Economy-Wide Target Extraction","gear",target_extraction.app)
16
+ app.add_app("NetZero Target Extraction","gear", netzero.app)
17
+ app.add_app("Sector Classification","gear", sector.app)
18
+ app.add_app("Adaptation-Mitigation","gear", adapmit.app)
19
+
20
+ app.run()
appStore/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # creating appstore package
appStore/adapmit.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ # from st_aggrid import AgGrid
12
+ # from st_aggrid.shared import ColumnsAutoSizeMode
13
+ from utils.adapmit_classifier import adapmit_classification
14
+ from utils.adapmit_classifier import runAdapMitPreprocessingPipeline, load_adapmitClassifier
15
+ # from utils.keyword_extraction import textrank
16
+ import logging
17
+ logger = logging.getLogger(__name__)
18
+ from utils.config import get_classifier_params
19
+ from utils.preprocessing import paraLengthCheck
20
+ from io import BytesIO
21
+ import xlsxwriter
22
+ import plotly.express as px
23
+
24
+ # Declare all the necessary variables
25
+ classifier_identifier = 'adapmit'
26
+ params = get_classifier_params(classifier_identifier)
27
+
28
+ @st.cache_data
29
+ def to_excel(df):
30
+ len_df = len(df)
31
+ output = BytesIO()
32
+ writer = pd.ExcelWriter(output, engine='xlsxwriter')
33
+ df.to_excel(writer, index=False, sheet_name='Sheet1')
34
+ workbook = writer.book
35
+ worksheet = writer.sheets['Sheet1']
36
+ worksheet.data_validation('E2:E{}'.format(len_df),
37
+ {'validate': 'list',
38
+ 'source': ['No', 'Yes', 'Discard']})
39
+ worksheet.data_validation('F2:F{}'.format(len_df),
40
+ {'validate': 'list',
41
+ 'source': ['No', 'Yes', 'Discard']})
42
+ worksheet.data_validation('G2:G{}'.format(len_df),
43
+ {'validate': 'list',
44
+ 'source': ['No', 'Yes', 'Discard']})
45
+ writer.save()
46
+ processed_data = output.getvalue()
47
+ return processed_data
48
+
49
+ def app():
50
+
51
+ #### APP INFO #####
52
+ with st.container():
53
+ st.markdown("<h1 style='text-align: center; color: black;'> Adaptation-Mitigation Classification </h1>", unsafe_allow_html=True)
54
+ st.write(' ')
55
+ st.write(' ')
56
+
57
+ with st.expander("ℹ️ - About this app", expanded=False):
58
+
59
+ st.write(
60
+ """
61
+ The **Adaptation-Mitigation Classification** app is an easy-to-use interface built \
62
+ in Streamlit for analyzing policy documents for \
63
+ Classification of the paragraphs/texts in the document *If it \
64
+ belongs to 'Adaptation' and 'Mitigation' category or not. The paragraph \
65
+ can belong to both category too. \
66
+ - developed by GIZ Data Service Center, GFA, IKI Tracs, \
67
+ SV Klima and SPA. \n
68
+ """)
69
+ st.write("""**Document Processing:** The Uploaded/Selected document is \
70
+ automatically cleaned and split into paragraphs with a maximum \
71
+ length of 60 words using a Haystack preprocessing pipeline. The \
72
+ length of 60 is an empirical value which should reflect the length \
73
+ of a “context” and should limit the paragraph length deviation. \
74
+ However, since we want to respect the sentence boundary the limit \
75
+ can breach and hence this limit of 60 is tentative. \n
76
+ """)
77
+
78
+ st.write("")
79
+
80
+ ### Main app code ###
81
+ with st.container():
82
+ if st.button("RUN Adaptation-Mitigation Classification"):
83
+ if 'key4' not in st.session_state:
84
+ st.session_state['key4'] = None
85
+
86
+ if 'filepath' in st.session_state:
87
+ file_name = st.session_state['filename']
88
+ file_path = st.session_state['filepath']
89
+
90
+
91
+ all_documents = runAdapMitPreprocessingPipeline(file_name= file_name,
92
+ file_path= file_path, split_by= params['split_by'],
93
+ split_length= params['split_length'],
94
+ split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
95
+ split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
96
+ classifier = load_adapmitClassifier(classifier_name=params['model_name'])
97
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
98
+ verified_paralist = paraLengthCheck(all_documents['paraList'], 100)
99
+ if len(verified_paralist) > 100:
100
+ warning_msg = ": This might take sometime, please sit back and relax."
101
+ else:
102
+ warning_msg = ""
103
+
104
+ # # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
105
+ df = adapmit_classification(haystack_doc=verified_paralist,
106
+ threshold= params['threshold'])
107
+
108
+ threshold= params['threshold']
109
+ truth_df = df.drop(['text'],axis=1)
110
+ truth_df = truth_df.astype(float) >= threshold
111
+ truth_df = truth_df.astype(str)
112
+ categories = list(truth_df.columns)
113
+
114
+ placeholder = {}
115
+ for val in categories:
116
+ placeholder[val] = dict(truth_df[val].value_counts())
117
+ count_df = pd.DataFrame.from_dict(placeholder)
118
+ count_df = count_df.T
119
+ count_df = count_df.reset_index()
120
+ # st.write(count_df)
121
+ placeholder = []
122
+ for i in range(len(count_df)):
123
+ placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
124
+ placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
125
+ count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
126
+ # st.write("Total Paragraphs: {}".format(len(df)))
127
+ fig = px.bar(count_df, y='category', x='count',
128
+ color='truth_value',orientation='h', height =200)
129
+ c1, c2 = st.columns([1,1])
130
+ with c1:
131
+ st.plotly_chart(fig,use_container_width= True)
132
+
133
+ truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
134
+ truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
135
+ # st.write(truth_df)
136
+ df = pd.concat([df,truth_df['labels']],axis=1)
137
+ st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
138
+ df = df.sort_values(by = ['Mitigation'], ascending=False)
139
+ for i in range(3):
140
+ if df.iloc[i]['Mitigation'] >= 0.50:
141
+ st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
142
+ st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
143
+
144
+ st.markdown("###### Top few 'Adaptation' related paragraph/text ######")
145
+ df = df.sort_values(by = ['Adaptation'], ascending=False)
146
+ for i in range(3):
147
+ if df.iloc[i]['Adaptation'] > 0.5:
148
+ st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Adaptation']))
149
+ st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
150
+ # st.write(df[['text','labels']])
151
+ df['Validation'] = 'No'
152
+ df['Val-Mitigation'] = 'No'
153
+ df['Val-Adaptation'] = 'No'
154
+ df_xlsx = to_excel(df)
155
+ st.download_button(label='📥 Download Current Result',
156
+ data=df_xlsx ,
157
+ file_name= 'file_adaptation-mitigation.xlsx')
158
+ # st.session_state.key4 =
159
+
160
+ # category =set(df.columns)
161
+ # removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
162
+ # category = list(category - removecols)
163
+
164
+ else:
165
+ st.info("🤔 No document found, please try to upload it at the sidebar!")
166
+ logging.warning("Terminated as no document provided")
167
+
168
+ # # Creating truth value dataframe
169
+ # if 'key4' in st.session_state:
170
+ # if st.session_state.key4 is not None:
171
+ # df = st.session_state.key4
172
+ # st.markdown("###### Select the threshold for classifier ######")
173
+ # c4, c5 = st.columns([1,1])
174
+
175
+ # with c4:
176
+ # threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
177
+ # step=0.01, value=0.5,
178
+ # help = "Keep High Value if want refined result, low if dont want to miss anything" )
179
+ # category =set(df.columns)
180
+ # removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
181
+ # category = list(category - removecols)
182
+
183
+ # placeholder = {}
184
+ # for val in category:
185
+ # temp = df[val].astype(float) > threshold
186
+ # temp = temp.astype(str)
187
+ # placeholder[val] = dict(temp.value_counts())
188
+
189
+ # count_df = pd.DataFrame.from_dict(placeholder)
190
+ # count_df = count_df.T
191
+ # count_df = count_df.reset_index()
192
+ # placeholder = []
193
+ # for i in range(len(count_df)):
194
+ # placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'False'])
195
+ # placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'True'])
196
+
197
+ # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
198
+ # fig = px.bar(count_df, x='category', y='count',
199
+ # color='truth_value',
200
+ # height=400)
201
+ # st.write("")
202
+ # st.plotly_chart(fig)
203
+
204
+ # df['Validation'] = 'No'
205
+ # df['Val-Mitigation'] = 'No'
206
+ # df['Val-Adaptation'] = 'No'
207
+ # df_xlsx = to_excel(df)
208
+ # st.download_button(label='📥 Download Current Result',
209
+ # data=df_xlsx ,
210
+ # file_name= 'file_adaptation-mitigation.xlsx')
211
+
212
+
appStore/info.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from PIL import Image
4
+ _ROOT = os.path.abspath(os.path.dirname(__file__))
5
+ def get_data(path):
6
+ return os.path.join(_ROOT, 'data', path)
7
+
8
+ def app():
9
+
10
+
11
+ with open('style.css') as f:
12
+ st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
13
+
14
+ st.markdown("<h2 style='text-align: center; \
15
+ color: black;'> Climate Policy Understanding App</h2>",
16
+ unsafe_allow_html=True)
17
+
18
+
19
+ st.markdown("<div style='text-align: center; \
20
+ color: grey;'>Climate Policy Understanding App is an open-source\
21
+ digital tool which aims to assist policy analysts and \
22
+ other users in extracting and filtering relevant \
23
+ information from public documents.</div>",
24
+ unsafe_allow_html=True)
25
+ footer = """
26
+ <div class="footer-custom">
27
+ Guidance & Feedback - <a>Nadja Taeger</a> |<a>Marie Hertel</a> | <a>Cecile Schneider</a> |
28
+ Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
29
+ <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
30
+
31
+ </div>
32
+ """
33
+ st.markdown(footer, unsafe_allow_html=True)
34
+
35
+ c1, c2, c3 = st.columns([8,1,12])
36
+ with c1:
37
+ image = Image.open('docStore/img/ndc.png')
38
+ st.image(image)
39
+ with c3:
40
+ st.markdown('<div style="text-align: justify;">The manual extraction \
41
+ of relevant information from text documents is a \
42
+ time-consuming task for any policy analysts. As the amount and length of \
43
+ public policy documents in relation to sustainable development (such as \
44
+ National Development Plans and Nationally Determined Contributions) \
45
+ continuously increases, a major challenge for policy action tracking – the \
46
+ evaluation of stated goals and targets and their actual implementation on \
47
+ the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
48
+ Language Processing (NLP) methods can help in shortening and easing this \
49
+ task for policy analysts.</div><br>',
50
+ unsafe_allow_html=True)
51
+
52
+ intro = """
53
+ <div style="text-align: justify;">
54
+
55
+ For this purpose, IKI Tracs, SV KLIMA, SPA and Data Service Center (Deutsche Gesellschaft für Internationale \
56
+ Zusammenarbeit (GIZ) GmbH) are collaborating since 2022 in the development \
57
+ of an AI-powered open-source web application that helps find and extract \
58
+ relevant information from public policy documents faster to facilitate \
59
+ evidence-based decision-making processes in sustainable development and beyond.
60
+
61
+
62
+ </div>
63
+ <br>
64
+ """
65
+ st.markdown(intro, unsafe_allow_html=True)
66
+ image2 = Image.open('docStore/img/paris.png')
67
+ st.image(image2)
appStore/multiapp.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Frameworks for running multiple Streamlit applications as a single app.
2
+ """
3
+ import streamlit as st
4
+ from PIL import Image
5
+ from utils.uploadAndExample import add_upload
6
+
7
+ class MultiApp:
8
+ """Framework for combining multiple streamlit applications.
9
+ Usage:
10
+ def foo():
11
+ st.title("Hello Foo")
12
+ def bar():
13
+ st.title("Hello Bar")
14
+ app = MultiApp()
15
+ app.add_app("Foo", foo)
16
+ app.add_app("Bar", bar)
17
+ app.run()
18
+ It is also possible keep each application in a separate file.
19
+ import foo
20
+ import bar
21
+ app = MultiApp()
22
+ app.add_app("Foo", foo.app)
23
+ app.add_app("Bar", bar.app)
24
+ app.run()
25
+ """
26
+ def __init__(self):
27
+ self.apps = []
28
+
29
+ def add_app(self,title,icon, func):
30
+ """Adds a new application.
31
+ Parameters
32
+ ----------
33
+ func:
34
+ the python function to render this app.
35
+ title:
36
+ title of the app. Appears in the dropdown in the sidebar.
37
+ """
38
+ self.apps.append({
39
+ "title": title,
40
+ "icon": icon,
41
+ "function": func
42
+ })
43
+
44
+ def run(self):
45
+
46
+ st.sidebar.write(format_func=lambda app: app['title'])
47
+ #image = Image.open('docStore/img/dsc_giz.png')
48
+ #st.sidebar.image(image, width =200)
49
+
50
+ with st.sidebar:
51
+ selected = st.selectbox("Select the Task to perform", [page["title"] for page in self.apps],)
52
+ st.markdown("---")
53
+
54
+
55
+ for index, item in enumerate(self.apps):
56
+ if item["title"] == selected:
57
+ self.apps[index]["function"]()
58
+ break
59
+
60
+
61
+ choice = st.sidebar.radio(label = 'Select the Document',
62
+ help = 'You can upload the document \
63
+ or else you can try a example document',
64
+ options = ('Upload Document', 'Try Example'),
65
+ horizontal = True)
66
+ add_upload(choice)
67
+
appStore/netzero.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ # from st_aggrid import AgGrid
12
+ # from st_aggrid.shared import ColumnsAutoSizeMode
13
+ from utils.netzero_classifier import netzero_classification
14
+ from utils.netzero_classifier import runNetZeroPreprocessingPipeline, load_netzeroClassifier
15
+ # from utils.keyword_extraction import textrank
16
+ import logging
17
+ logger = logging.getLogger(__name__)
18
+ from utils.config import get_classifier_params
19
+ from io import BytesIO
20
+ import xlsxwriter
21
+ import plotly.express as px
22
+
23
+
24
+ # Declare all the necessary variables
25
+ classifier_identifier = 'netzero'
26
+ params = get_classifier_params(classifier_identifier)
27
+
28
+ # Labels dictionary ###
29
+ _lab_dict = {
30
+ 'NEGATIVE':'NO NETZERO TARGET',
31
+ 'NETZERO':'NETZERO TARGET',
32
+ }
33
+
34
+
35
+ @st.cache_data
36
+ def to_excel(df):
37
+ len_df = len(df)
38
+ output = BytesIO()
39
+ writer = pd.ExcelWriter(output, engine='xlsxwriter')
40
+ df.to_excel(writer, index=False, sheet_name='Sheet1')
41
+ workbook = writer.book
42
+ worksheet = writer.sheets['Sheet1']
43
+ worksheet.data_validation('E2:E{}'.format(len_df),
44
+ {'validate': 'list',
45
+ 'source': ['No', 'Yes', 'Discard']})
46
+ writer.save()
47
+ processed_data = output.getvalue()
48
+ return processed_data
49
+
50
+ def app():
51
+
52
+ #### APP INFO #####
53
+ with st.container():
54
+ st.markdown("<h1 style='text-align: center; color: black;'> NetZero Target Extraction </h1>", unsafe_allow_html=True)
55
+ st.write(' ')
56
+ st.write(' ')
57
+
58
+ with st.expander("ℹ️ - About this app", expanded=False):
59
+
60
+ st.write(
61
+ """
62
+ The **NetZero Extraction** app is an easy-to-use interface built \
63
+ in Streamlit for analyzing policy documents for \
64
+ Classification of the paragraphs/texts in the document *If it \
65
+ contains any Net-Zero target related information* - \
66
+ developed by GIZ Data Service Center, GFA, IKI Tracs, \
67
+ SV Klima and SPA. \n
68
+ """)
69
+ st.write("""**Document Processing:** The Uploaded/Selected document is \
70
+ automatically cleaned and split into paragraphs with a maximum \
71
+ length of 60 words using a Haystack preprocessing pipeline. The \
72
+ length of 60 is an empirical value which should reflect the length \
73
+ of a “context” and should limit the paragraph length deviation. \
74
+ However, since we want to respect the sentence boundary the limit \
75
+ can breach and hence this limit of 60 is tentative. \n
76
+ """)
77
+
78
+ st.write("")
79
+
80
+ ### Main app code ###
81
+ with st.container():
82
+ if st.button("RUN NetZero Related Paragraph Extractions"):
83
+ if 'key2' not in st.session_state:
84
+ st.session_state['key2'] = None
85
+
86
+ if 'filepath' in st.session_state:
87
+ file_name = st.session_state['filename']
88
+ file_path = st.session_state['filepath']
89
+
90
+ # Do the preprocessing of the PDF
91
+
92
+ all_documents = runNetZeroPreprocessingPipeline(file_name= file_name,
93
+ file_path= file_path, split_by= params['split_by'],
94
+ split_length= params['split_length'],
95
+ split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
96
+ split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
97
+
98
+ # st.dataframe(all_documents['documents'])
99
+
100
+ # Load the classifier model
101
+
102
+ classifier = load_netzeroClassifier(classifier_name=params['model_name'])
103
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
104
+
105
+ if len(all_documents['documents']) > 100:
106
+ warning_msg = ": This might take sometime, please sit back and relax."
107
+ else:
108
+ warning_msg = ""
109
+
110
+ # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
111
+ # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
112
+
113
+ df = netzero_classification(haystack_doc=all_documents['documents'],
114
+ threshold= params['threshold'])
115
+ st.session_state.key2 = df
116
+ hits = df[df['Target Label'] == 'NETZERO']
117
+ range_val = min(5,len(hits))
118
+ if range_val !=0:
119
+ count_df = df['Target Label'].value_counts()
120
+ count_df = count_df.rename('count')
121
+ count_df = count_df.rename_axis('Target Label').reset_index()
122
+ count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
123
+
124
+ fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
125
+ c1, c2 = st.columns([1,1])
126
+ with c1:
127
+ st.plotly_chart(fig,use_container_width= True)
128
+
129
+ hits = hits.sort_values(by=['Relevancy'], ascending=False)
130
+ st.write("")
131
+ st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
132
+ range_val = min(5,len(hits))
133
+ for i in range(range_val):
134
+ # the page number reflects the page that contains the main paragraph
135
+ # according to split limit, the overlapping part can be on a separate page
136
+ st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
137
+ st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
138
+ else:
139
+ st.info("🤔 No Netzero target found")
140
+ df['Validation'] = 'No'
141
+ df_xlsx = to_excel(df)
142
+ st.download_button(label='📥 Download Current Result',
143
+ data=df_xlsx ,
144
+ file_name= 'file_target.xlsx')
145
+
146
+
147
+ else:
148
+ st.info("🤔 No document found, please try to upload it at the sidebar!")
149
+ logging.warning("Terminated as no document provided")
150
+
151
+ # # Creating truth value dataframe
152
+ # if 'key2' in st.session_state:
153
+ # if st.session_state.key2 is not None:
154
+ # df = st.session_state.key2
155
+ # st.markdown("###### Select the threshold for classifier ######")
156
+ # c1, c2 = st.columns([1,1])
157
+
158
+ # netzero_df = df[df['Target Label'] == 'NETZERO'].reset_index(drop = True)
159
+ # if len(netzero_df) >0:
160
+ # with c1:
161
+ # threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
162
+ # step=0.01, value=0.5,
163
+ # help = "Keep High Value if want refined result, low if dont want to miss anything" )
164
+
165
+ # # creating the dataframe for value counts of Labels, along with 'title' of Labels
166
+ # temp = df[df['Relevancy']>threshold]
167
+ # count_df = temp['Target Label'].value_counts()
168
+ # count_df = count_df.rename('count')
169
+ # count_df = count_df.rename_axis('Target Label').reset_index()
170
+ # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
171
+
172
+ # plt.rcParams['font.size'] = 25
173
+ # colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
174
+ # # plot
175
+ # fig, ax = plt.subplots()
176
+ # ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
177
+ # wedgeprops={"linewidth": 1, "edgecolor": "white"},
178
+ # textprops={'fontsize': 14},
179
+ # frame=False,labels =list(count_df.Label_def),
180
+ # labeldistance=1.2)
181
+ # st.markdown("#### Anything related to NetZero Targets? ####")
182
+
183
+ # c4, c5, c6 = st.columns([1,2,2])
184
+
185
+ # with c5:
186
+ # st.pyplot(fig)
187
+ # with c6:
188
+ # st.write(count_df[['Label_def','count']])
189
+
190
+ # st.write("")
191
+
192
+ # st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
193
+
194
+ # st.dataframe(netzero_df.head())
195
+ # else:
196
+ # st.write("🤔 No Results found")
197
+
198
+
199
+ # df['Validation'] = 'No'
200
+ # df_xlsx = to_excel(df)
201
+ # st.download_button(label='📥 Download Current Result',
202
+ # data=df_xlsx ,
203
+ # file_name= 'file_netzero.xlsx')
204
+
205
+
206
+
appStore/sector.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ # from st_aggrid import AgGrid
12
+ # from st_aggrid.shared import ColumnsAutoSizeMode
13
+ from utils.sector_classifier import sector_classification
14
+ from utils.sector_classifier import runSectorPreprocessingPipeline, load_sectorClassifier
15
+ # from utils.keyword_extraction import textrank
16
+ import logging
17
+ logger = logging.getLogger(__name__)
18
+ from utils.config import get_classifier_params
19
+ from utils.preprocessing import paraLengthCheck
20
+ from io import BytesIO
21
+ import xlsxwriter
22
+ import plotly.express as px
23
+
24
+
25
+ # Declare all the necessary variables
26
+ classifier_identifier = 'sector'
27
+ params = get_classifier_params(classifier_identifier)
28
+
29
+ @st.cache_data
30
+ def to_excel(df,sectorlist):
31
+ len_df = len(df)
32
+ output = BytesIO()
33
+ writer = pd.ExcelWriter(output, engine='xlsxwriter')
34
+ df.to_excel(writer, index=False, sheet_name='Sheet1')
35
+ workbook = writer.book
36
+ worksheet = writer.sheets['Sheet1']
37
+ worksheet.data_validation('S2:S{}'.format(len_df),
38
+ {'validate': 'list',
39
+ 'source': ['No', 'Yes', 'Discard']})
40
+ worksheet.data_validation('X2:X{}'.format(len_df),
41
+ {'validate': 'list',
42
+ 'source': sectorlist + ['Blank']})
43
+ worksheet.data_validation('T2:T{}'.format(len_df),
44
+ {'validate': 'list',
45
+ 'source': sectorlist + ['Blank']})
46
+ worksheet.data_validation('U2:U{}'.format(len_df),
47
+ {'validate': 'list',
48
+ 'source': sectorlist + ['Blank']})
49
+ worksheet.data_validation('V2:V{}'.format(len_df),
50
+ {'validate': 'list',
51
+ 'source': sectorlist + ['Blank']})
52
+ worksheet.data_validation('W2:U{}'.format(len_df),
53
+ {'validate': 'list',
54
+ 'source': sectorlist + ['Blank']})
55
+ writer.save()
56
+ processed_data = output.getvalue()
57
+ return processed_data
58
+
59
+ def app():
60
+
61
+ #### APP INFO #####
62
+ with st.container():
63
+ st.markdown("<h1 style='text-align: center; color: black;'> Sector Classification </h1>", unsafe_allow_html=True)
64
+ st.write(' ')
65
+ st.write(' ')
66
+
67
+ with st.expander("ℹ️ - About this app", expanded=False):
68
+
69
+ st.write(
70
+ """
71
+ The **Sector Classification** app is an easy-to-use interface built \
72
+ in Streamlit for analyzing policy documents for \
73
+ Classification of the paragraphs/texts in the document *If it \
74
+ belongs to particular sector or not*. The paragraph can belong to multiple sectors - \
75
+ developed by GIZ Data Service Center, GFA, IKI Tracs, \
76
+ SV Klima and SPA. \n
77
+ """)
78
+ st.write("""**Document Processing:** The Uploaded/Selected document is \
79
+ automatically cleaned and split into paragraphs with a maximum \
80
+ length of 60 words using a Haystack preprocessing pipeline. The \
81
+ length of 60 is an empirical value which should reflect the length \
82
+ of a “context” and should limit the paragraph length deviation. \
83
+ However, since we want to respect the sentence boundary the limit \
84
+ can breach and hence this limit of 60 is tentative. \n
85
+ """)
86
+
87
+ st.write("")
88
+
89
+ ### Main app code ###
90
+ with st.container():
91
+ if st.button("RUN Sector Classification"):
92
+ if 'key' not in st.session_state:
93
+ st.session_state['key'] = None
94
+
95
+ if 'filepath' in st.session_state:
96
+ file_name = st.session_state['filename']
97
+ file_path = st.session_state['filepath']
98
+
99
+
100
+ all_documents = runSectorPreprocessingPipeline(file_name= file_name,
101
+ file_path= file_path, split_by= params['split_by'],
102
+ split_length= params['split_length'],
103
+ split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
104
+ split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
105
+ # st.write(all_documents['documents'])
106
+ classifier = load_sectorClassifier(classifier_name=params['model_name'])
107
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
108
+ verified_paralist = paraLengthCheck(all_documents['paraList'], 100)
109
+ if len(verified_paralist) > 100:
110
+ warning_msg = ": This might take sometime, please sit back and relax."
111
+ else:
112
+ warning_msg = ""
113
+
114
+ # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
115
+ # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
116
+
117
+ df = sector_classification(haystack_doc=verified_paralist,
118
+ threshold= params['threshold'])
119
+ # st.write(df)
120
+ threshold= params['threshold']
121
+ truth_df = df.drop(['text'],axis=1)
122
+ truth_df = truth_df.astype(float) >= threshold
123
+ truth_df = truth_df.astype(str)
124
+ categories = list(truth_df.columns)
125
+
126
+ placeholder = {}
127
+ for val in categories:
128
+ placeholder[val] = dict(truth_df[val].value_counts())
129
+ count_df = pd.DataFrame.from_dict(placeholder)
130
+ count_df = count_df.T
131
+ count_df = count_df.reset_index()
132
+ # st.write(count_df)
133
+ placeholder = []
134
+ for i in range(len(count_df)):
135
+ placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
136
+ placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
137
+ count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
138
+ # st.write("Total Paragraphs: {}".format(len(df)))
139
+ fig = px.bar(count_df, x='category', y='count',
140
+ color='truth_value')
141
+ # c1, c2 = st.columns([1,1])
142
+ # with c1:
143
+ st.plotly_chart(fig,use_container_width= True)
144
+
145
+ truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
146
+ truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
147
+ # st.write(truth_df)
148
+ df = pd.concat([df,truth_df['labels']],axis=1)
149
+ df['Validation'] = 'No'
150
+ df['Sector1'] = 'Blank'
151
+ df['Sector2'] = 'Blank'
152
+ df['Sector3'] = 'Blank'
153
+ df['Sector4'] = 'Blank'
154
+ df['Sector5'] = 'Blank'
155
+ df_xlsx = to_excel(df,categories)
156
+ st.download_button(label='📥 Download Current Result',
157
+ data=df_xlsx ,
158
+ file_name= 'file_sector.xlsx')
159
+ else:
160
+ st.info("🤔 No document found, please try to upload it at the sidebar!")
161
+ logging.warning("Terminated as no document provided")
162
+
163
+ # # Creating truth value dataframe
164
+ # if 'key' in st.session_state:
165
+ # if st.session_state.key is not None:
166
+ # df = st.session_state.key
167
+ # st.markdown("###### Select the threshold for classifier ######")
168
+ # c4, c5 = st.columns([1,1])
169
+
170
+ # with c4:
171
+ # threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
172
+ # step=0.01, value=0.5,
173
+ # help = "Keep High Value if want refined result, low if dont want to miss anything" )
174
+ # sectors =set(df.columns)
175
+ # removecols = {'Validation','Sector1','Sector2','Sector3','Sector4',
176
+ # 'Sector5','text'}
177
+ # sectors = list(sectors - removecols)
178
+
179
+ # placeholder = {}
180
+ # for val in sectors:
181
+ # temp = df[val].astype(float) > threshold
182
+ # temp = temp.astype(str)
183
+ # placeholder[val] = dict(temp.value_counts())
184
+
185
+ # count_df = pd.DataFrame.from_dict(placeholder)
186
+ # count_df = count_df.T
187
+ # count_df = count_df.reset_index()
188
+ # placeholder = []
189
+ # for i in range(len(count_df)):
190
+ # placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'False'])
191
+ # placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'True'])
192
+
193
+ # count_df = pd.DataFrame(placeholder, columns = ['sector','count','truth_value'])
194
+ # fig = px.bar(count_df, x='sector', y='count',
195
+ # color='truth_value',
196
+ # height=400)
197
+ # st.write("")
198
+ # st.plotly_chart(fig)
199
+
200
+ # df['Validation'] = 'No'
201
+ # df['Sector1'] = 'Blank'
202
+ # df['Sector2'] = 'Blank'
203
+ # df['Sector3'] = 'Blank'
204
+ # df['Sector4'] = 'Blank'
205
+ # df['Sector5'] = 'Blank'
206
+ # df_xlsx = to_excel(df,sectors)
207
+ # st.download_button(label='📥 Download Current Result',
208
+ # data=df_xlsx ,
209
+ # file_name= 'file_sector.xlsx')
210
+
211
+
appStore/target.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ # from st_aggrid import AgGrid
12
+ # from st_aggrid.shared import ColumnsAutoSizeMode
13
+ from utils.target_classifier import target_classification
14
+ from utils.target_classifier import runTargetPreprocessingPipeline, load_targetClassifier
15
+ # from utils.keyword_extraction import textrank
16
+ import logging
17
+ logger = logging.getLogger(__name__)
18
+ from utils.config import get_classifier_params
19
+ from io import BytesIO
20
+ import xlsxwriter
21
+ import plotly.express as px
22
+
23
+ # Declare all the necessary variables
24
+ classifier_identifier = 'target'
25
+ params = get_classifier_params(classifier_identifier)
26
+
27
+ ## Labels dictionary ###
28
+ _lab_dict = {
29
+ 'LABEL_0':'NO TARGET INFO',
30
+ 'LABEL_1':'ECONOMY-WIDE TARGET',
31
+ }
32
+
33
+ @st.cache_data
34
+ def to_excel(df):
35
+ len_df = len(df)
36
+ output = BytesIO()
37
+ writer = pd.ExcelWriter(output, engine='xlsxwriter')
38
+ df.to_excel(writer, index=False, sheet_name='Sheet1')
39
+ workbook = writer.book
40
+ worksheet = writer.sheets['Sheet1']
41
+ worksheet.data_validation('E2:E{}'.format(len_df),
42
+ {'validate': 'list',
43
+ 'source': ['No', 'Yes', 'Discard']})
44
+ writer.save()
45
+ processed_data = output.getvalue()
46
+ return processed_data
47
+
48
+ def app():
49
+
50
+ #### APP INFO #####
51
+ with st.container():
52
+ st.markdown("<h1 style='text-align: center; color: black;'> Targets Extraction </h1>", unsafe_allow_html=True)
53
+ st.write(' ')
54
+ st.write(' ')
55
+
56
+ with st.expander("ℹ️ - About this app", expanded=False):
57
+
58
+ st.write(
59
+ """
60
+ The **Target Extraction** app is an easy-to-use interface built \
61
+ in Streamlit for analyzing policy documents for \
62
+ Classification of the paragraphs/texts in the document *If it \
63
+ contains any Economy-Wide Targets related information* - \
64
+ developed by GIZ Data Service Center, GFA, IKI Tracs, \
65
+ SV Klima and SPA. \n
66
+ """)
67
+ st.write("""**Document Processing:** The Uploaded/Selected document is \
68
+ automatically cleaned and split into paragraphs with a maximum \
69
+ length of 60 words using a Haystack preprocessing pipeline. The \
70
+ length of 60 is an empirical value which should reflect the length \
71
+ of a “context” and should limit the paragraph length deviation. \
72
+ However, since we want to respect the sentence boundary the limit \
73
+ can breach and hence this limit of 60 is tentative. \n
74
+ """)
75
+
76
+ st.write("")
77
+
78
+ ### Main app code ###
79
+ with st.container():
80
+ if st.button("RUN Target Related Paragraph Extractions"):
81
+ if 'key1' not in st.session_state:
82
+ st.session_state['key1'] = None
83
+
84
+ if 'filepath' in st.session_state:
85
+ file_name = st.session_state['filename']
86
+ file_path = st.session_state['filepath']
87
+
88
+
89
+ all_documents = runTargetPreprocessingPipeline(file_name= file_name,
90
+ file_path= file_path, split_by= params['split_by'],
91
+ split_length= params['split_length'],
92
+ split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
93
+ split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
94
+ # st.write(all_documents['documents'])
95
+
96
+ #load Classifier
97
+ classifier = load_targetClassifier(classifier_name=params['model_name'])
98
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
99
+ if len(all_documents['documents']) > 100:
100
+ warning_msg = ": This might take sometime, please sit back and relax."
101
+ else:
102
+ warning_msg = ""
103
+
104
+ # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
105
+ # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
106
+
107
+ df = target_classification(haystack_doc=all_documents['documents'],
108
+ threshold= params['threshold'])
109
+ st.session_state.key1 = df
110
+ # temp = df[df['Relevancy']>threshold]
111
+ hits = df[df['Target Label'] == 'LABEL_1']
112
+ range_val = min(5,len(hits))
113
+ if range_val !=0:
114
+ count_df = df['Target Label'].value_counts()
115
+ count_df = count_df.rename('count')
116
+ count_df = count_df.rename_axis('Target Label').reset_index()
117
+ count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
118
+
119
+ fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
120
+ c1, c2 = st.columns([1,1])
121
+ with c1:
122
+ st.plotly_chart(fig,use_container_width= True)
123
+
124
+ hits = hits.sort_values(by=['Relevancy'], ascending=False)
125
+ st.write("")
126
+ st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
127
+ range_val = min(5,len(hits))
128
+ for i in range(range_val):
129
+ # the page number reflects the page that contains the main paragraph
130
+ # according to split limit, the overlapping part can be on a separate page
131
+ st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
132
+ st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
133
+
134
+ else:
135
+ st.info("🤔 No Economy Wide Target found")
136
+ df['Validation'] = 'No'
137
+ df_xlsx = to_excel(df)
138
+ st.download_button(label='📥 Download Current Result',
139
+ data=df_xlsx ,
140
+ file_name= 'file_target.xlsx')
141
+
142
+
143
+ else:
144
+ st.info("🤔 No document found, please try to upload it at the sidebar!")
145
+ logging.warning("Terminated as no document provided")
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+ # # Creating truth value dataframe
162
+ # if 'key1' in st.session_state:
163
+ # if st.session_state.key1 is not None:
164
+ # df = st.session_state.key1
165
+ # st.markdown("###### Select the threshold for classifier ######")
166
+ # c1, c2 = st.columns([1,1])
167
+
168
+ # with c1:
169
+ # threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
170
+ # step=0.01, value=0.5,
171
+ # help = "Keep High Value if want refined result, low if dont want to miss anything" )
172
+ # sectors =set(df.columns)
173
+ # removecols = {'Validation','Sectors','text'}
174
+ # sectors = list(sectors - removecols)
175
+
176
+ # # creating the dataframe for value counts of Labels, along with 'title' of Labels
177
+ # temp = df[df['Relevancy']>threshold]
178
+ # count_df = temp['Target Label'].value_counts()
179
+ # count_df = count_df.rename('count')
180
+ # count_df = count_df.rename_axis('Target Label').reset_index()
181
+ # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
182
+
183
+ # plt.rcParams['font.size'] = 25
184
+ # colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
185
+ # # plot
186
+ # fig, ax = plt.subplots()
187
+ # ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
188
+ # wedgeprops={"linewidth": 1, "edgecolor": "white"},
189
+ # textprops={'fontsize': 14},
190
+ # frame=False,labels =list(count_df.Label_def),
191
+ # labeldistance=1.2)
192
+ # st.markdown("#### Anything related to Targets? ####")
193
+
194
+ # c4, c5, c6 = st.columns([1,2,2])
195
+
196
+ # with c5:
197
+ # st.pyplot(fig)
198
+ # with c6:
199
+ # st.write(count_df[['Label_def','count']])
200
+
201
+ # st.write("")
202
+ # st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
203
+ # st.dataframe(df[df['Target Label'] == 'LABEL_1'].reset_index(drop = True))
204
+
205
+ # df['Validation'] = 'No'
206
+ # df_xlsx = to_excel(df)
207
+ # st.download_button(label='📥 Download Current Result',
208
+ # data=df_xlsx ,
209
+ # file_name= 'file_target.xlsx')
210
+
211
+
docStore/img/dsc_giz.png ADDED
docStore/img/ndc.png ADDED
docStore/img/paris.png ADDED
docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt ADDED
@@ -0,0 +1,737 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Ethiopia 2030: The Pathway to Prosperity
2
+ Ten Years Perspective Development Plan (2021 � 2030)
3
+ 1. Baselines and Assumptions
4
+ 2. Strategic pillars
5
+ 3. Departures
6
+ 4. Macroeconomic goals
7
+ 5. Implications of the COVID-19 pandemic and necessary mitigation measures
8
+ 6. Potentials/capabilities
9
+ 7. Focus areas
10
+ 7.1. Productive sectors
11
+ 7.2. Services sector
12
+ 7.3. Enabling sectors
13
+ 8. Balanced and competitive development (nationally, regionally and locally)
14
+ 9. Monitoring and Evaluation
15
+ Content
16
+ 1. Baselines and Assumptions
17
+ Poverty Reduction (%)
18
+ Key performances of previous years
19
+ 45.5 44.2
20
+ 38.7
21
+ 29.6
22
+ 23.5
23
+ 19
24
+ 0
25
+ 5
26
+ 10
27
+ 15
28
+ 20
29
+ 25
30
+ 30
31
+ 35
32
+ 40
33
+ 45
34
+ 50
35
+ 1994 2000 2005 2011 2016 2020
36
+ Percent
37
+ Year
38
+ Proportion of people living below poverty line
39
+ 10.5
40
+ 8.8
41
+ 10.1
42
+ 7.7
43
+ 9
44
+ 5.19-6.20
45
+ 0 2 4 6 8 10 12
46
+ GTP I: 2011-2015
47
+ GTP II: 2015/16
48
+ GTP II: 2016/17
49
+ GTP II: 2017/18
50
+ GTP II: 2018/19
51
+ GTP II: 2019/20 (projection, with
52
+ COVID-19)
53
+ GDP growth rate (%)
54
+ 1. Baselines and Assumptions
55
+ Share of economic sectors in GDP (%) Merchandise export as % of GDP
56
+ 8.66
57
+ 7.33
58
+ 6.57
59
+ 5.93
60
+ 4.91
61
+ 3.86 3.56 3.37
62
+ 2.77
63
+ 0
64
+ 1
65
+ 2
66
+ 3
67
+ 4
68
+ 5
69
+ 6
70
+ 7
71
+ 8
72
+ 9
73
+ 10
74
+ Percent
75
+ Year
76
+ 46.9
77
+ 45
78
+ 43.5
79
+ 41.4
80
+ 39.5
81
+ 37.1 35.9
82
+ 34.5
83
+ 32.8
84
+ 13.4
85
+ 15
86
+ 17.3
87
+ 18.8
88
+ 21
89
+ 23.5
90
+ 25.7 26.9 27.8
91
+ 4.7 4.8 5 5.3 5.6 6.1 6.9 6.8 6.8
92
+ 7.1
93
+ 8.6
94
+ 10.7 12
95
+ 14.2
96
+ 16.2
97
+ 17.8 19.1 20.1
98
+ 39.8 40.1 39.2 39.8 39.4 38.4 38.6 39.4
99
+ 0
100
+ 5
101
+ 10
102
+ 15
103
+ 20
104
+ 25
105
+ 30
106
+ 35
107
+ 40
108
+ 45
109
+ 50
110
+ 2010/11 2011/12 2012/13 2013/14 2014/15 2015/16 2016/17 2017/18 2018/19
111
+ Percent
112
+ Agriculture Industry Manufacturing Construction Services
113
+ 1. Baselines and Assumptions
114
+ Labour force participation (2013)
115
+ 73%
116
+ 7%
117
+ 20%
118
+ Agriculture
119
+ Industry
120
+ Services
121
+ 7%
122
+ 22%
123
+ 71%
124
+ Agriculture
125
+ Industry
126
+ Services
127
+ Urban labour force participation (2013)
128
+ 1. Baselines and Assumptions
129
+ High and increasing Unemployment Rate
130
+ � Urban unemployment rate = 19.1% in 2018
131
+ � Youth unemployment rate = 25.3 %
132
+ ? Male = 18.6%
133
+ ? Female 30.9 %
134
+ � Rural unemployment rate = 2% in 2013
135
+ � Declining per capita rural land creating
136
+ disguised unemployment
137
+ 402,869
138
+ 471,535
139
+ Male Female Total Male Female Total
140
+ 2014 2018
141
+ 15-19 yr. 20-24 yr. 25-29 yr. Linear (20-24 yr.)
142
+ Number of unemployed people in urban areas
143
+ 1. Baselines and Assumptions
144
+ Challenges
145
+ 1. Macroeconomic imbalances
146
+ ?Sustained high inflation
147
+ ?High and rising unemployment especially
148
+ in urban areas
149
+ ?High and rising debt burden
150
+ ?Chronic foreign currency shortage
151
+ ?Sluggish (though encouraging) rate of
152
+ structural change
153
+ 2. Vulnerability to shocks (COVID-19, Climate
154
+ changes, Desert Locust infestation, etc)
155
+ 3. Poor quality and high inequity in
156
+ infrastructure projects
157
+ 4. Poor quality services in health and
158
+ education
159
+ � High repetition and dropout rates from school
160
+ 1. Baselines and Assumptions
161
+ � Poor quality of growth and slow
162
+ structural change
163
+ � Excessive aid and loan
164
+ dependence for financing
165
+ infrastructural and construction
166
+ investments
167
+ � Limited success in expanding
168
+ manufacturing and modern
169
+ agriculture which have high job
170
+ creation potentials
171
+ � Weak institutional capacity as
172
+ the main culprit of all failures
173
+ ? Provision of quality services
174
+ (electricity, water, telephone,
175
+ internet)
176
+ ? Creation of enough jobs and
177
+ improved living standards
178
+ ? Generation of reliable foreign
179
+ exchange revenue and debtsustainable
180
+ national economic
181
+ capacity
182
+ ? Completion of development
183
+ projects and investment plans
184
+ under public-private
185
+ partnerships
186
+ � Low reward for merit, productivity and effort
187
+ while low disincentive for laziness, wastefulness
188
+ and corruption
189
+ � Slow institutional change and transformation in:
190
+ ? Government policies
191
+ ? Investor attitude
192
+ ? Youth behaviour
193
+ ? Role of the intellectuals
194
+ � The need for sustained increase in production
195
+ and productivity
196
+ � The need to set a common national vision to
197
+ achieve major successes with consensus and
198
+ popular legitimacy
199
+ Major areas of failure in the economy
200
+ 1. Baselines and Assumptions
201
+ � Poor quality of growth and slow
202
+ structural change
203
+ � Excessive aid and loan
204
+ dependence for financing
205
+ infrastructural and construction
206
+ investments
207
+ � Limited success in expanding
208
+ manufacturing and modern
209
+ agriculture which have high job
210
+ creation potentials
211
+ � Weak institutional capacity as
212
+ the main culprit of all failures
213
+ ? Provision of quality services
214
+ (electricity, water, telephone,
215
+ internet)
216
+ ? Creation of enough jobs and
217
+ improved living standards
218
+ ? Generation of reliable foreign
219
+ exchange revenue and debtsustainable
220
+ national economic
221
+ capacity
222
+ ? Completion of development
223
+ projects and investment plans
224
+ under public-private
225
+ partnerships
226
+ � Low reward for merit, productivity and effort
227
+ while low disincentive for laziness, wastefulness
228
+ and corruption
229
+ � Slow institutional change and transformation in:
230
+ ? Government policies
231
+ ? Investor attitude
232
+ ? Youth behaviour
233
+ ? Role of the intellectuals
234
+ � The need for sustained increase in production
235
+ and productivity
236
+ � The need to set a common national vision to
237
+ achieve major successes with consensus and
238
+ popular legitimacy
239
+ Major areas of failure in the economy
240
+ 2. Departures
241
+ 1. Emphasis on quality of economic growth
242
+ 2. Participation and coordination of sectors in the planning process
243
+ 3. Sectoral linkages and multi-sectoral development focus
244
+ 4. Preparation of national development corridors based on development potentials
245
+ 5. Focus on solving institutional bottlenecks
246
+ 6. The ongoing home grown economic reform programme as a sprinting board
247
+ 7. Emphasis on resilience building, innovation and entrepreneurship
248
+ 3. Strategic pillars
249
+ 1. Ensure quality growth
250
+ 2. Improve productivity and competitiveness
251
+ 3. Undertake institutional transformation
252
+ 4. Ensure private sector's leadership in the economy
253
+ 5. Ensure equitable participation of women and children
254
+ 6. Build climate resilient green economy
255
+ 3. Strategic pillars
256
+ � Increasing export revenues and substituting imports by
257
+ reducing production costs
258
+ � Availing quality and massive infrastructure
259
+ ? Linking infrastructural development with development corridors
260
+ � Producing required human resources with quality
261
+ � Producing enough and quality human resources
262
+ � Prioritizing innovative production systems
263
+ � Linking incentives with export revenue and job creation
264
+ performances
265
+ � Modernizing and enhancing the logistic system
266
+ � Creating technological competences needed for longterm
267
+ growth
268
+ � The economic growth should ensure:
269
+ ? Participation of all citizens and equitable utilization of the
270
+ growth proceeds
271
+ ? Improved standard of living of every citizen
272
+ ? Reduced poverty in all indicators
273
+ ? Reduced inflation and unemployment
274
+ � The economic growth should lead to increased
275
+ aggregate supply
276
+ � Focus on modern agriculture, manufacturing and
277
+ mining
278
+ � Emphasis on exploiting the sources of growth through
279
+ structural change
280
+ 1.Ensuring quality economic growth 2. Raising production and productivity
281
+ 3. Strategic pillars
282
+ � Build democratic and judicial institutions that ensure elite bargain,
283
+ national consensus, common vision and government legitimacy
284
+ � Build private sector and competition friendly bureaucracy
285
+ � Coordinate with parents, the society and teachers to make
286
+ educational institutions centers of excellence and virtuous citizens
287
+ � Coordinate with parents as well as social and religious leaders to
288
+ encourage religious institutions and their teachings contribute
289
+ towards poverty reduction efforts
290
+ � Prepare policies, strategies and legal frameworks for achieving
291
+ prosperity
292
+ � Increased focus on innovation and research
293
+ � Creating strong social security system
294
+ 3. Institutional Transformation 4. Private sector's leadership in the economy
295
+ � Create conducive investment climate and incentivize
296
+ domestic investors in key sectors
297
+ � Build strong and market-led public-private partnerships in
298
+ order to ensure the establishment of inclusive and
299
+ pragmatic market economy
300
+ � Enhance access and quality of infrastructure to attract
301
+ quality foreign direct investment
302
+ � Identify new sources of growth, empower and stimulate
303
+ the private sector, and supplement the private sector in
304
+ strategic areas
305
+ � Emphasis for public-private partnership on problem
306
+ solving innovations and research activities
307
+ 3. Strategic pillars
308
+ � Ensure gender equity in economic and social
309
+ sectors
310
+ ? Participation of women at all levels of education
311
+ ? Asset ownership of women
312
+ � Ensure fair participation of women and youth in
313
+ leadership and decision making positions
314
+ � Create awareness among citizens about the role of
315
+ women and youth in the country�s overall
316
+ development
317
+ � Increase basin development efforts to fight land
318
+ degradation and to reduce pollutions
319
+ � Improve productivity and reduce GHG emissions
320
+ � Increase forest protection and development
321
+ � Increase production of electricity from renewable
322
+ sources for domestic use and for export
323
+ � Focus on modern and energy saving technologies
324
+ 5. Equitable participation of women and children 6. Climate resilient green economy
325
+ 4. Macroeconomic Goals
326
+ Assumptions
327
+ ? Requirement to significantly reduce
328
+ poverty
329
+ ? Available national potentials
330
+ ? Potential for investment in the economy
331
+ ? Existing potentials in each sector
332
+ ? Low productivity that needs to be
333
+ improved
334
+ � Make Ethiopia a middle income
335
+ economy by 2022
336
+ � Raise per capita income to USD 1,115
337
+ in 2022
338
+ ? Threshold for middle-income is USD 1,026
339
+ ? Plus human development index and
340
+ economic vulnerability index
341
+ � Raise per capita income to USD 2,220
342
+ by 2030
343
+ Sectoral growth Targets (2021-2030)
344
+ Assured middle- income potential
345
+ 10.2%
346
+ Average
347
+ Growth
348
+ Target
349
+ Percentage of population below poverty line
350
+ 4. Macroeconomic Goals
351
+ Structural change
352
+ Financing Gaps
353
+ Reduce urban unemployment to less than 9%
354
+ ?1.36 million new jobs need to be
355
+ created per annum
356
+ Sectoral composition of GDP Labour force participation
357
+ Economic
358
+ Sectors
359
+ Performance Target
360
+ 2011 2015 2018/19 2030
361
+ Agriculture 45 39.7 32.8 22.0
362
+ Industry 15.1 21.2 27.6 35.9
363
+ Manufacturing 4.7 5.5 6.8 17.2
364
+ Services 39.9 39 39.4 42.1
365
+ 5. Implications of the COVID-19 pandemic and necessary mitigation measures
366
+ � GDP growth for 2019/20 fiscal year is projected to be lower than its target of 9.0% by between 2.81
367
+ and 3.80 percentage points (equivalent to 58.3 - 78.8 billion birr) due to COVID-19 pandemic
368
+ � If the current scenario continues, next year's GDP growth could decline by 2.8 percentage points
369
+ � Returning the economy to its high growth trajectory requires focusing on sectors with high
370
+ productivity and job creation potentials
371
+ � Public investment should focus on empowering the private sector
372
+ � Promoting both domestic and foreign investments with the right set of incentives (merit based)
373
+ � Modernizing production systems and improving uptake of technology
374
+ � Conducting demand analysis for export commodities to remedy for the declining trend in exports
375
+ and foreign exchange earnings.
376
+ 6. Potentials
377
+ � Endowment of various natural resources contributing to the growth potential
378
+ � Huge unutilized arable land creates great potential for the success of the plan
379
+ � Endowment of gemstones, ornamental, energy, metals, and metallic minerals
380
+ � Gold, coal, iron ore, potash, tantalum, marble, petroleum and other natural resources
381
+ Natural
382
+ Resources
383
+ � Large youth population and potential for demographic dividend
384
+ � Cumulative capacity in education and health
385
+ � Positive attitude and noble culture of reaching agreement among citizens
386
+ Human
387
+ capital
388
+ 6. Potentials
389
+ Built physical and material capitals
390
+ ?Transport and communication
391
+ ? Irrigation infrastructures for modern agriculture
392
+ ?Industrial Parks
393
+ ?Mega energy infrastructures
394
+ Physical
395
+ capital
396
+ Unexploited
397
+ growth
398
+ potentials
399
+ � Utilizing the tourism potential through modernization
400
+ � Using the mining subsector as a source of input as well as a competitive industry in its
401
+ own right
402
+ 6. Potentials
403
+ � Solving supply side bottlenecks to satisfy the existing demand
404
+ � Improving international acceptance and reliable partnerships
405
+ ? The "medemer"/synergy philosophy
406
+ ? The ongoing political reform measures
407
+ ? The Homegrown Economic Reform programme
408
+ � Increased finance from partners and multilateral institutions
409
+ ? Increased availability of foreign exchange
410
+ ? Reduced debt stress for the short to medium term
411
+ ? Increased potential for development
412
+ Increased
413
+ demand as
414
+ potential
415
+ Political Capital
416
+ Continental
417
+ and regional
418
+ integrations
419
+ � Regional and continental economic integration agreements
420
+ � International and continental free trade agreements
421
+ 6. Potentials
422
+ Low
423
+ technology as
424
+ a potential
425
+ � Undeniably low status of technological development
426
+ � International mobility and spillover effect of technology
427
+ � Potential for development and catching up by filling the technological gaps
428
+ � Doubling crop productivity from the current 24-36 quintals per hectare will result
429
+ in 7% increase in crop production
430
+ � Raise the production efficiency of manufacturing from the current 50% to 80%
431
+ 7. Focus Areas
432
+ 7.1. Productive sectors: agriculture, manufacturing, mining
433
+ 7.2. Service sector: tourism
434
+ 7.3. Enabling sectors: energy, transport, sustainable finance,
435
+ innovation and technology, urban development, irrigation,
436
+ human capital development
437
+ 7.1. Productive sectors
438
+ Agriculture Objectives
439
+ 1. Free agriculture from rain dependence
440
+ 2. Agricultural mechanization services
441
+ 3. Contract farming, cluster approach and
442
+ land consolidation
443
+ 4. Livestock, animal feed and animal health
444
+ 5. Horticulture (irrigation and urban farming)
445
+ 6. Private sector participation
446
+ 7. Institutional implementation capacity
447
+ 8. Climate resilient sustainable agricultural
448
+ development
449
+ 1. Improve income and livelihood options for farming and pastoral
450
+ communities through increased productivity and competitiveness
451
+ 2. Modernize agriculture and ensure national food and nutrition security
452
+ 3. Raise export of agricultural output and substitute imports
453
+ 4. Make agriculture a viable and profitable enterprise through value addition
454
+ 5. Create rural employment opportunities
455
+ 6. Enhance livestock health access and quality
456
+ 7. Preserve animal genetic resources and increase pastoral research
457
+ 8. Improve the development of animal feed and access to markets
458
+ 9. Develop livestock specific extension package for each livestock type
459
+ Focus Areas
460
+ 7.1. Productive sector
461
+ Manufacturing Industry
462
+ Objectives
463
+ 1. Production of quality and competitive food, textile, housing and
464
+ pharmaceutical products for export and domestic markets
465
+ 2. Production and productivity of existing manufacturing industries
466
+ 3. Utilization of locally available inputs
467
+ 4. Value chains, linkages and interdependencies
468
+ 5. Linkages between large scale metallurgical and engineering,
469
+ chemical and pharmaceutical industries with other industries
470
+ 6. Job creation, cluster approaches and expanding small and medium
471
+ scale manufacturing
472
+ 7. Private sector participation and partnership
473
+ 1. Establish basis for domestic industrialization
474
+ 2. Value addition through enhanced inter-sectoral
475
+ linkages
476
+ 3. Enhance productivity through private sector
477
+ leadership and supportive role of the
478
+ government
479
+ ? Create job opportunities for the youth leaving
480
+ agriculture and concentrating in urban areas
481
+ ? Make exportable commodities internationally
482
+ competitive
483
+ ? Ensure structural change
484
+ Focus areas
485
+ 7.1. Productive sectors
486
+ Mining
487
+ Objectives
488
+ � Foreign exchange earning and
489
+ domestic revenues
490
+ � Increased investment in mining
491
+ � Participation of manufacturing
492
+ industries that add value
493
+ � Job creation
494
+ � Add value for improved contribution of the subsector
495
+ � Increase inter-sectoral linkages to raise raw material inputs to other
496
+ sectors
497
+ � Make mining a competent subsector and induce structural change
498
+ � Increase human resource and technological capabilities through
499
+ research and trainings
500
+ � Raise foreign exchange revenue from mining through increased
501
+ exploration and production
502
+ � Improve traditional mining production and marketing systems
503
+ � Improve the country's geological information
504
+ Focus areas
505
+ 7.2. Service sector
506
+ Tourism
507
+ Objectives
508
+ � Identification and developing destinations
509
+ � Infrastructure
510
+ � Competitiveness
511
+ ?improve existing destinations
512
+ ?develop new destinations
513
+ ? diversify service and raise quality
514
+ � Market linkages, branding, and promotion
515
+ � Technology, research and development
516
+ � Preservation, maintenance and proper
517
+ utilization of heritage resources
518
+ � Expand job opportunities
519
+ � Raise incomes
520
+ � Build information management
521
+ systems
522
+ � Increase implementation capacity
523
+ Focus areas
524
+ 7.3. Enabling sectors
525
+ Urban development
526
+ Objectives
527
+ ? Prioritize productive sectors in job creation and enterprise
528
+ development plans
529
+ ? Rapid development and equity goals in land provision system
530
+ ? Participation of indigenous people in land redevelopment and
531
+ expansion
532
+ ? Urban land registration and cadaster system, modern
533
+ property valuation
534
+ ? Greenery and public spaces as well as waste disposal and
535
+ management in urban planning and implementation
536
+ ? Housing development and financing options to reduce
537
+ housing shortages
538
+ ? Integrated infrastructure and services provision
539
+ ? Role of private sector in infrastructure development and
540
+ service provision
541
+ � Expand micro and small-scale
542
+ enterprises to reduce urban
543
+ unemployment
544
+ � Develop and avail urban land based on
545
+ demand, equity and cost effectiveness
546
+ � Make quality housing accessible both in
547
+ rural and urban areas
548
+ � Develop quality and integrated
549
+ infrastructure as well as service
550
+ provision in towns
551
+ � Improve financial management and
552
+ resource utilization in urban areas
553
+ Focus areas
554
+ 7.3. Enabling sectors
555
+ Innovation and Technology
556
+ Objectives
557
+ ? Access to innovation and
558
+ technological information
559
+ ? Developing a digital economy
560
+ ? Productivity enhancement and
561
+ competitiveness
562
+ ? Build a digital economy
563
+ ? Develop national scientific research and technological
564
+ capabilities
565
+ ? Support problem solving research and development of
566
+ technologies necessary for raising production,
567
+ productivity and service provision
568
+ ? Create jobs and capital that are based on technology
569
+ ? Develop technological and data security protection
570
+ systems
571
+ Focus areas
572
+ 7.3. Enabling sectors
573
+ Sustainable finance
574
+ Objectives
575
+ � Access to modern finance and saving culture in rural
576
+ areas
577
+ � Support to the private sector and corporations to
578
+ reinvest profits in productive sectors
579
+ � Role of private financial institutions in manufacturing
580
+ and agriculture
581
+ � Digital revenue collection system
582
+ � Tax equity (contraband, tax evasion, and bringing the
583
+ underground economy to the tax system)
584
+ � Domestic and foreign strategic partnerships
585
+ � Transform financing from short term to long-term,
586
+ sustainable and quality sources
587
+ � Ensure financing quality based on sectoral prioritization
588
+ and reduction of wastage
589
+ � Increase the number of domestic saving institutions both
590
+ in rural and urban areas
591
+ � Support domestic finance with foreign exchange capacity
592
+ and foreign direct investment
593
+ � Modernize domestic revenue collection system
594
+ � Raise voluntary tax payment attitude
595
+ � Bring the informal sector to the formal tax system
596
+ Focus areas
597
+ 7.3. Enabling sectors
598
+ Transport
599
+ Objectives
600
+ � Access to infrastructure
601
+ � Implementation capacity
602
+ � Participation of the private sector and the general
603
+ public
604
+ � Financing capacity
605
+ � Ensure equitable access to transport infrastructure and
606
+ services
607
+ � Improve transport safety
608
+ � Make logistics services fast and reliable
609
+ � Build transport infrastructure and service that is
610
+ resilient to climate change
611
+ Focus areas
612
+ 7.3. Enabling sectors
613
+ Energy
614
+ Objectives
615
+ ? Equity in access to electricity services
616
+ ? Energy access and quality
617
+ ? Alternative sources of energy
618
+ ? Reliability of electricity infrastructure
619
+ ? Investment and income in energy subsector
620
+ � Ensure equitable access to transport
621
+ infrastructure and services
622
+ � Improve transport safety
623
+ � Make logistics services fast and reliable
624
+ � Build transport infrastructure and service that is
625
+ resilient to climate change
626
+ Focus areas
627
+ 7.3. Enabling sectors
628
+ Irrigation
629
+ Objectives
630
+ ? Medium and large scale irrigation infrastructure
631
+ ? Job creation
632
+ ? Share of government expenditure and alternative
633
+ financing options
634
+ ? Institutional capacity and human resource
635
+ development
636
+ ? Improve agricultural output and productivity
637
+ ? Reduce government spending and enhance
638
+ institutional capacity and human resources
639
+ development
640
+ ? Ensure the inclusion of all genders and
641
+ disabled citizens
642
+ ? Develop alternative financing options for
643
+ irrigation development
644
+ Focus areas
645
+ 7.3. Enabling sectors
646
+ Human capital development
647
+ Objectives
648
+ � Make education and training inclusive and equitable by
649
+ harmonizing the system with ability, need and capacity
650
+ � Develop capacity of educational institutions (teacher capacity,
651
+ inputs and technology)
652
+ � Establish education and training quality assurance system
653
+ � Avail free and compulsory education for pre-primary to junior
654
+ secondary levels and free education at the senior secondary levels
655
+ equitably
656
+ � Ensure the relevance of education and training system and
657
+ synchronize education policy with economic and social
658
+ development needs
659
+ � Make the education and training policy compatible with the
660
+ nation's contemporary capacities as well as global and regional
661
+ market opportunities
662
+ � Enhance commitment, capability and responsibility of citizens
663
+ ? Ensure equitable and quality health services
664
+ ? Raise average life expectancy
665
+ ? Achieve universal health coverage through
666
+ proactive and prevention health system
667
+ ? Curtail preventable maternal and child deaths
668
+ ? Reduce incidences of contagious and noncontagious
669
+ related diseases and deaths
670
+ ? Build capacity for health tourism through
671
+ increased treatment capabilities
672
+ ? Create a healthy society that is free from
673
+ addictions and use technology for supporting
674
+ knowledge led economic development
675
+ Focus areas
676
+ 8 Nationally, regionally and locally balanced and competitive development
677
+ 1. Lack of synchronization of investment with
678
+ resource potentials and development needs
679
+ 2. Poor alignment of federal, regional and
680
+ district level investment plans with the
681
+ national development goals and envisioned
682
+ settlement patterns
683
+ 3. Poor regional coordination due to low
684
+ consideration for trans-regional and
685
+ spatial issues in development plans of
686
+ regional states
687
+ 4. Inter-regional and intra-regional
688
+ disparities in infrastructural development
689
+ and access to services
690
+ Challenges
691
+ 8. Nationally, regionally and locally balanced and competitive development
692
+ 1. Ensure that the investment flow and
693
+ infrastructural development plans fairly go hand in
694
+ hand with resource potential and development
695
+ needs
696
+ ?Developing underutilized natural resources
697
+ ?Equitable distribution and access to
698
+ infrastructure
699
+ ?Sustainable environmental protection
700
+ 2. Ensure the inclusion of pastoral and agro-pastoral
701
+ areas in the development
702
+ ?Focused infrastructural development in pastoral
703
+ areas such as education and health sector input
704
+ provision as well as governance
705
+ ?Market linkages with other areas and the central
706
+ markets
707
+ ?Improve rural finance (credit and insurance) to
708
+ encourage fattening, milk processing, leather
709
+ production and irrigation agriculture
710
+ Focus areas
711
+ 9. Monitoring and Evaluation
712
+ 10 Years Perspective
713
+ Plan KPIs
714
+ Federal Implementing
715
+ Institutions
716
+ Planning and
717
+ Development Commission
718
+ Generate Data (Census,
719
+ Sample and administrative
720
+ data)
721
+ Annual Reports
722
+ Dialogue forums
723
+ (Civic Organizations, professional
724
+ associations, development partners,
725
+ intellectuals)
726
+ Central Statistical Agency
727
+ Database
728
+ National
729
+ Information Portal
730
+ National Statistics
731
+ Development Strategic
732
+ plan
733
+ Evaluation Reports
734
+ Prime Minister's Office
735
+ House of People's
736
+ Representatives
737
+ Thank you!
docStore/sample/Seychelles-revised_first_ndc-EN.pdf ADDED
Binary file (372 kB). View file
 
docStore/sample/South Africa_s Low Emission Development Strategy.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd18bff36fff79b97c5a343912f1296ea2d9d5481cf92c2887774fb4f2800418
3
+ size 1503168
docStore/sample/files.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"Ethiopia: 10 Year Development Plan":"docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt",
2
+ "Seychells:Revised NDC":"docStore/sample/Seychelles-revised_first_ndc-EN.pdf",
3
+ "South Africa:Low Emission strategy":"docStore/sample/South Africa_s Low Emission Development Strategy.pdf"
4
+ }
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ poppler-utils
2
+ xpdf
3
+ tesseract-ocr
4
+ libtesseract-dev
paramconfig.cfg ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [target]
2
+ THRESHOLD = 0.50
3
+ MODEL = mtyrrell/ikitracs_economywide
4
+ SPLIT_BY = word
5
+ REMOVE_PUNC = 0
6
+ SPLIT_LENGTH = 60
7
+ SPLIT_OVERLAP = 10
8
+ RESPECT_SENTENCE_BOUNDARY = 1
9
+ TOP_KEY = 10
10
+
11
+ [netzero]
12
+ THRESHOLD = 0.50
13
+ MODEL = ilaria-oneofftech/ikitracks_netzero
14
+ SPLIT_BY = word
15
+ REMOVE_PUNC = 0
16
+ SPLIT_LENGTH = 60
17
+ SPLIT_OVERLAP = 10
18
+ RESPECT_SENTENCE_BOUNDARY = 1
19
+ TOP_KEY = 10
20
+
21
+ [sector]
22
+ THRESHOLD = 0.50
23
+ MODEL = ppsingh/bert-multilabel-sector-classifier
24
+ SPLIT_BY = word
25
+ REMOVE_PUNC = 0
26
+ SPLIT_LENGTH = 60
27
+ SPLIT_OVERLAP = 10
28
+ RESPECT_SENTENCE_BOUNDARY = 1
29
+ TOP_KEY = 10
30
+
31
+ [adapmit]
32
+ THRESHOLD = 0.50
33
+ MODEL = ppsingh/mpnet-adaptation_mitigation-classifier
34
+ SPLIT_BY = word
35
+ REMOVE_PUNC = 0
36
+ SPLIT_LENGTH = 60
37
+ SPLIT_OVERLAP = 10
38
+ RESPECT_SENTENCE_BOUNDARY = 1
39
+ TOP_KEY = 10
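
Each section above follows the same schema (threshold, model name, and splitting parameters). As a minimal, illustrative sketch of how one section could be read with Python's standard configparser, assuming the file is saved as paramconfig.cfg in the working directory:

import configparser

config = configparser.ConfigParser()
config.read("paramconfig.cfg")

target = config["target"]
model_name = target["MODEL"]                 # mtyrrell/ikitracs_economywide
threshold = float(target["THRESHOLD"])       # 0.50
split_length = int(target["SPLIT_LENGTH"])   # 60 words per synthetic paragraph
print(model_name, threshold, split_length)
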
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ farm-haystack == 1.16
2
+ farm-haystack[ocr,pdf]==1.16.0
3
+ spacy==3.2.0
4
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
5
+ matplotlib==3.5.1
6
+ nltk==3.7
7
+ numpy==1.22.1
8
+ pandas==1.4.0
9
+ pdfplumber==0.6.2
10
+ Pillow==9.1.1
11
+ seaborn==0.11.2
12
+ transformers==4.25.1
13
+ st-annotated-text==3.0.0
14
+ markdown==3.4.1
15
+ summa==1.2.0
16
+ plotly
17
+ xlsxwriter
18
+ streamlit-aggrid
19
+ python-docx
style.css ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ .row-widget.stTextInput > div:first-of-type {
3
+ background: #fff;
4
+ display: flex;
5
+ border: 1px solid #dfe1e5;
6
+ box-shadow: none;
7
+ border-radius: 24px;
8
+ height: 50px;
9
+ width: auto;
10
+ margin: 10px auto 30px;
11
+ }
12
+
13
+ .row-widget.stTextInput > div:first-of-type:hover,
14
+ .row-widget.stTextInput > div:first-of-type:focus {
15
+ box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
16
+ }
17
+
18
+ .row-widget.stTextInput .st-bq {
19
+ background-color: #fff;
20
+ }
21
+
22
+ .row-widget.stTextInput > label {
23
+ color: #b3b3b3;
24
+ }
25
+
26
+ .row-widget.stButton > button {
27
+ border-radius: 24px;
28
+ background-color: #B6C9B1;
29
+ color: #fff;
30
+ border: none;
31
+ padding: 6px 20px;
32
+ float: right;
33
+ background-image: none;
34
+ }
35
+
36
+ .row-widget.stButton > button:hover {
37
+ box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
38
+ }
39
+
40
+ .row-widget.stButton > button:focus {
41
+ border: none;
42
+ color: #fff;
43
+ }
44
+
45
+ .footer-custom {
46
+ position: fixed;
47
+ bottom: 0;
48
+ width: 100%;
49
+ color: var(--text-color);
50
+ max-width: 698px;
51
+ font-size: 14px;
52
+ height: 50px;
53
+ padding: 10px 0;
54
+ z-index: 50;
55
+ }
56
+
57
+ .main {
58
+ padding: 20px;
59
+ }
60
+
61
+ footer {
62
+ display: none !important;
63
+ }
64
+
65
+ .footer-custom a {
66
+ color: var(--text-color);
67
+ }
68
+
69
+ #wikipedia-assistant {
70
+ font-size: 36px;
71
+ }
72
+
73
+ .generated-answer p {
74
+ font-size: 16px;
75
+ font-weight: bold;
76
+ }
77
+
78
+ .react-json-view {
79
+ margin: 40px 0 80px;
80
+ }
81
+
82
+ .tooltip {
83
+ text-align: center;
84
+ line-height: 20px;
85
+ display: table-caption;
86
+ font-size: 10px;
87
+ border-radius: 50%;
88
+ height: 20px;
89
+ width: 20px;
90
+ position: relative;
91
+ cursor: pointer;
92
+ color:#000;
93
+ }
94
+
95
+ .tooltip .tooltiptext {
96
+ visibility: hidden;
97
+ width: 280px;
98
+ text-align: center;
99
+ border-radius: 6px;
100
+ padding: 10px;
101
+ position: absolute;
102
+ z-index: 1;
103
+ top: 25px;
104
+ left: 50%;
105
+ margin-left: -140px;
106
+ font-size: 14px;
107
+ background-color: #fff;
108
+ border: 1px solid #ccc;
109
+ box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
110
+ color: #000;
111
+ }
112
+
113
+ .tooltip:hover .tooltiptext {
114
+ visibility: visible;
115
+ }
116
+
117
+ .sentence-wrapper {
118
+ border-left: 4px solid #ffc423;
119
+ padding-left: 20px;
120
+ margin-bottom: 40px;
121
+ }
122
+
123
+ #context {
124
+ padding: 2rem 0 1rem;
125
+ }
126
+
127
+ hr {
128
+ margin: 2em 0 1em;
129
+ }
130
+
131
+
132
+ .technical-details-info {
133
+ margin-bottom: 100px;
134
+ }
135
+
136
+ .loader-wrapper {
137
+ display: flex;
138
+ align-items: center;
139
+ background-color: rgba(250, 202, 43, 0.2);
140
+ padding: 15px 20px;
141
+ border-radius: 6px;
142
+ }
143
+
144
+ .loader-wrapper p {
145
+ margin-bottom: 0;
146
+ margin-left: 20px;
147
+ }
148
+
149
+ .loader {
150
+ width: 30px;
151
+ height: 30px;
152
+ border: dotted 5px #868686;
153
+ border-radius: 100%;
154
+ animation: spin 1s linear infinite;
155
+ }
156
+
157
+ .loader-note {
158
+ font-size: 14px;
159
+ color: #b3b3b3;
160
+ margin-left: 5px;
161
+ }
162
+
163
+ @keyframes spin {
164
+ 0% {
165
+ transform: rotate(0deg) scale(0.8);
166
+ border-top-color: transparent;
167
+ border-right-color: transparent;
168
+ }
169
+ 50% { transform: rotate(180deg) scale(1.2);
170
+ border-color: #949494;
171
+ border-top-color: transparent;
172
+ border-right-color: transparent;
173
+ }
174
+ 100% { transform: rotate(360deg) scale(0.8);
175
+ border-color: #bbbbbb;
176
+ border-top-color: transparent;
177
+ border-right-color: transparent;
178
+ }
179
+ }
180
+
utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # adding for package implementation
utils/adapmit_classifier.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.schema import Document
2
+ from typing import List, Tuple
3
+ from typing_extensions import Literal
4
+ import logging
5
+ import pandas as pd
6
+ from pandas import DataFrame, Series
7
+ from utils.config import getconfig
8
+ from utils.preprocessing import processingpipeline
9
+ import streamlit as st
10
+ from haystack.nodes import TransformersDocumentClassifier
11
+ from transformers import pipeline
12
+
13
+ @st.cache_resource
14
+ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
15
+ """
16
+ loads the document classifier using haystack, where the name/path of model
17
+ in HF-hub as string is used to fetch the model object.Either configfile or
18
+ model should be passed.
19
+ 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
20
+ 2. https://docs.haystack.deepset.ai/docs/document_classifier
21
+ Params
22
+ --------
23
+ config_file: config file path from which to read the model name
24
+ classifier_name: if modelname is passed, it takes a priority if not \
25
+ found then will look for configfile, else raise error.
26
+ Return: document classifier model
27
+ """
28
+ if not classifier_name:
29
+ if not config_file:
30
+ logging.warning("Pass either model name or config file")
31
+ return
32
+ else:
33
+ config = getconfig(config_file)
34
+ classifier_name = config.get('adapmit','MODEL')
35
+
36
+ logging.info("Loading Adaptation Mitigation classifier")
37
+ # doc_classifier = TransformersDocumentClassifier(
38
+ # model_name_or_path=classifier_name,
39
+ # task="text-classification",
40
+ # top_k = None)
41
+ doc_classifier = pipeline("text-classification",
42
+ model=classifier_name,
43
+ return_all_scores=True,
44
+ function_to_apply= "sigmoid")
45
+
46
+
47
+ return doc_classifier
48
+
49
+
50
+ def runAdapMitPreprocessingPipeline(file_name:str, file_path:str,
51
+ split_by: Literal["sentence", "word"] = 'sentence',
52
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
53
+ split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
54
+ """
55
+ creates the pipeline and runs the preprocessing pipeline,
56
+ the params for pipeline are fetched from paramconfig
57
+ Params
58
+ ------------
59
+ file_name: filename, in case of streamlit application use
60
+ st.session_state['filename']
61
+ file_path: filepath, in case of streamlit application use st.session_state['filepath']
62
+ split_by: document splitting strategy either as word or sentence
63
+ split_length: when synthetically creating the paragraphs from the document,
64
+ it defines the length of paragraph.
65
+ split_respect_sentence_boundary: Used when using 'word' strategy for
66
+ splitting of text.
67
+ split_overlap: Number of words or sentences that overlap when creating
68
+ the paragraphs. This is done as one sentence or 'some words' make sense
69
+ when read in together with others. Therefore the overlap is used.
70
+ remove_punc: to remove all Punctuation including ',' and '.' or not
71
+ Return
72
+ --------------
73
+ List[Document]: When preprocessing pipeline is run, the output dictionary
74
+ has four objects. For the downstream Haystack-based classification we
75
+ need to use the List of Haystack Documents, which can be fetched by
76
+ key = 'documents' on output.
77
+ """
78
+
79
+ adapmit_processing_pipeline = processingpipeline()
80
+
81
+ output_adapmit_pre = adapmit_processing_pipeline.run(file_paths = file_path,
82
+ params= {"FileConverter": {"file_path": file_path, \
83
+ "file_name": file_name},
84
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
85
+ "split_by": split_by, \
86
+ "split_length":split_length,\
87
+ "split_overlap": split_overlap, \
88
+ "split_respect_sentence_boundary":split_respect_sentence_boundary}})
89
+
90
+ return output_adapmit_pre
91
+
92
+
93
+ @st.cache_data
94
+ def adapmit_classification(haystack_doc:List[Document],
95
+ threshold:float = 0.5,
96
+ classifier_model:pipeline= None
97
+ )->Tuple[DataFrame,Series]:
98
+ """
99
+ Text-Classification on the list of texts provided. Classifier provides the
100
+ most appropriate label for each text. Here the labels indicate whether a
101
+ paragraph relates to climate Adaptation and/or Mitigation.
102
+ Params
103
+ ---------
104
+ haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
105
+ contains the list of paragraphs in different format,here the list of
106
+ Haystack Documents is used.
107
+ threshold: threshold value for the model to keep the results from classifier
108
+ classifiermodel: you can pass the classifier model directly,which takes priority
109
+ however if not then looks for model in streamlit session.
110
+ In case of streamlit avoid passing the model directly.
111
+ Returns
112
+ ----------
113
+ df: Dataframe with a 'text' column plus one score column per
114
+ classifier label, rounded to two decimals; despite the type hint only
115
+ the dataframe is returned.
116
+ """
117
+ logging.info("Working on Adaptation-Mitigation Identification")
118
+ if not classifier_model:
119
+ classifier_model = st.session_state['adapmit_classifier']
120
+
121
+ predictions = classifier_model(haystack_doc)
122
+ # converting the predictions to desired format
123
+ list_ = []
124
+ for i in range(len(predictions)):
125
+
126
+ temp = predictions[i]
127
+ placeholder = {}
128
+ for j in range(len(temp)):
129
+ placeholder[temp[j]['label']] = temp[j]['score']
130
+ list_.append(placeholder)
131
+ labels_ = [{**{'text':haystack_doc[l]},**list_[l]} for l in range(len(predictions))]
132
+ # labels_= [{**l.meta['classification']['details'],**{'text':l.content}} for l in results]
133
+ df = DataFrame.from_dict(labels_)
134
+ df = df.round(2)
135
+
136
+ return df
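
A rough usage sketch for the two helpers above, intended to run inside a Streamlit script (as in the appStore modules) from the repository root so that paramconfig.cfg resolves; the sample paragraphs are invented for illustration:

import streamlit as st
from utils.adapmit_classifier import load_adapmitClassifier, adapmit_classification

# Cache the multilabel adaptation/mitigation pipeline and store it under the
# session key that adapmit_classification looks up when no model is passed.
st.session_state['adapmit_classifier'] = load_adapmitClassifier(config_file='paramconfig.cfg')

# The helper iterates over one prediction list per text, so a plain list of
# paragraph strings (e.g. the 'paraList' output of the preprocessing pipeline) works.
paragraphs = ["Increase production of electricity from renewable sources.",
              "Expand drought-resistant seed varieties for smallholder farmers."]
df = adapmit_classification(paragraphs)   # one score column per label, plus 'text'
st.dataframe(df)
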
utils/config.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import configparser
2
+ import logging
3
+
4
+ def getconfig(configfile_path:str):
5
+ """
6
+ configfile_path: file path of .cfg file
7
+ """
8
+
9
+ config = configparser.ConfigParser()
10
+
11
+ try:
12
+ config.read_file(open(configfile_path))
13
+ return config
14
+ except:
15
+ logging.warning("config file not found")
16
+
17
+
18
+ # Declare all the necessary variables
19
+ def get_classifier_params(model_name):
20
+ config = getconfig('paramconfig.cfg')
21
+ params = {}
22
+ params['model_name'] = config.get(model_name,'MODEL')
23
+ params['split_by'] = config.get(model_name,'SPLIT_BY')
24
+ params['split_length'] = int(config.get(model_name,'SPLIT_LENGTH'))
25
+ params['split_overlap'] = int(config.get(model_name,'SPLIT_OVERLAP'))
26
+ params['remove_punc'] = bool(int(config.get(model_name,'REMOVE_PUNC')))
27
+ params['split_respect_sentence_boundary'] = bool(int(config.get(model_name,'RESPECT_SENTENCE_BOUNDARY')))
28
+ params['threshold'] = float(config.get(model_name,'THRESHOLD'))
29
+ params['top_n'] = int(config.get(model_name,'TOP_KEY'))
30
+
31
+ return params
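
For illustration, get_classifier_params collapses one section of paramconfig.cfg into a plain dict; with the [netzero] section shown earlier the call would look roughly like this (run from the repository root so the hard-coded 'paramconfig.cfg' path resolves):

from utils.config import get_classifier_params

params = get_classifier_params("netzero")
# params["model_name"] -> "ilaria-oneofftech/ikitracks_netzero"
# params["threshold"]  -> 0.5
# params["split_by"]   -> "word" (split_length 60, split_overlap 10)
# params["top_n"]      -> 10
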
utils/netzero_classifier.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.nodes import TransformersDocumentClassifier
2
+ from haystack.schema import Document
3
+ from typing import List, Tuple
4
+ from typing_extensions import Literal
5
+ import logging
6
+ import pandas as pd
7
+ from pandas import DataFrame, Series
8
+ from utils.config import getconfig
9
+ from utils.preprocessing import processingpipeline
10
+ import streamlit as st
11
+
12
+ # Labels dictionary ###
13
+ _lab_dict = {
14
+ 'NEGATIVE':'NO NETZERO TARGET',
15
+ 'NETZERO':'NETZERO TARGET',
16
+ }
17
+
18
+ @st.cache_resource
19
+ def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
20
+ """
21
+ loads the document classifier using haystack, where the name/path of model
22
+ in HF-hub as string is used to fetch the model object.Either configfile or
23
+ model should be passed.
24
+ 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
25
+ 2. https://docs.haystack.deepset.ai/docs/document_classifier
26
+ Params
27
+ --------
28
+ config_file: config file path from which to read the model name
29
+ classifier_name: if modelname is passed, it takes a priority if not \
30
+ found then will look for configfile, else raise error.
31
+ Return: document classifier model
32
+ """
33
+ if not classifier_name:
34
+ if not config_file:
35
+ logging.warning("Pass either model name or config file")
36
+ return
37
+ else:
38
+ config = getconfig(config_file)
39
+ classifier_name = config.get('netzero','MODEL')
40
+
41
+ logging.info("Loading netzero classifier")
42
+ doc_classifier = TransformersDocumentClassifier(
43
+ model_name_or_path=classifier_name,
44
+ task="text-classification")
45
+
46
+ return doc_classifier
47
+
48
+
49
+ def runNetZeroPreprocessingPipeline(file_name:str, file_path:str,
50
+ split_by: Literal["sentence", "word"] = 'sentence',
51
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
52
+ split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
53
+ """
54
+ creates the pipeline and runs the preprocessing pipeline,
55
+ the params for pipeline are fetched from paramconfig
56
+ Params
57
+ ------------
58
+ file_name: filename, in case of streamlit application use
59
+ st.session_state['filename']
60
+ file_path: filepath, in case of streamlit application use st.session_state['filepath']
61
+ split_by: document splitting strategy either as word or sentence
62
+ split_length: when synthetically creating the paragrpahs from document,
63
+ it defines the length of paragraph.
64
+ split_respect_sentence_boundary: Used when using 'word' strategy for
65
+ splititng of text.
66
+ split_overlap: Number of words or sentences that overlap when creating
67
+ the paragraphs. This is done as one sentence or 'some words' make sense
68
+ when read in together with others. Therefore the overlap is used.
69
+ remove_punc: to remove all Punctuation including ',' and '.' or not
70
+ Return
71
+ --------------
72
+ List[Document]: When preprocessing pipeline is run, the output dictionary
73
+ has four objects. For the downstream Haystack-based classification we
74
+ need to use the List of Haystack Documents, which can be fetched by
75
+ key = 'documents' on output.
76
+ """
77
+
78
+ netzero_processing_pipeline = processingpipeline()
79
+
80
+ output_netzero_pre = netzero_processing_pipeline.run(file_paths = file_path,
81
+ params= {"FileConverter": {"file_path": file_path, \
82
+ "file_name": file_name},
83
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
84
+ "split_by": split_by, \
85
+ "split_length":split_length,\
86
+ "split_overlap": split_overlap, \
87
+ "split_respect_sentence_boundary":split_respect_sentence_boundary}})
88
+
89
+ return output_netzero_pre
90
+
91
+
92
+ @st.cache_data
93
+ def netzero_classification(haystack_doc:List[Document],
94
+ threshold:float = 0.8,
95
+ classifier_model:TransformersDocumentClassifier= None
96
+ )->Tuple[DataFrame,Series]:
97
+ """
98
+ Text-Classification on the list of texts provided. Classifier provides the
99
+ most appropriate label for each text. Here the labels indicate whether a
100
+ paragraph contains a net-zero target or not (see _lab_dict above).
101
+ Params
102
+ ---------
103
+ haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
104
+ contains the list of paragraphs in different format,here the list of
105
+ Haystack Documents is used.
106
+ threshold: threshold value for the model to keep the results from classifier
107
+ classifiermodel: you can pass the classifier model directly,which takes priority
108
+ however if not then looks for model in streamlit session.
109
+ In case of streamlit avoid passing the model directly.
110
+ Returns
111
+ ----------
112
+ df: Dataframe with columns ['Target Label', 'Relevancy', 'page', 'text',
113
+ 'Label_def'], sorted by relevancy; despite the type hint only the
114
+ dataframe is returned.
115
+ """
116
+ logging.info("Working on Netzero Extraction")
117
+ if not classifier_model:
118
+ classifier_model = st.session_state['netzero_classifier']
119
+
120
+ results = classifier_model.predict(haystack_doc)
121
+ labels_= [(l.meta['classification']['label'],
122
+ l.meta['classification']['score'],l.meta['page'],l.content,) for l in results]
123
+
124
+ df = DataFrame(labels_, columns=["Target Label","Relevancy", "page","text"])
125
+
126
+ df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
127
+ df.index += 1
128
+ # df =df[df['Relevancy']>threshold]
129
+ df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
130
+
131
+ # creating the dataframe for value counts of Labels, along with 'title' of Labels
132
+ # count_df = df['Target Label'].value_counts()
133
+ # count_df = count_df.rename('count')
134
+ # count_df = count_df.rename_axis('Target Label').reset_index()
135
+ # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
136
+
137
+ return df
utils/preprocessing.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.nodes.base import BaseComponent
2
+ from haystack.schema import Document
3
+ from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
4
+ from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
5
+ from typing import Callable, Dict, List, Optional, Text, Tuple, Union
6
+ from typing_extensions import Literal
7
+ import pandas as pd
8
+ import logging
9
+ import re
10
+ import string
11
+ from haystack.pipelines import Pipeline
12
+
13
+ def useOCR(file_path: str)-> Text:
14
+ """
15
+ Converts image pdfs into text, Using the Farm-haystack[OCR]
16
+
17
+ Params
18
+ ----------
19
+ file_path: file_path of uploaded file, returned by add_upload function in
20
+ uploadAndExample.py
21
+
22
+ Returns the text file as string.
23
+ """
24
+
25
+
26
+ converter = PDFToTextOCRConverter(remove_numeric_tables=True,
27
+ valid_languages=["eng"])
28
+ docs = converter.convert(file_path=file_path, meta=None)
29
+ return docs[0].content
30
+
31
+
32
+
33
+
34
+ class FileConverter(BaseComponent):
35
+ """
36
+ Wrapper class to convert uploaded document into text by calling appropriate
37
+ Converter class, will use internally haystack PDFToTextOCR in case of image
38
+ pdf. Cannot use the FileClassifier from haystack as it doesn't have any
39
+ label/output class for image.
40
+
41
+ 1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
42
+ 2. https://docs.haystack.deepset.ai/docs/file_converters
43
+ 3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
44
+ 4. https://docs.haystack.deepset.ai/reference/file-converters-api
45
+
46
+
47
+ """
48
+
49
+ outgoing_edges = 1
50
+
51
+ def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
52
+ id_hash_keys: Optional[List[str]] = None,
53
+ ) -> Tuple[dict,str]:
54
+ """ this is required method to invoke the component in
55
+ the pipeline implementation.
56
+
57
+ Params
58
+ ----------
59
+ file_name: name of file
60
+ file_path: file_path of uploaded file, returned by add_upload function in
61
+ uploadAndExample.py
62
+
63
+ See the links provided in Class docstring/description to see other params
64
+
65
+ Return
66
+ ---------
67
+ output: dictionary, with key as identifier and value could be anything
68
+ we need to return. In this case it's the List of Haystack Documents
69
+
70
+ output_1: As there is only one outgoing edge, we pass 'output_1' string
71
+ """
72
+ try:
73
+ if file_name.endswith('.pdf'):
74
+ converter = PDFToTextConverter(remove_numeric_tables=True)
75
+ if file_name.endswith('.txt'):
76
+ converter = TextConverter(remove_numeric_tables=True)
77
+ if file_name.endswith('.docx'):
78
+ converter = DocxToTextConverter()
79
+ except Exception as e:
80
+ logging.error(e)
81
+ return
82
+
83
+
84
+
85
+ documents = []
86
+
87
+
88
+ # encoding is empty, probably should be utf-8
89
+ document = converter.convert(
90
+ file_path=file_path, meta=None,
91
+ encoding=encoding, id_hash_keys=id_hash_keys
92
+ )[0]
93
+
94
+ text = document.content
95
+
96
+ # in case of scanned/images only PDF the content might contain only
97
+ # the page separator (\f or \x0c). We check whether that is the case
98
+ # and, if so, use OCR to get the text.
99
+ filtered = re.sub(r'\x0c', '', text)
100
+
101
+ if filtered == "":
102
+ logging.info("Using OCR")
103
+ text = useOCR(file_path)
104
+
105
+ documents.append(Document(content=text,
106
+ meta={"name": file_name},
107
+ id_hash_keys=id_hash_keys))
108
+
109
+ logging.info('file conversion successful')
110
+ output = {'documents': documents}
111
+ return output, 'output_1'
112
+
113
+ def run_batch(self):
114
+ """
115
+ we don't have a requirement to process multiple files in one go,
116
+ therefore nothing happens here; however, to use the custom node we need
117
+ to define this method for the class.
118
+ """
119
+
120
+ return
121
+
122
+
123
+ def basic(s:str, remove_punc:bool = False):
124
+
125
+ """
126
+ Performs basic cleaning of text.
127
+
128
+ Params
129
+ ----------
130
+ s: string to be processed
131
+ remove_punc: to remove all punctuation including ',' and '.' or not
132
+
133
+ Returns: processed string: see comments in the source code for more info
134
+ """
135
+
136
+ # Remove URLs
137
+ s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
138
+ s = re.sub(r"http\S+", " ", s)
139
+
140
+ # Remove new line characters
141
+ s = re.sub('\n', ' ', s)
142
+
143
+ # Remove punctuations
144
+ if remove_punc == True:
145
+ translator = str.maketrans(' ', ' ', string.punctuation)
146
+ s = s.translate(translator)
147
+ # Remove distracting single quotes and dotted pattern
148
+ s = re.sub("\'", " ", s)
149
+ s = s.replace("..","")
150
+
151
+ return s.strip()
152
+
153
+ def paraLengthCheck(paraList, max_len = 512):
154
+ new_para_list = []
155
+ for passage in paraList:
156
+ if len(passage.split()) > max_len:
157
+ iterations = int(len(passage.split())/max_len)
158
+ # # st.write("Splitting")
159
+ for i in range(iterations):
160
+ temp = " ".join(passage.split()[max_len*i:max_len*(i+1)])
161
+ new_para_list.append(temp)
162
+ temp = " ".join(passage.split()[max_len*(i+1):])
163
+ new_para_list.append(temp)
164
+ else:
165
+ new_para_list.append(passage)
166
+
167
+ return new_para_list
168
+
169
+ class UdfPreProcessor(BaseComponent):
170
+ """
171
+ class to preprocess the document returned by FileConverter. It will check
172
+ for splitting strategy and splits the document by word or sentences and then
173
+ synthetically create the paragraphs.
174
+
175
+ 1. https://docs.haystack.deepset.ai/docs/preprocessor
176
+ 2. https://docs.haystack.deepset.ai/reference/preprocessor-api
177
+ 3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
178
+
179
+ """
180
+ outgoing_edges = 1
181
+
182
+ def run(self, documents:List[Document], remove_punc:bool=False,
183
+ split_by: Literal["sentence", "word"] = 'sentence',
184
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
185
+ split_overlap:int = 0):
186
+
187
+ """ this is required method to invoke the component in
188
+ the pipeline implementation.
189
+
190
+ Params
191
+ ----------
192
+ documents: documents from the output dictionary returned by Fileconverter
193
+ remove_punc: to remove all Punctuation including ',' and '.' or not
194
+ split_by: document splitting strategy either as word or sentence
195
+ split_length: when synthetically creating the paragraphs from the document,
196
+ it defines the length of paragraph.
197
+ split_respect_sentence_boundary: Used when using 'word' strategy for
198
+ splitting of text.
199
+ split_overlap: Number of words or sentences that overlap when creating
200
+ the paragraphs. This is done as one sentence or 'some words' make sense
201
+ when read in together with others. Therefore the overlap is used.
202
+
203
+ Return
204
+ ---------
205
+ output: dictionary, with key as identifier and value could be anything
206
+ we need to return. In this case the output will contain 4 objects
207
+ the paragraphs text list as List, Haystack document, Dataframe and
208
+ one raw text file.
209
+
210
+ output_1: As there is only one outgoing edge, we pass 'output_1' string
211
+
212
+ """
213
+
214
+ if split_by == 'sentence':
215
+ split_respect_sentence_boundary = False
216
+
217
+ else:
218
+ split_respect_sentence_boundary = split_respect_sentence_boundary
219
+
220
+ preprocessor = PreProcessor(
221
+ clean_empty_lines=True,
222
+ clean_whitespace=True,
223
+ clean_header_footer=True,
224
+ split_by=split_by,
225
+ split_length=split_length,
226
+ split_respect_sentence_boundary= split_respect_sentence_boundary,
227
+ split_overlap=split_overlap,
228
+
229
+ # will add page number only in case of PDF not for text/docx file.
230
+ add_page_number=True
231
+ )
232
+
233
+ for i in documents:
234
+ # # basic cleaning before passing it to preprocessor.
235
+ # i = basic(i)
236
+ docs_processed = preprocessor.process([i])
237
+ for item in docs_processed:
238
+ item.content = basic(item.content, remove_punc= remove_punc)
239
+
240
+ df = pd.DataFrame(docs_processed)
241
+ all_text = " ".join(df.content.to_list())
242
+ para_list = df.content.to_list()
243
+ logging.info('document split into {} paragraphs'.format(len(para_list)))
244
+ output = {'documents': docs_processed,
245
+ 'dataframe': df,
246
+ 'text': all_text,
247
+ 'paraList': para_list
248
+ }
249
+ return output, "output_1"
250
+ def run_batch(self):
251
+ """
252
+ we don't have a requirement to process multiple files in one go,
253
+ therefore nothing happens here; however, to use the custom node we need
254
+ to define this method for the class.
255
+ """
256
+ return
257
+
258
+ def processingpipeline():
259
+ """
260
+ Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
261
+ from utils.preprocessing
262
+
263
+ """
264
+
265
+ preprocessing_pipeline = Pipeline()
266
+ file_converter = FileConverter()
267
+ custom_preprocessor = UdfPreProcessor()
268
+
269
+ preprocessing_pipeline.add_node(component=file_converter,
270
+ name="FileConverter", inputs=["File"])
271
+ preprocessing_pipeline.add_node(component = custom_preprocessor,
272
+ name ='UdfPreProcessor', inputs=["FileConverter"])
273
+
274
+ return preprocessing_pipeline
275
+
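
A minimal sketch of running this pipeline on the Ethiopia sample text shipped in docStore/sample (the parameter keys mirror the FileConverter and UdfPreProcessor nodes defined above; the split values follow paramconfig.cfg):

from utils.preprocessing import processingpipeline

file_name = "Ethiopia_s_2021_10 Year Development Plan.txt"
file_path = "docStore/sample/" + file_name

pipeline = processingpipeline()
output = pipeline.run(
    file_paths=file_path,
    params={"FileConverter": {"file_path": file_path, "file_name": file_name},
            "UdfPreProcessor": {"remove_punc": False,
                                "split_by": "word",
                                "split_length": 60,
                                "split_overlap": 10,
                                "split_respect_sentence_boundary": True}})

# UdfPreProcessor returns the Haystack Documents plus a dataframe, the joined
# text and the paragraph list under the keys shown in its run() method.
paragraphs = output["paraList"]
print(len(paragraphs), "paragraphs")
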
utils/sector_classifier.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.schema import Document
2
+ from typing import List, Tuple
3
+ from typing_extensions import Literal
4
+ import logging
5
+ import pandas as pd
6
+ from pandas import DataFrame, Series
7
+ from utils.config import getconfig
8
+ from utils.preprocessing import processingpipeline
9
+ import streamlit as st
10
+ from haystack.nodes import TransformersDocumentClassifier
11
+ from transformers import pipeline
12
+
13
+
14
+ # # Labels dictionary ###
15
+ # _lab_dict = {
16
+ # 'NEGATIVE':'NO NETZERO TARGET',
17
+ # 'NETZERO':'NETZERO TARGET',
18
+ # }
19
+
20
+ @st.cache_resource
21
+ def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
22
+ """
23
+ loads the document classifier using haystack, where the name/path of model
24
+ in HF-hub as string is used to fetch the model object.Either configfile or
25
+ model should be passed.
26
+ 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
27
+ 2. https://docs.haystack.deepset.ai/docs/document_classifier
28
+ Params
29
+ --------
30
+ config_file: config file path from which to read the model name
31
+ classifier_name: if modelname is passed, it takes a priority if not \
32
+ found then will look for configfile, else raise error.
33
+ Return: document classifier model
34
+ """
35
+ if not classifier_name:
36
+ if not config_file:
37
+ logging.warning("Pass either model name or config file")
38
+ return
39
+ else:
40
+ config = getconfig(config_file)
41
+ classifier_name = config.get('sector','MODEL')
42
+
43
+ logging.info("Loading sector classifier")
44
+ # we are using the pipeline as the model is multilabel and DocumentClassifier
45
+ # from Haystack doesnt support multilabel
46
+ # in pipeline we use 'sigmoid' to explicitly tell pipeline to make it multilabel
47
+ # if not then it will automatically use softmax, which is not a desired thing.
48
+ # doc_classifier = TransformersDocumentClassifier(
49
+ # model_name_or_path=classifier_name,
50
+ # task="text-classification",
51
+ # top_k = None)
52
+
53
+ doc_classifier = pipeline("text-classification",
54
+ model=classifier_name,
55
+ return_all_scores=True,
56
+ function_to_apply= "sigmoid")
57
+
58
+ return doc_classifier
59
+
60
+
61
+ def runSectorPreprocessingPipeline(file_name:str, file_path:str,
62
+ split_by: Literal["sentence", "word"] = 'sentence',
63
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
64
+ split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
65
+ """
66
+ creates the pipeline and runs the preprocessing pipeline,
67
+ the params for pipeline are fetched from paramconfig
68
+ Params
69
+ ------------
70
+ file_name: filename, in case of streamlit application use
71
+ st.session_state['filename']
72
+ file_path: filepath, in case of streamlit application use st.session_state['filepath']
73
+ split_by: document splitting strategy either as word or sentence
74
+ split_length: when synthetically creating the paragrpahs from document,
75
+ it defines the length of paragraph.
76
+ split_respect_sentence_boundary: Used when using 'word' strategy for
77
+ splititng of text.
78
+ split_overlap: Number of words or sentences that overlap when creating
79
+ the paragraphs. This is done as one sentence or 'some words' make sense
80
+ when read in together with others. Therefore the overlap is used.
81
+ remove_punc: to remove all Punctuation including ',' and '.' or not
82
+ Return
83
+ --------------
84
+ List[Document]: When preprocessing pipeline is run, the output dictionary
85
+ has four objects. For the Haysatck implementation of SDG classification we,
86
+ need to use the List of Haystack Document, which can be fetched by
87
+ key = 'documents' on output.
88
+ """
89
+
90
+ sector_processing_pipeline = processingpipeline()
91
+
92
+ output_sector_pre = sector_processing_pipeline.run(file_paths = file_path,
93
+ params= {"FileConverter": {"file_path": file_path, \
94
+ "file_name": file_name},
95
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
96
+ "split_by": split_by, \
97
+ "split_length":split_length,\
98
+ "split_overlap": split_overlap, \
99
+ "split_respect_sentence_boundary":split_respect_sentence_boundary}})
100
+
101
+ return output_sector_pre
102
+
103
+
104
+ @st.cache_data
105
+ def sector_classification(haystack_doc:List[Document],
106
+ threshold:float = 0.8,
107
+ classifier_model:TransformersDocumentClassifier= None
108
+ )->Tuple[DataFrame,Series]:
109
+ """
110
+ Text-Classification on the list of texts provided. Classifier provides the
111
+ most appropriate label for each text. these labels are in terms of if text
112
+ belongs to which particular Sustainable Devleopment Goal (SDG).
113
+ Params
114
+ ---------
115
+ haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
116
+ contains the list of paragraphs in different format,here the list of
117
+ Haystack Documents is used.
118
+ threshold: threshold value for the model to keep the results from classifier
119
+ classifiermodel: you can pass the classifier model directly,which takes priority
120
+ however if not then looks for model in streamlit session.
121
+ In case of streamlit avoid passing the model directly.
122
+ Returns
123
+ ----------
124
+ df: Dataframe with two columns['SDG:int', 'text']
125
+ x: Series object with the unique SDG covered in the document uploaded and
126
+ the number of times it is covered/discussed/count_of_paragraphs.
127
+ """
128
+ logging.info("Working on Sector Identification")
129
+ if not classifier_model:
130
+ classifier_model = st.session_state['sector_classifier']
131
+
132
+ predictions = classifier_model(haystack_doc)
133
+ list_ = []
134
+ for i in range(len(predictions)):
135
+
136
+ temp = predictions[i]
137
+ placeholder = {}
138
+ for j in range(len(temp)):
139
+ placeholder[temp[j]['label']] = temp[j]['score']
140
+ list_.append(placeholder)
141
+ labels_ = [{**{'text':haystack_doc[l]},**list_[l]} for l in range(len(predictions))]
142
+ # labels_= [{**l.meta['classification']['details'],**{'text':l.content}} for l in results]
143
+ df = DataFrame.from_dict(labels_)
144
+ df = df.round(2)
145
+
146
+ return df
utils/target_classifier.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.nodes import TransformersDocumentClassifier
2
+ from haystack.schema import Document
3
+ from typing import List, Tuple
4
+ from typing_extensions import Literal
5
+ import logging
6
+ import pandas as pd
7
+ from pandas import DataFrame, Series
8
+ from utils.config import getconfig
9
+ from utils.preprocessing import processingpipeline
10
+ import streamlit as st
11
+
12
+ ## Labels dictionary ###
13
+ _lab_dict = {
14
+ 'LABEL_0':'NO TARGET INFO',
15
+ 'LABEL_1':'ECONOMY-WIDE TARGET',
16
+ }
17
+
18
+ @st.cache_resource
19
+ def load_targetClassifier(config_file:str = None, classifier_name:str = None):
20
+ """
21
+ loads the document classifier using haystack, where the name/path of model
22
+ in HF-hub as string is used to fetch the model object.Either configfile or
23
+ model should be passed.
24
+ 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
25
+ 2. https://docs.haystack.deepset.ai/docs/document_classifier
26
+ Params
27
+ --------
28
+ config_file: config file path from which to read the model name
29
+ classifier_name: if modelname is passed, it takes a priority if not \
30
+ found then will look for configfile, else raise error.
31
+ Return: document classifier model
32
+ """
33
+ if not classifier_name:
34
+ if not config_file:
35
+ logging.warning("Pass either model name or config file")
36
+ return
37
+ else:
38
+ config = getconfig(config_file)
39
+ classifier_name = config.get('target','MODEL')
40
+
41
+ logging.info("Loading classifier")
42
+ doc_classifier = TransformersDocumentClassifier(
43
+ model_name_or_path=classifier_name,
44
+ task="text-classification")
45
+
46
+ return doc_classifier
47
+
48
+
49
+ def runTargetPreprocessingPipeline(file_name:str, file_path:str,
50
+ split_by: Literal["sentence", "word"] = 'sentence',
51
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
52
+ split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
53
+ """
54
+ creates the pipeline and runs the preprocessing pipeline,
55
+ the params for pipeline are fetched from paramconfig
56
+ Params
57
+ ------------
58
+ file_name: filename, in case of streamlit application use
59
+ st.session_state['filename']
60
+ file_path: filepath, in case of streamlit application use st.session_state['filepath']
61
+ split_by: document splitting strategy either as word or sentence
62
+ split_length: when synthetically creating the paragraphs from the document,
63
+ it defines the length of paragraph.
64
+ split_respect_sentence_boundary: Used when using 'word' strategy for
65
+ splitting of text.
66
+ split_overlap: Number of words or sentences that overlap when creating
67
+ the paragraphs. This is done as one sentence or 'some words' make sense
68
+ when read in together with others. Therefore the overlap is used.
69
+ remove_punc: to remove all Punctuation including ',' and '.' or not
70
+ Return
71
+ --------------
72
+ List[Document]: When preprocessing pipeline is run, the output dictionary
73
+ has four objects. For the downstream Haystack-based classification we
74
+ need to use the List of Haystack Documents, which can be fetched by
75
+ key = 'documents' on output.
76
+ """
77
+
78
+ target_processing_pipeline = processingpipeline()
79
+
80
+ output_target_pre = target_processing_pipeline.run(file_paths = file_path,
81
+ params= {"FileConverter": {"file_path": file_path, \
82
+ "file_name": file_name},
83
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
84
+ "split_by": split_by, \
85
+ "split_length":split_length,\
86
+ "split_overlap": split_overlap, \
87
+ "split_respect_sentence_boundary":split_respect_sentence_boundary}})
88
+
89
+ return output_target_pre
90
+
91
+
92
+ @st.cache_data
93
+ def target_classification(haystack_doc:List[Document],
94
+ threshold:float = 0.8,
95
+ classifier_model:TransformersDocumentClassifier= None
96
+ )->Tuple[DataFrame,Series]:
97
+ """
98
+ Text-Classification on the list of texts provided. Classifier provides the
99
+ most appropriate label for each text. Here the labels indicate whether a
100
+ paragraph contains an economy-wide target or no target information.
101
+ Params
102
+ ---------
103
+ haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
104
+ contains the list of paragraphs in different format,here the list of
105
+ Haystack Documents is used.
106
+ threshold: threshold value for the model to keep the results from classifier
107
+ classifiermodel: you can pass the classifier model directly,which takes priority
108
+ however if not then looks for model in streamlit session.
109
+ In case of streamlit avoid passing the model directly.
110
+ Returns
111
+ ----------
112
+ df: Dataframe with columns ['Target Label', 'Relevancy', 'page', 'text',
113
+ 'Label_def'], sorted by relevancy; despite the type hint only the
114
+ dataframe is returned.
115
+ """
116
+ logging.info("Working on Target Extraction")
117
+ if not classifier_model:
118
+ classifier_model = st.session_state['target_classifier']
119
+
120
+ results = classifier_model.predict(haystack_doc)
121
+ labels_= [(l.meta['classification']['label'],
122
+ l.meta['classification']['score'],l.meta['page'],l.content,) for l in results]
123
+
124
+
125
+ df = DataFrame(labels_, columns=["Target Label","Relevancy","page","text"])
126
+
127
+ df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
128
+ df.index += 1
129
+ # df =df[df['Relevancy']>threshold]
130
+ df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
131
+
132
+ # creating the dataframe for value counts of Labels, along with 'title' of Labels
133
+ # count_df = df['Target Label'].value_counts()
134
+ # count_df = count_df.rename('count')
135
+ # count_df = count_df.rename_axis('Target Label').reset_index()
136
+ # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
137
+
138
+ return df
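
Tying the pieces together for the economy-wide target flow, a hedged end-to-end sketch meant to sit inside a Streamlit script (the classifier is stored under the session key the helper expects; outside a running app the cache decorators may only emit warnings):

import streamlit as st
from utils.target_classifier import (load_targetClassifier,
                                     runTargetPreprocessingPipeline,
                                     target_classification)

st.session_state['target_classifier'] = load_targetClassifier(config_file='paramconfig.cfg')

output = runTargetPreprocessingPipeline(
    file_name='Ethiopia_s_2021_10 Year Development Plan.txt',
    file_path='docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt',
    split_by='word', split_length=60, split_overlap=10,
    split_respect_sentence_boundary=True)

# The Haystack document classifier predicts on Document objects, not raw strings.
df = target_classification(output['documents'])
st.dataframe(df[['Target Label', 'Relevancy', 'page', 'Label_def']])
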
utils/uploadAndExample.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tempfile
3
+ import json
4
+
5
+ def add_upload(choice):
6
+ """
7
+ Provides the user with the choice to either 'Upload Document' or 'Try Example'.
8
+ Based on the user's choice it runs the Streamlit processes and saves the path and name of
9
+ the 'file' to streamlit session_state which then can be fetched later.
10
+
11
+ """
12
+
13
+ if choice == 'Upload Document':
14
+ uploaded_file = st.sidebar.file_uploader('Upload the File',
15
+ type=['pdf', 'docx', 'txt'])
16
+ if uploaded_file is not None:
17
+ with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
18
+ bytes_data = uploaded_file.getvalue()
19
+ temp.write(bytes_data)
20
+ st.session_state['filename'] = uploaded_file.name
21
+ st.session_state['filepath'] = temp.name
22
+
23
+
24
+ else:
25
+ # listing the options
26
+ with open('docStore/sample/files.json','r') as json_file:
27
+ files = json.load(json_file)
28
+
29
+ option = st.sidebar.selectbox('Select the example document',
30
+ list(files.keys()))
31
+ file_name = file_path = files[option]
32
+ st.session_state['filename'] = file_name
33
+ st.session_state['filepath'] = file_path