ppsingh commited on
Commit
6d737a4
·
1 Parent(s): 031e5e2
app.py CHANGED
@@ -2,19 +2,43 @@ import appStore.target as target_extraction
2
  import appStore.netzero as netzero
3
  import appStore.sector as sector
4
  import appStore.adapmit as adapmit
5
- # import appStore.info as info
6
- from appStore.multiapp import MultiApp
 
7
  import streamlit as st
8
 
9
  st.set_page_config(page_title = 'Climate Policy Intelligence',
10
  initial_sidebar_state='expanded', layout="wide")
11
 
12
- app = MultiApp()
 
 
 
 
 
 
 
13
 
14
- # app.add_app("About","house", info.app)
15
- app.add_app("Economy-Wide Target Extraction","gear",target_extraction.app)
16
- app.add_app("NetZero Target Extraction","gear", netzero.app)
17
- app.add_app("Sector Classification","gear", sector.app)
18
- app.add_app("Adaptation-Mitigation","gear", adapmit.app)
19
 
20
- app.run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import appStore.netzero as netzero
3
  import appStore.sector as sector
4
  import appStore.adapmit as adapmit
5
+ import appStore.ghg as ghg
6
+ import appStore.doc_processing as processing
7
+ from utils.uploadAndExample import add_upload
8
  import streamlit as st
9
 
10
  st.set_page_config(page_title = 'Climate Policy Intelligence',
11
  initial_sidebar_state='expanded', layout="wide")
12
 
13
# Sidebar: let the user choose between uploading a document and trying the
# bundled example; add_upload() receives the selected option.
with st.sidebar:
    # Inside `with st.sidebar` plain st.* calls already render in the sidebar.
    # The help text uses implicit string concatenation so that no source
    # indentation ends up inside the tooltip.
    choice = st.radio(label='Select the Document',
                      help=('You can upload the document '
                            'or else you can try an example document'),
                      options=('Upload Document', 'Try Example'),
                      horizontal=True)
    add_upload(choice)

# Main page header.
with st.container():
    st.markdown("<h2 style='text-align: center; color: black;'> Climate Policy Intelligence App </h2>", unsafe_allow_html=True)
    st.write(' ')
 
 
25
 
26
+ # with st.expander("ℹ️ - About this app", expanded=False):
27
+ # st.write(
28
+ # """
29
+ # Climate Policy Understanding App is an open-source\
30
+ # digital tool which aims to assist policy analysts and \
31
+ # other users in extracting and filtering relevant \
32
+ # information from public documents.
33
+ # """)
34
+ # st.write("")
35
# Processing stages, executed in this order when the button is pressed.
# The stages visible in this commit exchange their data via st.session_state.
apps = [processing.app, target_extraction.app, netzero.app, ghg.app,
        sector.app, adapmit.app]
if st.button("Get the work done"):
    prg = st.progress(0)
    for done, func in enumerate(apps, start=1):
        func()
        # Integer percentage of completed stages. Computing it from the
        # running count (instead of a pre-truncated per-stage step) makes the
        # bar end at exactly 100 after the last stage.
        prg.progress(done * 100 // len(apps))
43
+ if 'key1' in st.session_state:
44
+ st.write(st.session_state.key1)
appStore/adapmit.py CHANGED
@@ -8,10 +8,7 @@ import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
  import streamlit as st
11
- # from st_aggrid import AgGrid
12
- # from st_aggrid.shared import ColumnsAutoSizeMode
13
- from utils.adapmit_classifier import adapmit_classification
14
- from utils.adapmit_classifier import runAdapMitPreprocessingPipeline, load_adapmitClassifier
15
  # from utils.keyword_extraction import textrank
16
  import logging
17
  logger = logging.getLogger(__name__)
@@ -48,122 +45,87 @@ def to_excel(df):
48
 
49
  def app():
50
 
51
- #### APP INFO #####
52
- with st.container():
53
- st.markdown("<h1 style='text-align: center; color: black;'> Adaptation-Mitigation Classification </h1>", unsafe_allow_html=True)
54
- st.write(' ')
55
- st.write(' ')
56
-
57
- with st.expander("ℹ️ - About this app", expanded=False):
58
-
59
- st.write(
60
- """
61
- The **Adaptation-Mitigation Classification** app is an easy-to-use interface built \
62
- in Streamlit for analyzing policy documents for \
63
- Classification of the paragraphs/texts in the document *If it \
64
- belongs to 'Adaptation' and 'Mitigation' category or not. The paragraph \
65
- can belong to both category too. \
66
- - developed by GIZ Data Service Center, GFA, IKI Tracs, \
67
- SV Klima and SPA. \n
68
- """)
69
- st.write("""**Document Processing:** The Uploaded/Selected document is \
70
- automatically cleaned and split into paragraphs with a maximum \
71
- length of 60 words using a Haystack preprocessing pipeline. The \
72
- length of 60 is an empirical value which should reflect the length \
73
- of a “context” and should limit the paragraph length deviation. \
74
- However, since we want to respect the sentence boundary the limit \
75
- can breach and hence this limit of 60 is tentative. \n
76
- """)
77
-
78
- st.write("")
79
-
80
  ### Main app code ###
81
  with st.container():
82
- if st.button("RUN Adaptation-Mitigation Classification"):
83
- if 'key4' not in st.session_state:
84
- st.session_state['key4'] = None
85
 
86
- if 'filepath' in st.session_state:
87
- file_name = st.session_state['filename']
88
- file_path = st.session_state['filepath']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
 
91
- all_documents = runAdapMitPreprocessingPipeline(file_name= file_name,
92
- file_path= file_path, split_by= params['split_by'],
93
- split_length= params['split_length'],
94
- split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
95
- split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
96
- classifier = load_adapmitClassifier(classifier_name=params['model_name'])
97
- st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
98
- verified_paralist = paraLengthCheck(all_documents['paraList'], 100)
99
- if len(verified_paralist) > 100:
100
- warning_msg = ": This might take sometime, please sit back and relax."
101
- else:
102
- warning_msg = ""
103
-
104
- # # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
105
- df = adapmit_classification(haystack_doc=verified_paralist,
106
- threshold= params['threshold'])
107
-
108
- threshold= params['threshold']
109
- truth_df = df.drop(['text'],axis=1)
110
- truth_df = truth_df.astype(float) >= threshold
111
- truth_df = truth_df.astype(str)
112
- categories = list(truth_df.columns)
113
-
114
- placeholder = {}
115
- for val in categories:
116
- placeholder[val] = dict(truth_df[val].value_counts())
117
- count_df = pd.DataFrame.from_dict(placeholder)
118
- count_df = count_df.T
119
- count_df = count_df.reset_index()
120
- # st.write(count_df)
121
- placeholder = []
122
- for i in range(len(count_df)):
123
- placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
124
- placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
125
- count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
126
- # st.write("Total Paragraphs: {}".format(len(df)))
127
- fig = px.bar(count_df, y='category', x='count',
128
- color='truth_value',orientation='h', height =200)
129
- c1, c2 = st.columns([1,1])
130
- with c1:
131
- st.plotly_chart(fig,use_container_width= True)
132
-
133
- truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
134
- truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
135
- # st.write(truth_df)
136
- df = pd.concat([df,truth_df['labels']],axis=1)
137
- st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
138
- df = df.sort_values(by = ['Mitigation'], ascending=False)
139
- for i in range(3):
140
- if df.iloc[i]['Mitigation'] >= 0.50:
141
- st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
142
- st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
143
 
144
- st.markdown("###### Top few 'Adaptation' related paragraph/text ######")
145
- df = df.sort_values(by = ['Adaptation'], ascending=False)
146
- for i in range(3):
147
- if df.iloc[i]['Adaptation'] > 0.5:
148
- st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Adaptation']))
149
- st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
150
- # st.write(df[['text','labels']])
151
- df['Validation'] = 'No'
152
- df['Val-Mitigation'] = 'No'
153
- df['Val-Adaptation'] = 'No'
154
- df_xlsx = to_excel(df)
155
- st.download_button(label='📥 Download Current Result',
156
- data=df_xlsx ,
157
- file_name= 'file_adaptation-mitigation.xlsx')
158
- # st.session_state.key4 =
159
-
160
- # category =set(df.columns)
161
- # removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
162
- # category = list(category - removecols)
163
-
164
- else:
165
- st.info("🤔 No document found, please try to upload it at the sidebar!")
166
- logging.warning("Terminated as no document provided")
167
 
168
  # # Creating truth value dataframe
169
  # if 'key4' in st.session_state:
 
8
  import numpy as np
9
  import pandas as pd
10
  import streamlit as st
11
+ from utils.adapmit_classifier import load_adapmitClassifier,adapmit_classification
 
 
 
12
  # from utils.keyword_extraction import textrank
13
  import logging
14
  logger = logging.getLogger(__name__)
 
45
 
46
  def app():
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  ### Main app code ###
49
  with st.container():
 
 
 
50
 
51
+ if 'key1' in st.session_state:
52
+ df = st.session_state.key1
53
+
54
+ classifier = load_adapmitClassifier(classifier_name=params['model_name'])
55
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
56
+ if sum(df['Target Label'] == 'TARGET') > 100:
57
+ warning_msg = ": This might take sometime, please sit back and relax."
58
+ else:
59
+ warning_msg = ""
60
+
61
+ df = adapmit_classification(haystack_doc=df,
62
+ threshold= params['threshold'])
63
+
64
+ st.session_state.key1 = df
65
+
66
+
67
+
68
 
69
 
70
+ # threshold= params['threshold']
71
+ # truth_df = df.drop(['text'],axis=1)
72
+ # truth_df = truth_df.astype(float) >= threshold
73
+ # truth_df = truth_df.astype(str)
74
+ # categories = list(truth_df.columns)
75
+
76
+ # placeholder = {}
77
+ # for val in categories:
78
+ # placeholder[val] = dict(truth_df[val].value_counts())
79
+ # count_df = pd.DataFrame.from_dict(placeholder)
80
+ # count_df = count_df.T
81
+ # count_df = count_df.reset_index()
82
+ # # st.write(count_df)
83
+ # placeholder = []
84
+ # for i in range(len(count_df)):
85
+ # placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
86
+ # placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
87
+ # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
88
+ # # st.write("Total Paragraphs: {}".format(len(df)))
89
+ # fig = px.bar(count_df, y='category', x='count',
90
+ # color='truth_value',orientation='h', height =200)
91
+ # c1, c2 = st.columns([1,1])
92
+ # with c1:
93
+ # st.plotly_chart(fig,use_container_width= True)
94
+
95
+ # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
96
+ # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
97
+ # # st.write(truth_df)
98
+ # df = pd.concat([df,truth_df['labels']],axis=1)
99
+ # st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
100
+ # df = df.sort_values(by = ['Mitigation'], ascending=False)
101
+ # for i in range(3):
102
+ # if df.iloc[i]['Mitigation'] >= 0.50:
103
+ # st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
104
+ # st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
+ # st.markdown("###### Top few 'Adaptation' related paragraph/text ######")
107
+ # df = df.sort_values(by = ['Adaptation'], ascending=False)
108
+ # for i in range(3):
109
+ # if df.iloc[i]['Adaptation'] > 0.5:
110
+ # st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Adaptation']))
111
+ # st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
112
+ # # st.write(df[['text','labels']])
113
+ # df['Validation'] = 'No'
114
+ # df['Val-Mitigation'] = 'No'
115
+ # df['Val-Adaptation'] = 'No'
116
+ # df_xlsx = to_excel(df)
117
+ # st.download_button(label='📥 Download Current Result',
118
+ # data=df_xlsx ,
119
+ # file_name= 'file_adaptation-mitigation.xlsx')
120
+ # # st.session_state.key4 =
121
+
122
+ # # category =set(df.columns)
123
+ # # removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
124
+ # # category = list(category - removecols)
125
+
126
+ # else:
127
+ # st.info("🤔 No document found, please try to upload it at the sidebar!")
128
+ # logging.warning("Terminated as no document provided")
129
 
130
  # # Creating truth value dataframe
131
  # if 'key4' in st.session_state:
appStore/doc_processing.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+ from typing import List, Tuple
5
+ from typing_extensions import Literal
6
+ from haystack.schema import Document
7
+ from utils.config import get_classifier_params
8
+ from utils.preprocessing import processingpipeline,paraLengthCheck
9
+ import streamlit as st
10
+ import logging
11
+ import pandas as pd
12
+ params = get_classifier_params("preprocessing")
13
+
14
@st.cache_data
def runPreprocessingPipeline(file_name:str, file_path:str,
        split_by: Literal["sentence", "word"] = 'sentence',
        split_length:int = 2, split_respect_sentence_boundary:bool = False,
        split_overlap:int = 0, remove_punc:bool = False) -> dict:
    """
    Creates the preprocessing pipeline and runs it on one document.

    Params
    ------------
    file_name: filename, in case of streamlit application use
        st.session_state['filename']
    file_path: filepath, in case of streamlit application use
        st.session_state['filepath']
    split_by: document splitting strategy, either 'word' or 'sentence'
    split_length: when synthetically creating the paragraphs from the
        document, it defines the length of a paragraph.
    split_respect_sentence_boundary: used with the 'word' strategy for
        splitting of text.
    split_overlap: number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make
        sense when read together with others, therefore the overlap is used.
    remove_punc: whether to remove all punctuation including ',' and '.'

    Return
    --------------
    The complete output of the pipeline run (not just a list of documents).
    The caller in this module reads the processed paragraphs from it with
    key = 'documents'.
    """

    processing_pipeline = processingpipeline()

    # Parameters are routed to the pipeline nodes by node name.
    output_pre = processing_pipeline.run(
        file_paths=file_path,
        params={
            "FileConverter": {
                "file_path": file_path,
                "file_name": file_name,
            },
            "UdfPreProcessor": {
                "remove_punc": remove_punc,
                "split_by": split_by,
                "split_length": split_length,
                "split_overlap": split_overlap,
                "split_respect_sentence_boundary": split_respect_sentence_boundary,
            },
        })

    return output_pre
56
+
57
+
58
def app():
    """
    Preprocess the selected document and store its paragraphs.

    Reads 'filename' and 'filepath' from st.session_state, runs the
    preprocessing pipeline with the module-level `params`, and saves the
    resulting dataframe (columns 'text' and 'page') in
    st.session_state['key0']. When no 'filepath' is present an info message
    is shown and a warning is logged instead.
    """
    with st.container():
        if 'filepath' not in st.session_state:
            st.info("🤔 No document found, please try to upload it at the sidebar!")
            logging.warning("Terminated as no document provided")
            return

        pipeline_output = runPreprocessingPipeline(
            file_name=st.session_state['filename'],
            file_path=st.session_state['filepath'],
            split_by=params['split_by'],
            split_length=params['split_length'],
            split_respect_sentence_boundary=params['split_respect_sentence_boundary'],
            split_overlap=params['split_overlap'],
            remove_punc=params['remove_punc'])

        checked_paragraphs = paraLengthCheck(pipeline_output['documents'], 100)

        # saving the dataframe to session state
        st.session_state['key0'] = pd.DataFrame(checked_paragraphs,
                                                columns=['text', 'page'])
appStore/ghg.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ from utils.ghg_classifier import load_ghgClassifier, ghg_classification
12
+ import logging
13
+ logger = logging.getLogger(__name__)
14
+ from utils.config import get_classifier_params
15
+ from io import BytesIO
16
+ import xlsxwriter
17
+ import plotly.express as px
18
+
19
+
20
+ # Declare all the necessary variables
21
+ classifier_identifier = 'ghg'
22
+ params = get_classifier_params(classifier_identifier)
23
+
24
+ # Labels dictionary ###
25
+ _lab_dict = {
26
+ 'NEGATIVE':'NO GHG TARGET',
27
+ 'NA':'NOT APPLICABLE',
28
+ 'TARGET':'GHG TARGET',
29
+ }
30
+
31
+
32
@st.cache_data
def to_excel(df):
    """
    Serialise `df` to an in-memory .xlsx workbook and return its bytes.

    The dataframe is written to 'Sheet1' without the index. Column E of every
    data row gets a drop-down validation with the choices
    'No' / 'Yes' / 'Discard'.
    """
    output = BytesIO()
    # The context manager closes (and thereby writes) the workbook;
    # ExcelWriter.save() is no longer available in pandas 2.x.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
        row_count = len(df)
        if row_count > 0:
            worksheet = writer.sheets['Sheet1']
            # Row 1 holds the header, so the data occupies rows
            # 2 .. row_count + 1; the range must include the last data row.
            worksheet.data_validation('E2:E{}'.format(row_count + 1),
                                      {'validate': 'list',
                                       'source': ['No', 'Yes', 'Discard']})
    return output.getvalue()
46
+
47
def app():
    """
    Run the GHG classification on the dataframe stored in session state.

    Does nothing unless 'key1' exists in st.session_state. Otherwise the
    classifier named by params['model_name'] is loaded and stored under
    '<classifier_identifier>_classifier', the dataframe is passed through
    ghg_classification, and the result is written back to 'key1'.
    """
    ### Main app code ###
    with st.container():
        if 'key1' not in st.session_state:
            return
        paragraphs = st.session_state.key1

        # Load the classifier model
        model = load_ghgClassifier(classifier_name=params['model_name'])
        st.session_state[f'{classifier_identifier}_classifier'] = model

        # NOTE: the message is built here but not displayed by this function.
        warning_msg = (
            ": This might take sometime, please sit back and relax."
            if sum(paragraphs['Target Label'] == 'TARGET') > 100
            else ""
        )

        classified = ghg_classification(haystack_doc=paragraphs,
                                        threshold=params['threshold'])
        st.session_state.key1 = classified
65
+
66
+
67
def netzero_display():
    """
    Show the GHG classification results stored in session state.

    Despite its name, this function renders the GHG results (it filters on
    the 'GHG Label' / 'GHG Score' columns). Nothing is shown unless 'key1'
    exists in st.session_state. With at least one paragraph labelled
    'TARGET', a horizontal bar chart of the label counts and the five
    highest-scoring paragraphs are displayed; otherwise an info message is
    shown.
    """
    if 'key1' not in st.session_state:
        return

    # Read the same key that is checked above and that app() writes its
    # result to; reading 'key2' here failed whenever only 'key1' was set.
    df = st.session_state.key1
    hits = df[df['GHG Label'] == 'TARGET']
    if len(hits) == 0:
        st.info("🤔 No GHG target found")
        return

    # Label counts with a human-readable description per label.
    count_df = df['GHG Label'].value_counts()
    count_df = count_df.rename('count')
    count_df = count_df.rename_axis('GHG Label').reset_index()
    count_df['Label_def'] = count_df['GHG Label'].apply(lambda x: _lab_dict[x])

    fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
    # Two equal columns; the chart only uses the first one.
    c1, _ = st.columns([1,1])
    with c1:
        st.plotly_chart(fig,use_container_width= True)

    hits = hits.sort_values(by=['GHG Score'], ascending=False)
    st.write("")
    st.markdown("###### Top few GHG Target Classified paragraph/text results ######")
    for i in range(min(5, len(hits))):
        row = hits.iloc[i]
        # the page number reflects the page that contains the main paragraph
        # according to split limit, the overlapping part can be on a separate page
        st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,row['page'],row['GHG Score']))
        st.write("\t Text: \t{}".format(row['text']))
94
+
95
+
appStore/info.py DELETED
@@ -1,67 +0,0 @@
1
- import streamlit as st
2
- import os
3
- from PIL import Image
4
- _ROOT = os.path.abspath(os.path.dirname(__file__))
5
- def get_data(path):
6
- return os.path.join(_ROOT, 'data', path)
7
-
8
- def app():
9
-
10
-
11
- with open('style.css') as f:
12
- st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
13
-
14
- st.markdown("<h2 style='text-align: center; \
15
- color: black;'> Climate Policy Understanding App</h2>",
16
- unsafe_allow_html=True)
17
-
18
-
19
- st.markdown("<div style='text-align: center; \
20
- color: grey;'>Climate Policy Understanding App is an open-source\
21
- digital tool which aims to assist policy analysts and \
22
- other users in extracting and filtering relevant \
23
- information from public documents.</div>",
24
- unsafe_allow_html=True)
25
- footer = """
26
- <div class="footer-custom">
27
- Guidance & Feedback - <a>Nadja Taeger</a> |<a>Marie Hertel</a> | <a>Cecile Schneider</a> |
28
- Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
29
- <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
30
-
31
- </div>
32
- """
33
- st.markdown(footer, unsafe_allow_html=True)
34
-
35
- c1, c2, c3 = st.columns([8,1,12])
36
- with c1:
37
- image = Image.open('docStore/img/ndc.png')
38
- st.image(image)
39
- with c3:
40
- st.markdown('<div style="text-align: justify;">The manual extraction \
41
- of relevant information from text documents is a \
42
- time-consuming task for any policy analysts. As the amount and length of \
43
- public policy documents in relation to sustainable development (such as \
44
- National Development Plans and Nationally Determined Contributions) \
45
- continuously increases, a major challenge for policy action tracking – the \
46
- evaluation of stated goals and targets and their actual implementation on \
47
- the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
48
- Language Processing (NLP) methods can help in shortening and easing this \
49
- task for policy analysts.</div><br>',
50
- unsafe_allow_html=True)
51
-
52
- intro = """
53
- <div style="text-align: justify;">
54
-
55
- For this purpose, IKI Tracs, SV KLIMA, SPA and Data Service Center (Deutsche Gesellschaft für Internationale \
56
- Zusammenarbeit (GIZ) GmbH) are collaborating since 2022 in the development \
57
- of an AI-powered open-source web application that helps find and extract \
58
- relevant information from public policy documents faster to facilitate \
59
- evidence-based decision-making processes in sustainable development and beyond.
60
-
61
-
62
- </div>
63
- <br>
64
- """
65
- st.markdown(intro, unsafe_allow_html=True)
66
- image2 = Image.open('docStore/img/paris.png')
67
- st.image(image2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
appStore/multiapp.py DELETED
@@ -1,67 +0,0 @@
1
- """Frameworks for running multiple Streamlit applications as a single app.
2
- """
3
- import streamlit as st
4
- from PIL import Image
5
- from utils.uploadAndExample import add_upload
6
-
7
- class MultiApp:
8
- """Framework for combining multiple streamlit applications.
9
- Usage:
10
- def foo():
11
- st.title("Hello Foo")
12
- def bar():
13
- st.title("Hello Bar")
14
- app = MultiApp()
15
- app.add_app("Foo", foo)
16
- app.add_app("Bar", bar)
17
- app.run()
18
- It is also possible keep each application in a separate file.
19
- import foo
20
- import bar
21
- app = MultiApp()
22
- app.add_app("Foo", foo.app)
23
- app.add_app("Bar", bar.app)
24
- app.run()
25
- """
26
- def __init__(self):
27
- self.apps = []
28
-
29
- def add_app(self,title,icon, func):
30
- """Adds a new application.
31
- Parameters
32
- ----------
33
- func:
34
- the python function to render this app.
35
- title:
36
- title of the app. Appears in the dropdown in the sidebar.
37
- """
38
- self.apps.append({
39
- "title": title,
40
- "icon": icon,
41
- "function": func
42
- })
43
-
44
- def run(self):
45
-
46
- st.sidebar.write(format_func=lambda app: app['title'])
47
- #image = Image.open('docStore/img/dsc_giz.png')
48
- #st.sidebar.image(image, width =200)
49
-
50
- with st.sidebar:
51
- selected = st.selectbox("Select the Task to perform", [page["title"] for page in self.apps],)
52
- st.markdown("---")
53
-
54
-
55
- for index, item in enumerate(self.apps):
56
- if item["title"] == selected:
57
- self.apps[index]["function"]()
58
- break
59
-
60
-
61
- choice = st.sidebar.radio(label = 'Select the Document',
62
- help = 'You can upload the document \
63
- or else you can try a example document',
64
- options = ('Upload Document', 'Try Example'),
65
- horizontal = True)
66
- add_upload(choice)
67
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
appStore/netzero.py CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
  import streamlit as st
11
- # from st_aggrid import AgGrid
12
- # from st_aggrid.shared import ColumnsAutoSizeMode
13
- from utils.netzero_classifier import netzero_classification
14
- from utils.netzero_classifier import runNetZeroPreprocessingPipeline, load_netzeroClassifier
15
- # from utils.keyword_extraction import textrank
16
  import logging
17
  logger = logging.getLogger(__name__)
18
  from utils.config import get_classifier_params
@@ -28,6 +24,7 @@ params = get_classifier_params(classifier_identifier)
28
  # Labels dictionary ###
29
  _lab_dict = {
30
  'NEGATIVE':'NO NETZERO TARGET',
 
31
  'NETZERO':'NETZERO TARGET',
32
  }
33
 
@@ -48,159 +45,51 @@ def to_excel(df):
48
  return processed_data
49
 
50
  def app():
51
-
52
- #### APP INFO #####
53
- with st.container():
54
- st.markdown("<h1 style='text-align: center; color: black;'> NetZero Target Extraction </h1>", unsafe_allow_html=True)
55
- st.write(' ')
56
- st.write(' ')
57
-
58
- with st.expander("ℹ️ - About this app", expanded=False):
59
-
60
- st.write(
61
- """
62
- The **NetZero Extraction** app is an easy-to-use interface built \
63
- in Streamlit for analyzing policy documents for \
64
- Classification of the paragraphs/texts in the document *If it \
65
- contains any Net-Zero target related information* - \
66
- developed by GIZ Data Service Center, GFA, IKI Tracs, \
67
- SV Klima and SPA. \n
68
- """)
69
- st.write("""**Document Processing:** The Uploaded/Selected document is \
70
- automatically cleaned and split into paragraphs with a maximum \
71
- length of 60 words using a Haystack preprocessing pipeline. The \
72
- length of 60 is an empirical value which should reflect the length \
73
- of a “context” and should limit the paragraph length deviation. \
74
- However, since we want to respect the sentence boundary the limit \
75
- can breach and hence this limit of 60 is tentative. \n
76
- """)
77
-
78
- st.write("")
79
-
80
  ### Main app code ###
81
  with st.container():
82
- if st.button("RUN NetZero Related Paragraph Extractions"):
83
- if 'key2' not in st.session_state:
84
- st.session_state['key2'] = None
85
 
86
- if 'filepath' in st.session_state:
87
- file_name = st.session_state['filename']
88
- file_path = st.session_state['filepath']
89
-
90
- # Do the preprocessing of the PDF
91
-
92
- all_documents = runNetZeroPreprocessingPipeline(file_name= file_name,
93
- file_path= file_path, split_by= params['split_by'],
94
- split_length= params['split_length'],
95
- split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
96
- split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
97
-
98
- # st.dataframe(all_documents['documents'])
99
-
100
  # Load the classifier model
101
-
102
  classifier = load_netzeroClassifier(classifier_name=params['model_name'])
103
  st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
104
 
105
- if len(all_documents['documents']) > 100:
106
  warning_msg = ": This might take sometime, please sit back and relax."
107
  else:
108
  warning_msg = ""
109
 
110
- # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
111
- # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
112
-
113
- df = netzero_classification(haystack_doc=all_documents['documents'],
114
  threshold= params['threshold'])
115
- st.session_state.key2 = df
116
- hits = df[df['Target Label'] == 'NETZERO']
117
- range_val = min(5,len(hits))
118
- if range_val !=0:
119
- count_df = df['Target Label'].value_counts()
120
- count_df = count_df.rename('count')
121
- count_df = count_df.rename_axis('Target Label').reset_index()
122
- count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
123
-
124
- fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
125
- c1, c2 = st.columns([1,1])
126
- with c1:
127
- st.plotly_chart(fig,use_container_width= True)
128
-
129
- hits = hits.sort_values(by=['Relevancy'], ascending=False)
130
- st.write("")
131
- st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
132
- range_val = min(5,len(hits))
133
- for i in range(range_val):
134
- # the page number reflects the page that contains the main paragraph
135
- # according to split limit, the overlapping part can be on a separate page
136
- st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
137
- st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
138
- else:
139
- st.info("🤔 No Netzero target found")
140
- df['Validation'] = 'No'
141
- df_xlsx = to_excel(df)
142
- st.download_button(label='📥 Download Current Result',
143
- data=df_xlsx ,
144
- file_name= 'file_target.xlsx')
145
-
146
-
147
- else:
148
- st.info("🤔 No document found, please try to upload it at the sidebar!")
149
- logging.warning("Terminated as no document provided")
150
-
151
- # # Creating truth value dataframe
152
- # if 'key2' in st.session_state:
153
- # if st.session_state.key2 is not None:
154
- # df = st.session_state.key2
155
- # st.markdown("###### Select the threshold for classifier ######")
156
- # c1, c2 = st.columns([1,1])
157
-
158
- # netzero_df = df[df['Target Label'] == 'NETZERO'].reset_index(drop = True)
159
- # if len(netzero_df) >0:
160
- # with c1:
161
- # threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
162
- # step=0.01, value=0.5,
163
- # help = "Keep High Value if want refined result, low if dont want to miss anything" )
164
-
165
- # # creating the dataframe for value counts of Labels, along with 'title' of Labels
166
- # temp = df[df['Relevancy']>threshold]
167
- # count_df = temp['Target Label'].value_counts()
168
- # count_df = count_df.rename('count')
169
- # count_df = count_df.rename_axis('Target Label').reset_index()
170
- # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
171
-
172
- # plt.rcParams['font.size'] = 25
173
- # colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
174
- # # plot
175
- # fig, ax = plt.subplots()
176
- # ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
177
- # wedgeprops={"linewidth": 1, "edgecolor": "white"},
178
- # textprops={'fontsize': 14},
179
- # frame=False,labels =list(count_df.Label_def),
180
- # labeldistance=1.2)
181
- # st.markdown("#### Anything related to NetZero Targets? ####")
182
-
183
- # c4, c5, c6 = st.columns([1,2,2])
184
-
185
- # with c5:
186
- # st.pyplot(fig)
187
- # with c6:
188
- # st.write(count_df[['Label_def','count']])
189
-
190
- # st.write("")
191
 
192
- # st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
193
-
194
- # st.dataframe(netzero_df.head())
195
- # else:
196
- # st.write("🤔 No Results found")
197
 
198
-
199
- # df['Validation'] = 'No'
200
- # df_xlsx = to_excel(df)
201
- # st.download_button(label='📥 Download Current Result',
202
- # data=df_xlsx ,
203
- # file_name= 'file_netzero.xlsx')
204
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
 
 
8
  import numpy as np
9
  import pandas as pd
10
  import streamlit as st
11
+ from utils.netzero_classifier import load_netzeroClassifier, netzero_classification
 
 
 
 
12
  import logging
13
  logger = logging.getLogger(__name__)
14
  from utils.config import get_classifier_params
 
24
  # Labels dictionary ###
25
  _lab_dict = {
26
  'NEGATIVE':'NO NETZERO TARGET',
27
+ 'NA':'NOT APPLICABLE',
28
  'NETZERO':'NETZERO TARGET',
29
  }
30
 
 
45
  return processed_data
46
 
47
def app():
    """
    Run the NetZero classification on the dataframe stored in session state.

    Does nothing unless 'key1' exists in st.session_state. Otherwise the
    classifier named by params['model_name'] is loaded and stored under
    '<classifier_identifier>_classifier', the dataframe is passed through
    netzero_classification, and the result is written back to 'key1'.
    """
    ### Main app code ###
    with st.container():
        if 'key1' not in st.session_state:
            return
        paragraphs = st.session_state.key1

        # Load the classifier model
        model = load_netzeroClassifier(classifier_name=params['model_name'])
        st.session_state[f'{classifier_identifier}_classifier'] = model

        # NOTE: the message is built here but not displayed by this function.
        warning_msg = (
            ": This might take sometime, please sit back and relax."
            if sum(paragraphs['Target Label'] == 'TARGET') > 100
            else ""
        )

        classified = netzero_classification(haystack_doc=paragraphs,
                                            threshold=params['threshold'])
        st.session_state.key1 = classified
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
 
 
 
 
 
66
 
67
def netzero_display():
    """
    Show the NetZero classification results stored in session state.

    Nothing is shown unless 'key1' exists in st.session_state. With at least
    one paragraph labelled 'NETZERO', a horizontal bar chart of the label
    counts and the five highest-scoring paragraphs are displayed; otherwise
    an info message is shown.
    """
    if 'key1' not in st.session_state:
        return

    # Read the same key that is checked above and that app() writes its
    # result to; reading 'key2' here failed whenever only 'key1' was set.
    df = st.session_state.key1
    hits = df[df['Netzero Label'] == 'NETZERO']
    if len(hits) == 0:
        st.info("🤔 No Netzero target found")
        return

    # Label counts with a human-readable description per label.
    count_df = df['Netzero Label'].value_counts()
    count_df = count_df.rename('count')
    count_df = count_df.rename_axis('Netzero Label').reset_index()
    count_df['Label_def'] = count_df['Netzero Label'].apply(lambda x: _lab_dict[x])

    fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
    # Two equal columns; the chart only uses the first one.
    c1, _ = st.columns([1,1])
    with c1:
        st.plotly_chart(fig,use_container_width= True)

    hits = hits.sort_values(by=['Netzero Score'], ascending=False)
    st.write("")
    st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
    for i in range(min(5, len(hits))):
        row = hits.iloc[i]
        # the page number reflects the page that contains the main paragraph
        # according to split limit, the overlapping part can be on a separate page
        st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,row['page'],row['Netzero Score']))
        st.write("\t Text: \t{}".format(row['text']))
94
 
95
 
appStore/sector.py CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
  import streamlit as st
11
- # from st_aggrid import AgGrid
12
- # from st_aggrid.shared import ColumnsAutoSizeMode
13
- from utils.sector_classifier import sector_classification
14
- from utils.sector_classifier import runSectorPreprocessingPipeline, load_sectorClassifier
15
- # from utils.keyword_extraction import textrank
16
  import logging
17
  logger = logging.getLogger(__name__)
18
  from utils.config import get_classifier_params
@@ -58,107 +54,68 @@ def to_excel(df,sectorlist):
58
 
59
  def app():
60
 
61
- #### APP INFO #####
62
- with st.container():
63
- st.markdown("<h1 style='text-align: center; color: black;'> Sector Classification </h1>", unsafe_allow_html=True)
64
- st.write(' ')
65
- st.write(' ')
66
-
67
- with st.expander("ℹ️ - About this app", expanded=False):
68
-
69
- st.write(
70
- """
71
- The **Sector Classification** app is an easy-to-use interface built \
72
- in Streamlit for analyzing policy documents for \
73
- Classification of the paragraphs/texts in the document *If it \
74
- belongs to particular sector or not*. The paragraph can belong to multiple sectors - \
75
- developed by GIZ Data Service Center, GFA, IKI Tracs, \
76
- SV Klima and SPA. \n
77
- """)
78
- st.write("""**Document Processing:** The Uploaded/Selected document is \
79
- automatically cleaned and split into paragraphs with a maximum \
80
- length of 60 words using a Haystack preprocessing pipeline. The \
81
- length of 60 is an empirical value which should reflect the length \
82
- of a “context” and should limit the paragraph length deviation. \
83
- However, since we want to respect the sentence boundary the limit \
84
- can breach and hence this limit of 60 is tentative. \n
85
- """)
86
-
87
- st.write("")
88
-
89
  ### Main app code ###
90
  with st.container():
91
- if st.button("RUN Sector Classification"):
92
- if 'key' not in st.session_state:
93
- st.session_state['key'] = None
94
 
95
- if 'filepath' in st.session_state:
96
- file_name = st.session_state['filename']
97
- file_path = st.session_state['filepath']
98
-
99
-
100
- all_documents = runSectorPreprocessingPipeline(file_name= file_name,
101
- file_path= file_path, split_by= params['split_by'],
102
- split_length= params['split_length'],
103
- split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
104
- split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
105
- # st.write(all_documents['documents'])
106
  classifier = load_sectorClassifier(classifier_name=params['model_name'])
107
  st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
108
- verified_paralist = paraLengthCheck(all_documents['paraList'], 100)
109
- if len(verified_paralist) > 100:
110
  warning_msg = ": This might take sometime, please sit back and relax."
111
  else:
112
  warning_msg = ""
113
-
114
- # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
115
- # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
116
 
117
- df = sector_classification(haystack_doc=verified_paralist,
118
  threshold= params['threshold'])
119
- # st.write(df)
120
- threshold= params['threshold']
121
- truth_df = df.drop(['text'],axis=1)
122
- truth_df = truth_df.astype(float) >= threshold
123
- truth_df = truth_df.astype(str)
124
- categories = list(truth_df.columns)
125
-
126
- placeholder = {}
127
- for val in categories:
128
- placeholder[val] = dict(truth_df[val].value_counts())
129
- count_df = pd.DataFrame.from_dict(placeholder)
130
- count_df = count_df.T
131
- count_df = count_df.reset_index()
132
- # st.write(count_df)
133
- placeholder = []
134
- for i in range(len(count_df)):
135
- placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
136
- placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
137
- count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
138
- # st.write("Total Paragraphs: {}".format(len(df)))
139
- fig = px.bar(count_df, x='category', y='count',
140
- color='truth_value')
141
- # c1, c2 = st.columns([1,1])
142
- # with c1:
143
- st.plotly_chart(fig,use_container_width= True)
144
-
145
- truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
146
- truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
147
- # st.write(truth_df)
148
- df = pd.concat([df,truth_df['labels']],axis=1)
149
- df['Validation'] = 'No'
150
- df['Sector1'] = 'Blank'
151
- df['Sector2'] = 'Blank'
152
- df['Sector3'] = 'Blank'
153
- df['Sector4'] = 'Blank'
154
- df['Sector5'] = 'Blank'
155
- df_xlsx = to_excel(df,categories)
156
- st.download_button(label='📥 Download Current Result',
157
- data=df_xlsx ,
158
- file_name= 'file_sector.xlsx')
159
- else:
160
- st.info("🤔 No document found, please try to upload it at the sidebar!")
161
- logging.warning("Terminated as no document provided")
 
 
 
 
162
 
163
  # # Creating truth value dataframe
164
  # if 'key' in st.session_state:
 
8
  import numpy as np
9
  import pandas as pd
10
  import streamlit as st
11
+ from utils.sector_classifier import load_sectorClassifier, sector_classification
 
 
 
 
12
  import logging
13
  logger = logging.getLogger(__name__)
14
  from utils.config import get_classifier_params
 
54
 
55
  def app():
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  ### Main app code ###
58
  with st.container():
 
 
 
59
 
60
+ if 'key1' in st.session_state:
61
+ df = st.session_state.key1
 
 
 
 
 
 
 
 
 
62
  classifier = load_sectorClassifier(classifier_name=params['model_name'])
63
  st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
64
+
65
+ if sum(df['Target Label'] == 'TARGET') > 100:
66
  warning_msg = ": This might take sometime, please sit back and relax."
67
  else:
68
  warning_msg = ""
 
 
 
69
 
70
+ df = sector_classification(haystack_doc=df,
71
  threshold= params['threshold'])
72
+
73
+ st.session_state.key1 = df
74
+
75
+
76
+ # # st.write(df)
77
+ # threshold= params['threshold']
78
+ # truth_df = df.drop(['text'],axis=1)
79
+ # truth_df = truth_df.astype(float) >= threshold
80
+ # truth_df = truth_df.astype(str)
81
+ # categories = list(truth_df.columns)
82
+
83
+ # placeholder = {}
84
+ # for val in categories:
85
+ # placeholder[val] = dict(truth_df[val].value_counts())
86
+ # count_df = pd.DataFrame.from_dict(placeholder)
87
+ # count_df = count_df.T
88
+ # count_df = count_df.reset_index()
89
+ # # st.write(count_df)
90
+ # placeholder = []
91
+ # for i in range(len(count_df)):
92
+ # placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
93
+ # placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
94
+ # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
95
+ # # st.write("Total Paragraphs: {}".format(len(df)))
96
+ # fig = px.bar(count_df, x='category', y='count',
97
+ # color='truth_value')
98
+ # # c1, c2 = st.columns([1,1])
99
+ # # with c1:
100
+ # st.plotly_chart(fig,use_container_width= True)
101
+
102
+ # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
103
+ # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
104
+ # # st.write(truth_df)
105
+ # df = pd.concat([df,truth_df['labels']],axis=1)
106
+ # df['Validation'] = 'No'
107
+ # df['Sector1'] = 'Blank'
108
+ # df['Sector2'] = 'Blank'
109
+ # df['Sector3'] = 'Blank'
110
+ # df['Sector4'] = 'Blank'
111
+ # df['Sector5'] = 'Blank'
112
+ # df_xlsx = to_excel(df,categories)
113
+ # st.download_button(label='📥 Download Current Result',
114
+ # data=df_xlsx ,
115
+ # # file_name= 'file_sector.xlsx')
116
+ # else:
117
+ # st.info("🤔 No document found, please try to upload it at the sidebar!")
118
+ # logging.warning("Terminated as no document provided")
119
 
120
  # # Creating truth value dataframe
121
  # if 'key' in st.session_state:
appStore/target.py CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
8
  import numpy as np
9
  import pandas as pd
10
  import streamlit as st
11
- # from st_aggrid import AgGrid
12
- # from st_aggrid.shared import ColumnsAutoSizeMode
13
- from utils.target_classifier import target_classification
14
- from utils.target_classifier import runTargetPreprocessingPipeline, load_targetClassifier
15
- # from utils.keyword_extraction import textrank
16
  import logging
17
  logger = logging.getLogger(__name__)
18
  from utils.config import get_classifier_params
@@ -26,8 +22,8 @@ params = get_classifier_params(classifier_identifier)
26
 
27
  ## Labels dictionary ###
28
  _lab_dict = {
29
- 'LABEL_0':'NO TARGET INFO',
30
- 'LABEL_1':'ECONOMY-WIDE TARGET',
31
  }
32
 
33
  @st.cache_data
@@ -48,164 +44,68 @@ def to_excel(df):
48
  def app():
49
 
50
  #### APP INFO #####
51
- with st.container():
52
- st.markdown("<h1 style='text-align: center; color: black;'> Targets Extraction </h1>", unsafe_allow_html=True)
53
- st.write(' ')
54
- st.write(' ')
55
-
56
- with st.expander("ℹ️ - About this app", expanded=False):
 
 
 
57
 
58
- st.write(
59
- """
60
- The **Target Extraction** app is an easy-to-use interface built \
61
- in Streamlit for analyzing policy documents for \
62
- Classification of the paragraphs/texts in the document *If it \
63
- contains any Economy-Wide Targets related information* - \
64
- developed by GIZ Data Service Center, GFA, IKI Tracs, \
65
- SV Klima and SPA. \n
66
- """)
67
- st.write("""**Document Processing:** The Uploaded/Selected document is \
68
- automatically cleaned and split into paragraphs with a maximum \
69
- length of 60 words using a Haystack preprocessing pipeline. The \
70
- length of 60 is an empirical value which should reflect the length \
71
- of a “context” and should limit the paragraph length deviation. \
72
- However, since we want to respect the sentence boundary the limit \
73
- can breach and hence this limit of 60 is tentative. \n
74
- """)
75
-
76
- st.write("")
77
 
78
  ### Main app code ###
79
  with st.container():
80
- if st.button("RUN Target Related Paragraph Extractions"):
81
- if 'key1' not in st.session_state:
82
- st.session_state['key1'] = None
83
-
84
- if 'filepath' in st.session_state:
85
- file_name = st.session_state['filename']
86
- file_path = st.session_state['filepath']
87
-
88
-
89
- all_documents = runTargetPreprocessingPipeline(file_name= file_name,
90
- file_path= file_path, split_by= params['split_by'],
91
- split_length= params['split_length'],
92
- split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
93
- split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
94
- # st.write(all_documents['documents'])
95
-
96
- #load Classifier
97
- classifier = load_targetClassifier(classifier_name=params['model_name'])
98
- st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
99
- if len(all_documents['documents']) > 100:
100
- warning_msg = ": This might take sometime, please sit back and relax."
101
- else:
102
- warning_msg = ""
103
-
104
- # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
105
- # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
106
-
107
- df = target_classification(haystack_doc=all_documents['documents'],
108
- threshold= params['threshold'])
109
- st.session_state.key1 = df
110
- # temp = df[df['Relevancy']>threshold]
111
- hits = df[df['Target Label'] == 'LABEL_1']
112
- range_val = min(5,len(hits))
113
- if range_val !=0:
114
- count_df = df['Target Label'].value_counts()
115
- count_df = count_df.rename('count')
116
- count_df = count_df.rename_axis('Target Label').reset_index()
117
- count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
118
-
119
- fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
120
- c1, c2 = st.columns([1,1])
121
- with c1:
122
- st.plotly_chart(fig,use_container_width= True)
123
-
124
- hits = hits.sort_values(by=['Relevancy'], ascending=False)
125
- st.write("")
126
- st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
127
- range_val = min(5,len(hits))
128
- for i in range(range_val):
129
- # the page number reflects the page that contains the main paragraph
130
- # according to split limit, the overlapping part can be on a separate page
131
- st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
132
- st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
133
-
134
- else:
135
- st.info("🤔 No Economy Wide Target found")
136
- df['Validation'] = 'No'
137
- df_xlsx = to_excel(df)
138
- st.download_button(label='📥 Download Current Result',
139
- data=df_xlsx ,
140
- file_name= 'file_target.xlsx')
141
-
142
-
143
  else:
144
- st.info("🤔 No document found, please try to upload it at the sidebar!")
145
- logging.warning("Terminated as no document provided")
146
-
147
-
148
-
149
-
150
-
151
-
152
-
153
-
154
-
155
-
156
-
157
-
158
-
159
-
160
-
161
- # # Creating truth value dataframe
162
- # if 'key1' in st.session_state:
163
- # if st.session_state.key1 is not None:
164
- # df = st.session_state.key1
165
- # st.markdown("###### Select the threshold for classifier ######")
166
- # c1, c2 = st.columns([1,1])
167
-
168
- # with c1:
169
- # threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
170
- # step=0.01, value=0.5,
171
- # help = "Keep High Value if want refined result, low if dont want to miss anything" )
172
- # sectors =set(df.columns)
173
- # removecols = {'Validation','Sectors','text'}
174
- # sectors = list(sectors - removecols)
175
-
176
- # # creating the dataframe for value counts of Labels, along with 'title' of Labels
177
- # temp = df[df['Relevancy']>threshold]
178
- # count_df = temp['Target Label'].value_counts()
179
- # count_df = count_df.rename('count')
180
- # count_df = count_df.rename_axis('Target Label').reset_index()
181
- # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
182
-
183
- # plt.rcParams['font.size'] = 25
184
- # colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
185
- # # plot
186
- # fig, ax = plt.subplots()
187
- # ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
188
- # wedgeprops={"linewidth": 1, "edgecolor": "white"},
189
- # textprops={'fontsize': 14},
190
- # frame=False,labels =list(count_df.Label_def),
191
- # labeldistance=1.2)
192
- # st.markdown("#### Anything related to Targets? ####")
193
-
194
- # c4, c5, c6 = st.columns([1,2,2])
195
-
196
- # with c5:
197
- # st.pyplot(fig)
198
- # with c6:
199
- # st.write(count_df[['Label_def','count']])
200
 
201
- # st.write("")
202
- # st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
203
- # st.dataframe(df[df['Target Label'] == 'LABEL_1'].reset_index(drop = True))
204
-
205
- # df['Validation'] = 'No'
206
- # df_xlsx = to_excel(df)
207
- # st.download_button(label='📥 Download Current Result',
208
- # data=df_xlsx ,
209
- # file_name= 'file_target.xlsx')
210
-
211
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import numpy as np
9
  import pandas as pd
10
  import streamlit as st
11
+ from utils.target_classifier import load_targetClassifier, target_classification
 
 
 
 
12
  import logging
13
  logger = logging.getLogger(__name__)
14
  from utils.config import get_classifier_params
 
22
 
23
  ## Labels dictionary ###
24
  _lab_dict = {
25
+ 'NEGATIVE':'NO TARGET INFO',
26
+ 'TARGET':'TARGET',
27
  }
28
 
29
  @st.cache_data
 
44
  def app():
45
 
46
  #### APP INFO #####
47
+ # st.write(
48
+ # """
49
+ # The **Target Extraction** app is an easy-to-use interface built \
50
+ # in Streamlit for analyzing policy documents for \
51
+ # Classification of the paragraphs/texts in the document *If it \
52
+ # contains any Economy-Wide Targets related information* - \
53
+ # developed by GIZ Data Service Center, GFA, IKI Tracs, \
54
+ # SV Klima and SPA. \n
55
+ # """)
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  ### Main app code ###
59
  with st.container():
60
+ if 'key0' in st.session_state:
61
+ df = st.session_state.key0
62
+
63
+ #load Classifier
64
+ classifier = load_targetClassifier(classifier_name=params['model_name'])
65
+ st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
66
+ if len(df) > 100:
67
+ warning_msg = ": This might take sometime, please sit back and relax."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  else:
69
+ warning_msg = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ df = target_classification(haystack_doc=df,
72
+ threshold= params['threshold'])
73
+ st.session_state.key1 = df
74
+
75
+ # # excel part
76
+ # temp = df[df['Relevancy']>threshold]
77
+
78
+ # df['Validation'] = 'No'
79
+ # df_xlsx = to_excel(df)
80
+ # st.download_button(label='📥 Download Current Result',
81
+ # data=df_xlsx ,
82
+ # file_name= 'file_target.xlsx')
83
+
84
def target_display():
    """
    Show the target-classification results held in the Streamlit session.

    Nothing is drawn unless 'key1' exists in ``st.session_state``. If rows
    labelled 'TARGET' exist, a horizontal bar chart with the count of each
    'Target Label' is drawn and up to five of those rows are listed by
    descending 'Relevancy'; if none exist an info box is shown instead.
    """
    if 'key1' not in st.session_state:
        return

    df = st.session_state.key1
    hits = df[df['Target Label'] == 'TARGET']
    if min(5,len(hits)) == 0:
        st.info("🤔 No Targets found")
        return

    # label counts together with the human readable label definition
    label_counts = df['Target Label'].value_counts().rename('count')
    count_df = label_counts.rename_axis('Target Label').reset_index()
    count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])

    fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
    chart_col, _unused_col = st.columns([1,1])
    with chart_col:
        st.plotly_chart(fig,use_container_width= True)

    hits = hits.sort_values(by=['Relevancy'], ascending=False)
    st.write("")
    st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
    for position in range(min(5,len(hits))):
        row = hits.iloc[position]
        # the page number reflects the page that contains the main paragraph
        # according to split limit, the overlapping part can be on a separate page
        st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(
                    position+1, row['page'], row['Relevancy']))
        st.write("\t Text: \t{}".format(row['text'].replace("\n", " ")))
paramconfig.cfg CHANGED
@@ -1,6 +1,16 @@
 
 
 
 
 
 
 
 
 
 
1
  [target]
2
  THRESHOLD = 0.50
3
- MODEL = mtyrrell/ikitracs_economywide
4
  SPLIT_BY = word
5
  REMOVE_PUNC = 0
6
  SPLIT_LENGTH = 60
@@ -36,4 +46,14 @@ REMOVE_PUNC = 0
36
  SPLIT_LENGTH = 60
37
  SPLIT_OVERLAP = 10
38
  RESPECT_SENTENCE_BOUNDARY = 1
 
 
 
 
 
 
 
 
 
 
39
  TOP_KEY = 10
 
1
+ [preprocessing]
2
+ THRESHOLD = 0.50
3
+ MODEL = garbage
4
+ SPLIT_BY = word
5
+ REMOVE_PUNC = 0
6
+ SPLIT_LENGTH = 60
7
+ SPLIT_OVERLAP = 10
8
+ RESPECT_SENTENCE_BOUNDARY = 1
9
+ TOP_KEY = 10
10
+
11
  [target]
12
  THRESHOLD = 0.50
13
+ MODEL = mtyrrell/ikitracs_target_mpnet
14
  SPLIT_BY = word
15
  REMOVE_PUNC = 0
16
  SPLIT_LENGTH = 60
 
46
  SPLIT_LENGTH = 60
47
  SPLIT_OVERLAP = 10
48
  RESPECT_SENTENCE_BOUNDARY = 1
49
+ TOP_KEY = 10
50
+
51
+ [ghg]
52
+ THRESHOLD = 0.50
53
+ MODEL = mtyrrell/ikitracs_transport_ghg
54
+ SPLIT_BY = word
55
+ REMOVE_PUNC = 0
56
+ SPLIT_LENGTH = 60
57
+ SPLIT_OVERLAP = 10
58
+ RESPECT_SENTENCE_BOUNDARY = 1
59
  TOP_KEY = 10
utils/adapmit_classifier.py CHANGED
@@ -34,10 +34,6 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
34
  classifier_name = config.get('adapmit','MODEL')
35
 
36
  logging.info("Loading Adaptation Mitigation classifier")
37
- # doc_classifier = TransformersDocumentClassifier(
38
- # model_name_or_path=classifier_name,
39
- # task="text-classification",
40
- # top_k = None)
41
  doc_classifier = pipeline("text-classification",
42
  model=classifier_name,
43
  return_all_scores=True,
@@ -47,51 +43,8 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
47
  return doc_classifier
48
 
49
 
50
- def runAdapMitPreprocessingPipeline(file_name:str, file_path:str,
51
- split_by: Literal["sentence", "word"] = 'sentence',
52
- split_length:int = 2, split_respect_sentence_boundary:bool = False,
53
- split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
54
- """
55
- creates the pipeline and runs the preprocessing pipeline,
56
- the params for pipeline are fetched from paramconfig
57
- Params
58
- ------------
59
- file_name: filename, in case of streamlit application use
60
- st.session_state['filename']
61
- file_path: filepath, in case of streamlit application use st.session_state['filepath']
62
- split_by: document splitting strategy either as word or sentence
63
- split_length: when synthetically creating the paragrpahs from document,
64
- it defines the length of paragraph.
65
- split_respect_sentence_boundary: Used when using 'word' strategy for
66
- splititng of text.
67
- split_overlap: Number of words or sentences that overlap when creating
68
- the paragraphs. This is done as one sentence or 'some words' make sense
69
- when read in together with others. Therefore the overlap is used.
70
- remove_punc: to remove all Punctuation including ',' and '.' or not
71
- Return
72
- --------------
73
- List[Document]: When preprocessing pipeline is run, the output dictionary
74
- has four objects. For the Haysatck implementation of SDG classification we,
75
- need to use the List of Haystack Document, which can be fetched by
76
- key = 'documents' on output.
77
- """
78
-
79
- adapmit_processing_pipeline = processingpipeline()
80
-
81
- output_adapmit_pre = adapmit_processing_pipeline.run(file_paths = file_path,
82
- params= {"FileConverter": {"file_path": file_path, \
83
- "file_name": file_name},
84
- "UdfPreProcessor": {"remove_punc": remove_punc, \
85
- "split_by": split_by, \
86
- "split_length":split_length,\
87
- "split_overlap": split_overlap, \
88
- "split_respect_sentence_boundary":split_respect_sentence_boundary}})
89
-
90
- return output_adapmit_pre
91
-
92
-
93
  @st.cache_data
94
- def adapmit_classification(haystack_doc:List[Document],
95
  threshold:float = 0.5,
96
  classifier_model:pipeline= None
97
  )->Tuple[DataFrame,Series]:
@@ -115,10 +68,14 @@ def adapmit_classification(haystack_doc:List[Document],
115
  the number of times it is covered/discussed/count_of_paragraphs.
116
  """
117
  logging.info("Working on Adaptation-Mitigation Identification")
 
 
 
 
118
  if not classifier_model:
119
  classifier_model = st.session_state['adapmit_classifier']
120
 
121
- predictions = classifier_model(haystack_doc)
122
  # converting the predictions to desired format
123
  list_ = []
124
  for i in range(len(predictions)):
@@ -128,9 +85,17 @@ def adapmit_classification(haystack_doc:List[Document],
128
  for j in range(len(temp)):
129
  placeholder[temp[j]['label']] = temp[j]['score']
130
  list_.append(placeholder)
131
- labels_ = [{**{'text':haystack_doc[l]},**list_[l]} for l in range(len(predictions))]
132
- # labels_= [{**l.meta['classification']['details'],**{'text':l.content}} for l in results]
133
- df = DataFrame.from_dict(labels_)
134
- df = df.round(2)
 
 
 
 
 
 
 
 
135
 
136
  return df
 
34
  classifier_name = config.get('adapmit','MODEL')
35
 
36
  logging.info("Loading Adaptation Mitigation classifier")
 
 
 
 
37
  doc_classifier = pipeline("text-classification",
38
  model=classifier_name,
39
  return_all_scores=True,
 
43
  return doc_classifier
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  @st.cache_data
47
+ def adapmit_classification(haystack_doc:pd.DataFrame,
48
  threshold:float = 0.5,
49
  classifier_model:pipeline= None
50
  )->Tuple[DataFrame,Series]:
 
68
  the number of times it is covered/discussed/count_of_paragraphs.
69
  """
70
  logging.info("Working on Adaptation-Mitigation Identification")
71
+ haystack_doc['Adapt-Mitig Label'] = 'NA'
72
+ df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
73
+ df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
74
+
75
  if not classifier_model:
76
  classifier_model = st.session_state['adapmit_classifier']
77
 
78
+ predictions = classifier_model(list(df1.text))
79
  # converting the predictions to desired format
80
  list_ = []
81
  for i in range(len(predictions)):
 
85
  for j in range(len(temp)):
86
  placeholder[temp[j]['label']] = temp[j]['score']
87
  list_.append(placeholder)
88
+ labels_ = [{**list_[l]} for l in range(len(predictions))]
89
+ truth_df = DataFrame.from_dict(labels_)
90
+ truth_df = truth_df.round(2)
91
+ truth_df = truth_df.astype(float) >= threshold
92
+ truth_df = truth_df.astype(str)
93
+ categories = list(truth_df.columns)
94
+ truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
95
+ else None for i in categories}, axis=1)
96
+ truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
97
+ list(x['Adapt-Mitig Label'] -{None}),axis=1)
98
+ df1['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
99
+ df = pd.concat([df,df1])
100
 
101
  return df
utils/ghg_classifier.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.nodes import TransformersDocumentClassifier
2
+ from haystack.schema import Document
3
+ from typing import List, Tuple
4
+ from typing_extensions import Literal
5
+ import logging
6
+ import pandas as pd
7
+ from pandas import DataFrame, Series
8
+ from utils.config import getconfig
9
+ from utils.preprocessing import processingpipeline
10
+ import streamlit as st
11
+ from transformers import pipeline
12
+
13
+ # Labels dictionary ###
14
+ _lab_dict = {
15
+ 'NEGATIVE':'NO GHG TARGET',
16
+ 'TARGET':'GHG TARGET',
17
+ }
18
+
19
@st.cache_resource
def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
    """
    Build the GHG text-classification pipeline.

    Params
    --------
    config_file: path of the config file; only consulted when no
        classifier_name is given, in which case the model name is read from
        the 'MODEL' option of its 'ghg' section.
    classifier_name: model name/path handed to transformers ``pipeline``;
        takes priority over config_file.

    Return: the "text-classification" pipeline created with top_k = 1, or
        None (after logging a warning) when neither argument is provided.
    """
    if not classifier_name and not config_file:
        logging.warning("Pass either model name or config file")
        return None

    if not classifier_name:
        # no explicit model name: look it up in the config file
        classifier_name = getconfig(config_file).get('ghg','MODEL')

    logging.info("Loading ghg classifier")
    return pipeline("text-classification", model=classifier_name, top_k =1)
48
+
49
+
50
@st.cache_data
def ghg_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.5,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Runs the GHG classifier on the paragraphs already labelled as 'TARGET'
    and returns the paragraphs with two extra columns.

    Params
    ---------
    haystack_doc: DataFrame of paragraphs. The code reads its 'Target Label'
        and 'text' columns. The passed frame itself is left untouched, the
        function works on a copy.
    threshold: accepted for signature compatibility with the other
        classifiers; it is currently not applied to the scores.
    classifiermodel: you can pass the classifier model directly,which takes
        priority, however if not then looks for the model in streamlit
        session under the key 'ghg_classifier'.

    Returns
    ----------
    df: DataFrame holding the rows whose 'Target Label' is 'NEGATIVE'
        (with 'GHG Label' and 'GHG Score' set to 'NA') followed by the rows
        whose 'Target Label' is 'TARGET' (with the predicted label and
        score). Rows carrying any other 'Target Label' are not returned.
        The index is reset and starts at 1.
    """
    logging.info("Working on GHG Extraction")
    # copy first: adding columns must not leak into the caller's DataFrame
    haystack_doc = haystack_doc.copy()
    haystack_doc['GHG Label'] = 'NA'
    haystack_doc['GHG Score'] = 'NA'
    # explicit copy so the column assignments below act on an independent
    # frame and not on a slice of haystack_doc
    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET'].copy()
    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']

    # without any TARGET paragraph there is nothing to classify; unpacking
    # zip(*[]) into two columns would raise ValueError
    if not temp.empty:
        if not classifier_model:
            classifier_model = st.session_state['ghg_classifier']

        results = classifier_model(list(temp.text))
        # each result is indexed as a list whose first entry is the top
        # prediction (a dict with 'label' and 'score')
        temp['GHG Label'] = [l[0]['label'] for l in results]
        temp['GHG Score'] = [l[0]['score'] for l in results]

    df = pd.concat([df,temp])
    df = df.reset_index(drop =True)
    df.index += 1

    return df
utils/netzero_classifier.py CHANGED
@@ -8,6 +8,7 @@ from pandas import DataFrame, Series
8
  from utils.config import getconfig
9
  from utils.preprocessing import processingpipeline
10
  import streamlit as st
 
11
 
12
  # Labels dictionary ###
13
  _lab_dict = {
@@ -39,60 +40,17 @@ def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
39
  classifier_name = config.get('netzero','MODEL')
40
 
41
  logging.info("Loading netzero classifier")
42
- doc_classifier = TransformersDocumentClassifier(
43
- model_name_or_path=classifier_name,
44
- task="text-classification")
45
 
46
  return doc_classifier
47
 
48
 
49
- def runNetZeroPreprocessingPipeline(file_name:str, file_path:str,
50
- split_by: Literal["sentence", "word"] = 'sentence',
51
- split_length:int = 2, split_respect_sentence_boundary:bool = False,
52
- split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
53
- """
54
- creates the pipeline and runs the preprocessing pipeline,
55
- the params for pipeline are fetched from paramconfig
56
- Params
57
- ------------
58
- file_name: filename, in case of streamlit application use
59
- st.session_state['filename']
60
- file_path: filepath, in case of streamlit application use st.session_state['filepath']
61
- split_by: document splitting strategy either as word or sentence
62
- split_length: when synthetically creating the paragrpahs from document,
63
- it defines the length of paragraph.
64
- split_respect_sentence_boundary: Used when using 'word' strategy for
65
- splititng of text.
66
- split_overlap: Number of words or sentences that overlap when creating
67
- the paragraphs. This is done as one sentence or 'some words' make sense
68
- when read in together with others. Therefore the overlap is used.
69
- remove_punc: to remove all Punctuation including ',' and '.' or not
70
- Return
71
- --------------
72
- List[Document]: When preprocessing pipeline is run, the output dictionary
73
- has four objects. For the Haysatck implementation of SDG classification we,
74
- need to use the List of Haystack Document, which can be fetched by
75
- key = 'documents' on output.
76
- """
77
-
78
- netzero_processing_pipeline = processingpipeline()
79
-
80
- output_netzero_pre = netzero_processing_pipeline.run(file_paths = file_path,
81
- params= {"FileConverter": {"file_path": file_path, \
82
- "file_name": file_name},
83
- "UdfPreProcessor": {"remove_punc": remove_punc, \
84
- "split_by": split_by, \
85
- "split_length":split_length,\
86
- "split_overlap": split_overlap, \
87
- "split_respect_sentence_boundary":split_respect_sentence_boundary}})
88
-
89
- return output_netzero_pre
90
-
91
-
92
  @st.cache_data
93
- def netzero_classification(haystack_doc:List[Document],
94
  threshold:float = 0.8,
95
- classifier_model:TransformersDocumentClassifier= None
96
  )->Tuple[DataFrame,Series]:
97
  """
98
  Text-Classification on the list of texts provided. Classifier provides the
@@ -114,24 +72,19 @@ def netzero_classification(haystack_doc:List[Document],
114
  the number of times it is covered/discussed/count_of_paragraphs.
115
  """
116
  logging.info("Working on Netzero Extraction")
 
 
 
 
 
117
  if not classifier_model:
118
  classifier_model = st.session_state['netzero_classifier']
119
 
120
- results = classifier_model.predict(haystack_doc)
121
- labels_= [(l.meta['classification']['label'],
122
- l.meta['classification']['score'],l.meta['page'],l.content,) for l in results]
123
-
124
- df = DataFrame(labels_, columns=["Target Label","Relevancy", "page","text"])
125
-
126
- df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
127
  df.index += 1
128
- # df =df[df['Relevancy']>threshold]
129
- df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
130
-
131
- # creating the dataframe for value counts of Labels, along with 'title' of Labels
132
- # count_df = df['Target Label'].value_counts()
133
- # count_df = count_df.rename('count')
134
- # count_df = count_df.rename_axis('Target Label').reset_index()
135
- # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
136
 
137
  return df
 
8
  from utils.config import getconfig
9
  from utils.preprocessing import processingpipeline
10
  import streamlit as st
11
+ from transformers import pipeline
12
 
13
  # Labels dictionary ###
14
  _lab_dict = {
 
40
  classifier_name = config.get('netzero','MODEL')
41
 
42
  logging.info("Loading netzero classifier")
43
+ doc_classifier = pipeline("text-classification",
44
+ model=classifier_name,
45
+ top_k =1)
46
 
47
  return doc_classifier
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  @st.cache_data
51
def netzero_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.8,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Run the netzero classifier on the paragraphs already labelled 'TARGET'.

    Params
    ---------
    haystack_doc: DataFrame with one row per paragraph. The columns 'text'
        and 'Target Label' are read. The frame itself is not modified, the
        work is done on a copy.
    threshold: not used by this function, kept so existing callers that pass
        it keep working.
    classifier_model: callable that takes a list of texts and returns one
        prediction per text, each prediction being a sequence whose first
        item holds the keys 'label' and 'score'. When not provided the
        classifier is read from st.session_state['netzero_classifier'].
        NOTE(review): this function is wrapped by st.cache_data, which hashes
        its arguments - confirm that passing a model object here is
        supported by the caching layer.

    Return
    ----------
    df: DataFrame holding the rows whose 'Target Label' is 'NEGATIVE'
        followed by the rows whose 'Target Label' is 'TARGET', with two extra
        columns 'Netzero Label' and 'Netzero Score'. 'NEGATIVE' rows keep
        the placeholder 'NA' in both columns, 'TARGET' rows carry the
        predicted label and score. Rows with any other 'Target Label' are
        not part of the output. The index is renumbered starting at 1.
    """
    logging.info("Working on Netzero Extraction")

    # Work on a copy: adding the placeholder columns to the caller's frame
    # would be a hidden side effect that only happens on a cache miss.
    docs = haystack_doc.copy()
    docs['Netzero Label'] = 'NA'
    docs['Netzero Score'] = 'NA'

    # Explicit copy, because columns are assigned on this subset below and
    # assigning into a filtered selection is ambiguous in pandas.
    targets = docs[docs['Target Label'] == 'TARGET'].copy()
    negatives = docs[docs['Target Label'] == 'NEGATIVE']

    if not classifier_model:
        classifier_model = st.session_state['netzero_classifier']

    # Without any 'TARGET' paragraph there is nothing to classify; the
    # placeholders are kept instead of unpacking an empty result list.
    if not targets.empty:
        results = classifier_model(list(targets.text))
        targets['Netzero Label'] = [res[0]['label'] for res in results]
        targets['Netzero Score'] = [res[0]['score'] for res in results]

    df = pd.concat([negatives, targets]).reset_index(drop=True)
    # 1-based numbering of the paragraphs
    df.index += 1

    return df
utils/preprocessing.py CHANGED
@@ -150,20 +150,36 @@ def basic(s:str, remove_punc:bool = False):
150
 
151
  return s.strip()
152
 
153
- def paraLengthCheck(paraList, max_len = 512):
 
 
 
 
 
 
 
 
 
 
 
 
154
  new_para_list = []
155
  for passage in paraList:
156
- if len(passage.split()) > max_len:
157
- iterations = int(len(passage.split())/max_len)
158
- # # st.write("Splitting")
 
 
159
  for i in range(iterations):
160
- temp = " ".join(passage.split()[max_len*i:max_len*(i+1)])
161
- new_para_list.append(temp)
162
- temp = " ".join(passage.split()[max_len*(i+1):])
163
- new_para_list.append(temp)
164
  else:
165
- new_para_list.append(passage)
166
-
 
 
167
  return new_para_list
168
 
169
  class UdfPreProcessor(BaseComponent):
 
150
 
151
  return s.strip()
152
 
153
def paraLengthCheck(paraList, max_len = 100):
    """
    There are cases where preprocessor cannot respect word limit, when using
    respect sentence boundary flag due to missing sentence boundaries.
    Therefore we run one more round of split here for those paragraphs

    Params
    ---------------
    paraList : iterable of paragraph objects exposing the text as `.content`
               and the page as `.meta['page']`
    max_len : max number of whitespace separated words allowed per paragraph,
              needs to be at least 1

    Return
    ---------------
    list of (text, page) tuples. Paragraphs within the limit are passed on
    with their text untouched. Longer paragraphs are cut into consecutive
    chunks of at most `max_len` words, each chunk joined by single spaces
    and paired with the page of the paragraph it was cut from.

    Raises
    ---------------
    ValueError : if max_len is smaller than 1
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    new_para_list = []
    for passage in paraList:
        page = passage.meta['page']
        # split only once and reuse the words for every chunk
        words = passage.content.split()
        if len(words) > max_len:
            # stepping through the words in strides of max_len yields the
            # full chunks plus the shorter remainder, and never an empty
            # chunk when the word count is an exact multiple of max_len
            for start in range(0, len(words), max_len):
                chunk = " ".join(words[start:start + max_len])
                new_para_list.append((chunk, page))
        else:
            # paragraphs which dont need any splitting
            new_para_list.append((passage.content, page))

    logging.info("New paragraphs length {}".format(len(new_para_list)))
    return new_para_list
184
 
185
  class UdfPreProcessor(BaseComponent):
utils/sector_classifier.py CHANGED
@@ -11,12 +11,6 @@ from haystack.nodes import TransformersDocumentClassifier
11
  from transformers import pipeline
12
 
13
 
14
- # # Labels dictionary ###
15
- # _lab_dict = {
16
- # 'NEGATIVE':'NO NETZERO TARGET',
17
- # 'NETZERO':'NETZERO TARGET',
18
- # }
19
-
20
  @st.cache_resource
21
  def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
22
  """
@@ -58,53 +52,10 @@ def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
58
  return doc_classifier
59
 
60
 
61
- def runSectorPreprocessingPipeline(file_name:str, file_path:str,
62
- split_by: Literal["sentence", "word"] = 'sentence',
63
- split_length:int = 2, split_respect_sentence_boundary:bool = False,
64
- split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
65
- """
66
- creates the pipeline and runs the preprocessing pipeline,
67
- the params for pipeline are fetched from paramconfig
68
- Params
69
- ------------
70
- file_name: filename, in case of streamlit application use
71
- st.session_state['filename']
72
- file_path: filepath, in case of streamlit application use st.session_state['filepath']
73
- split_by: document splitting strategy either as word or sentence
74
- split_length: when synthetically creating the paragrpahs from document,
75
- it defines the length of paragraph.
76
- split_respect_sentence_boundary: Used when using 'word' strategy for
77
- splititng of text.
78
- split_overlap: Number of words or sentences that overlap when creating
79
- the paragraphs. This is done as one sentence or 'some words' make sense
80
- when read in together with others. Therefore the overlap is used.
81
- remove_punc: to remove all Punctuation including ',' and '.' or not
82
- Return
83
- --------------
84
- List[Document]: When preprocessing pipeline is run, the output dictionary
85
- has four objects. For the Haysatck implementation of SDG classification we,
86
- need to use the List of Haystack Document, which can be fetched by
87
- key = 'documents' on output.
88
- """
89
-
90
- sector_processing_pipeline = processingpipeline()
91
-
92
- output_sector_pre = sector_processing_pipeline.run(file_paths = file_path,
93
- params= {"FileConverter": {"file_path": file_path, \
94
- "file_name": file_name},
95
- "UdfPreProcessor": {"remove_punc": remove_punc, \
96
- "split_by": split_by, \
97
- "split_length":split_length,\
98
- "split_overlap": split_overlap, \
99
- "split_respect_sentence_boundary":split_respect_sentence_boundary}})
100
-
101
- return output_sector_pre
102
-
103
-
104
  @st.cache_data
105
- def sector_classification(haystack_doc:List[Document],
106
- threshold:float = 0.8,
107
- classifier_model:TransformersDocumentClassifier= None
108
  )->Tuple[DataFrame,Series]:
109
  """
110
  Text-Classification on the list of texts provided. Classifier provides the
@@ -126,10 +77,14 @@ def sector_classification(haystack_doc:List[Document],
126
  the number of times it is covered/discussed/count_of_paragraphs.
127
  """
128
  logging.info("Working on Sector Identification")
 
 
 
129
  if not classifier_model:
130
  classifier_model = st.session_state['sector_classifier']
131
 
132
- predictions = classifier_model(haystack_doc)
 
133
  list_ = []
134
  for i in range(len(predictions)):
135
 
@@ -138,9 +93,16 @@ def sector_classification(haystack_doc:List[Document],
138
  for j in range(len(temp)):
139
  placeholder[temp[j]['label']] = temp[j]['score']
140
  list_.append(placeholder)
141
- labels_ = [{**{'text':haystack_doc[l]},**list_[l]} for l in range(len(predictions))]
142
- # labels_= [{**l.meta['classification']['details'],**{'text':l.content}} for l in results]
143
- df = DataFrame.from_dict(labels_)
144
- df = df.round(2)
145
-
 
 
 
 
 
 
 
146
  return df
 
11
  from transformers import pipeline
12
 
13
 
 
 
 
 
 
 
14
  @st.cache_resource
15
  def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
16
  """
 
52
  return doc_classifier
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  @st.cache_data
56
+ def sector_classification(haystack_doc:pd.DataFrame,
57
+ threshold:float = 0.5,
58
+ classifier_model:pipeline= None
59
  )->Tuple[DataFrame,Series]:
60
  """
61
  Text-Classification on the list of texts provided. Classifier provides the
 
77
  the number of times it is covered/discussed/count_of_paragraphs.
78
  """
79
  logging.info("Working on Sector Identification")
80
+ haystack_doc['Sector Label'] = 'NA'
81
+ df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
82
+ df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
83
  if not classifier_model:
84
  classifier_model = st.session_state['sector_classifier']
85
 
86
+ predictions = classifier_model(list(df1.text))
87
+
88
  list_ = []
89
  for i in range(len(predictions)):
90
 
 
93
  for j in range(len(temp)):
94
  placeholder[temp[j]['label']] = temp[j]['score']
95
  list_.append(placeholder)
96
+ labels_ = [{**list_[l]} for l in range(len(predictions))]
97
+ truth_df = DataFrame.from_dict(labels_)
98
+ truth_df = truth_df.round(2)
99
+ truth_df = truth_df.astype(float) >= threshold
100
+ truth_df = truth_df.astype(str)
101
+ categories = list(truth_df.columns)
102
+ truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
103
+ None for i in categories}, axis=1)
104
+ truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
105
+ -{None}),axis=1)
106
+ df1['Sector Label'] = list(truth_df['Sector Label'])
107
+ df = pd.concat([df,df1])
108
  return df
utils/target_classifier.py CHANGED
@@ -8,11 +8,12 @@ from pandas import DataFrame, Series
8
  from utils.config import getconfig
9
  from utils.preprocessing import processingpipeline
10
  import streamlit as st
 
11
 
12
  ## Labels dictionary ###
13
  _lab_dict = {
14
- 'LABEL_0':'NO TARGET INFO',
15
- 'LABEL_1':'ECONOMY-WIDE TARGET',
16
  }
17
 
18
  @st.cache_resource
@@ -38,61 +39,19 @@ def load_targetClassifier(config_file:str = None, classifier_name:str = None):
38
  config = getconfig(config_file)
39
  classifier_name = config.get('target','MODEL')
40
 
41
- logging.info("Loading classifier")
42
- doc_classifier = TransformersDocumentClassifier(
43
- model_name_or_path=classifier_name,
44
- task="text-classification")
 
45
 
46
  return doc_classifier
47
 
48
 
49
- def runTargetPreprocessingPipeline(file_name:str, file_path:str,
50
- split_by: Literal["sentence", "word"] = 'sentence',
51
- split_length:int = 2, split_respect_sentence_boundary:bool = False,
52
- split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
53
- """
54
- creates the pipeline and runs the preprocessing pipeline,
55
- the params for pipeline are fetched from paramconfig
56
- Params
57
- ------------
58
- file_name: filename, in case of streamlit application use
59
- st.session_state['filename']
60
- file_path: filepath, in case of streamlit application use st.session_state['filepath']
61
- split_by: document splitting strategy either as word or sentence
62
- split_length: when synthetically creating the paragrpahs from document,
63
- it defines the length of paragraph.
64
- split_respect_sentence_boundary: Used when using 'word' strategy for
65
- splititng of text.
66
- split_overlap: Number of words or sentences that overlap when creating
67
- the paragraphs. This is done as one sentence or 'some words' make sense
68
- when read in together with others. Therefore the overlap is used.
69
- remove_punc: to remove all Punctuation including ',' and '.' or not
70
- Return
71
- --------------
72
- List[Document]: When preprocessing pipeline is run, the output dictionary
73
- has four objects. For the Haysatck implementation of SDG classification we,
74
- need to use the List of Haystack Document, which can be fetched by
75
- key = 'documents' on output.
76
- """
77
-
78
- target_processing_pipeline = processingpipeline()
79
-
80
- output_target_pre = target_processing_pipeline.run(file_paths = file_path,
81
- params= {"FileConverter": {"file_path": file_path, \
82
- "file_name": file_name},
83
- "UdfPreProcessor": {"remove_punc": remove_punc, \
84
- "split_by": split_by, \
85
- "split_length":split_length,\
86
- "split_overlap": split_overlap, \
87
- "split_respect_sentence_boundary":split_respect_sentence_boundary}})
88
-
89
- return output_target_pre
90
-
91
-
92
  @st.cache_data
93
- def target_classification(haystack_doc:List[Document],
94
- threshold:float = 0.8,
95
- classifier_model:TransformersDocumentClassifier= None
96
  )->Tuple[DataFrame,Series]:
97
  """
98
  Text-Classification on the list of texts provided. Classifier provides the
@@ -117,22 +76,16 @@ def target_classification(haystack_doc:List[Document],
117
  if not classifier_model:
118
  classifier_model = st.session_state['target_classifier']
119
 
120
- results = classifier_model.predict(haystack_doc)
121
- labels_= [(l.meta['classification']['label'],
122
- l.meta['classification']['score'],l.meta['page'],l.content,) for l in results]
123
 
124
 
125
- df = DataFrame(labels_, columns=["Target Label","Relevancy","page","text"])
 
126
 
127
  df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
128
  df.index += 1
129
- # df =df[df['Relevancy']>threshold]
130
  df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
131
 
132
- # creating the dataframe for value counts of Labels, along with 'title' of Labels
133
- # count_df = df['Target Label'].value_counts()
134
- # count_df = count_df.rename('count')
135
- # count_df = count_df.rename_axis('Target Label').reset_index()
136
- # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
137
-
138
  return df
 
8
  from utils.config import getconfig
9
  from utils.preprocessing import processingpipeline
10
  import streamlit as st
11
+ from transformers import pipeline
12
 
13
  ## Labels dictionary ###
14
  _lab_dict = {
15
+ 'NEGATIVE':'NO TARGET INFO',
16
+ 'TARGET':'TARGET',
17
  }
18
 
19
  @st.cache_resource
 
39
  config = getconfig(config_file)
40
  classifier_name = config.get('target','MODEL')
41
 
42
+ logging.info("Loading classifier")
43
+
44
+ doc_classifier = pipeline("text-classification",
45
+ model=classifier_name,
46
+ top_k =1)
47
 
48
  return doc_classifier
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  @st.cache_data
52
+ def target_classification(haystack_doc:pd.DataFrame,
53
+ threshold:float = 0.5,
54
+ classifier_model:pipeline= None
55
  )->Tuple[DataFrame,Series]:
56
  """
57
  Text-Classification on the list of texts provided. Classifier provides the
 
76
  if not classifier_model:
77
  classifier_model = st.session_state['target_classifier']
78
 
79
+ results = classifier_model(list(haystack_doc.text))
80
+ labels_= [(l[0]['label'],
81
+ l[0]['score']) for l in results]
82
 
83
 
84
+ df1 = DataFrame(labels_, columns=["Target Label","Relevancy"])
85
+ df = pd.concat([haystack_doc,df1],axis=1)
86
 
87
  df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
88
  df.index += 1
 
89
  df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
90
 
 
 
 
 
 
 
91
  return df
utils/uploadAndExample.py CHANGED
@@ -11,6 +11,12 @@ def add_upload(choice):
11
  """
12
 
13
  if choice == 'Upload Document':
 
 
 
 
 
 
14
  uploaded_file = st.sidebar.file_uploader('Upload the File',
15
  type=['pdf', 'docx', 'txt'])
16
  if uploaded_file is not None:
 
11
  """
12
 
13
  if choice == 'Upload Document':
14
+
15
+ if 'filename' in st.session_state:
16
+ # Delete all the items in Session state
17
+ for key in st.session_state.keys():
18
+ del st.session_state[key]
19
+
20
  uploaded_file = st.sidebar.file_uploader('Upload the File',
21
  type=['pdf', 'docx', 'txt'])
22
  if uploaded_file is not None: