ppsingh commited on
Commit
0e3ebc4
·
1 Parent(s): 2472737
app.py CHANGED
@@ -3,6 +3,8 @@ import appStore.netzero as netzero
3
  import appStore.sector as sector
4
  import appStore.adapmit as adapmit
5
  import appStore.ghg as ghg
 
 
6
  import appStore.doc_processing as processing
7
  from utils.uploadAndExample import add_upload
8
  import streamlit as st
@@ -52,7 +54,7 @@ with st.expander("ℹ️ - About this app", expanded=False):
52
  """)
53
  st.write("")
54
  apps = [processing.app, target_extraction.app, netzero.app, ghg.app,
55
- sector.app, adapmit.app]
56
  multiplier_val =1/len(apps)
57
  if st.button("Analyze Document"):
58
  prg = st.progress(0.0)
@@ -60,6 +62,17 @@ if st.button("Analyze Document"):
60
  func()
61
  prg.progress((i+1)*multiplier_val)
62
 
63
- if 'key1' in st.session_state:
 
 
 
 
 
 
 
64
  target_extraction.target_display()
65
- # st.write(st.session_state.key1)
 
 
 
 
 
3
  import appStore.sector as sector
4
  import appStore.adapmit as adapmit
5
  import appStore.ghg as ghg
6
+ import appStore.policyaction as policyaction
7
+ import appStore.indicator as indicator
8
  import appStore.doc_processing as processing
9
  from utils.uploadAndExample import add_upload
10
  import streamlit as st
 
54
  """)
55
  st.write("")
56
  apps = [processing.app, target_extraction.app, netzero.app, ghg.app,
57
+ sector.app, policyaction.app, indicator.app, adapmit.app]
58
  multiplier_val =1/len(apps)
59
  if st.button("Analyze Document"):
60
  prg = st.progress(0.0)
 
62
  func()
63
  prg.progress((i+1)*multiplier_val)
64
 
65
+
66
+ if 'key1' in st.session_state:
67
+ with st.sidebar:
68
+ topic = st.radio(
69
+ "Which category you want to explore?",
70
+ ('Target', 'Action', 'Policies/Plans'))
71
+
72
+ if topic == 'Target':
73
  target_extraction.target_display()
74
+ elif topic == 'Action':
75
+ policyaction.action_display()
76
+ else:
77
+ policyaction.policy_display()
78
+ # st.write(st.session_state.key1)
appStore/indicator.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ from utils.indicator_classifier import load_indicatorClassifier, indicator_classification
12
+ import logging
13
+ logger = logging.getLogger(__name__)
14
+ from utils.config import get_classifier_params
15
+ from utils.preprocessing import paraLengthCheck
16
+ from io import BytesIO
17
+ import xlsxwriter
18
+ import plotly.express as px
19
+
20
+
21
+ # Declare all the necessary variables
22
+ classifier_identifier = 'indicator'
23
+ params = get_classifier_params(classifier_identifier)
24
+
25
@st.cache_data
def to_excel(df,sectorlist):
    """
    Write the dataframe to an in-memory .xlsx workbook (sheet 'Sheet1', header
    row, no index column) and attach drop-down list validations to the
    spreadsheet columns S to X.

    Params
    --------
    df: dataframe to export.
    sectorlist: list of category names; together with 'Blank' they form the
        choices offered in columns T, U, V, W and X.

    Return: bytes of the generated workbook.

    NOTE(review): the column letters are fixed, so the validations only line up
    if the review columns of `df` really sit at S..X - confirm against caller.
    """
    n_rows = len(df)
    # the header occupies Excel row 1, so data lives in rows 2 .. n_rows + 1
    last_row = n_rows + 1
    output = BytesIO()
    # the context manager finalises the workbook on exit; ExcelWriter.save()
    # is no longer available in pandas 2.x
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']
        # with no data rows there is nothing to validate
        if n_rows > 0:
            worksheet.data_validation('S2:S{}'.format(last_row),
                                      {'validate': 'list',
                                       'source': ['No', 'Yes', 'Discard']})
            # every sector column gets the same list; each range now ends in
            # its own column (the original used 'W2:U{}' for column W)
            for column in ('T', 'U', 'V', 'W', 'X'):
                worksheet.data_validation(
                    '{0}2:{0}{1}'.format(column, last_row),
                    {'validate': 'list',
                     'source': sectorlist + ['Blank']})
    return output.getvalue()
54
+
55
def app():
    """
    Run the indicator classifier on the dataframe stored in the streamlit
    session under 'key1' and write the classified dataframe back to the same
    key. Does nothing when 'key1' is not present in the session.
    """
    ### Main app code ###
    with st.container():
        if 'key1' not in st.session_state:
            return

        paragraphs = st.session_state.key1

        # the loaded model is shared through the session under
        # '<classifier_identifier>_classifier'
        session_key = '{}_classifier'.format(classifier_identifier)
        st.session_state[session_key] = load_indicatorClassifier(
            classifier_name=params['model_name'])

        # message prepared for documents with many target paragraphs;
        # it is not displayed anywhere in this function
        many_targets = sum(paragraphs['Target Label'] == 'TARGET') > 100
        warning_msg = (": This might take sometime, please sit back and relax."
                       if many_targets else "")

        st.session_state.key1 = indicator_classification(
            haystack_doc=paragraphs,
            threshold=params['threshold'])
74
+
75
+
76
+ # # st.write(df)
77
+ # threshold= params['threshold']
78
+ # truth_df = df.drop(['text'],axis=1)
79
+ # truth_df = truth_df.astype(float) >= threshold
80
+ # truth_df = truth_df.astype(str)
81
+ # categories = list(truth_df.columns)
82
+
83
+ # placeholder = {}
84
+ # for val in categories:
85
+ # placeholder[val] = dict(truth_df[val].value_counts())
86
+ # count_df = pd.DataFrame.from_dict(placeholder)
87
+ # count_df = count_df.T
88
+ # count_df = count_df.reset_index()
89
+ # # st.write(count_df)
90
+ # placeholder = []
91
+ # for i in range(len(count_df)):
92
+ # placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
93
+ # placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
94
+ # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
95
+ # # st.write("Total Paragraphs: {}".format(len(df)))
96
+ # fig = px.bar(count_df, x='category', y='count',
97
+ # color='truth_value')
98
+ # # c1, c2 = st.columns([1,1])
99
+ # # with c1:
100
+ # st.plotly_chart(fig,use_container_width= True)
101
+
102
+ # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
103
+ # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
104
+ # # st.write(truth_df)
105
+ # df = pd.concat([df,truth_df['labels']],axis=1)
106
+ # df['Validation'] = 'No'
107
+ # df['Sector1'] = 'Blank'
108
+ # df['Sector2'] = 'Blank'
109
+ # df['Sector3'] = 'Blank'
110
+ # df['Sector4'] = 'Blank'
111
+ # df['Sector5'] = 'Blank'
112
+ # df_xlsx = to_excel(df,categories)
113
+ # st.download_button(label='📥 Download Current Result',
114
+ # data=df_xlsx ,
115
+ # # file_name= 'file_sector.xlsx')
116
+ # else:
117
+ # st.info("🤔 No document found, please try to upload it at the sidebar!")
118
+ # logging.warning("Terminated as no document provided")
119
+
120
+ # # Creating truth value dataframe
121
+ # if 'key' in st.session_state:
122
+ # if st.session_state.key is not None:
123
+ # df = st.session_state.key
124
+ # st.markdown("###### Select the threshold for classifier ######")
125
+ # c4, c5 = st.columns([1,1])
126
+
127
+ # with c4:
128
+ # threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
129
+ # step=0.01, value=0.5,
130
+ # help = "Keep High Value if want refined result, low if dont want to miss anything" )
131
+ # sectors =set(df.columns)
132
+ # removecols = {'Validation','Sector1','Sector2','Sector3','Sector4',
133
+ # 'Sector5','text'}
134
+ # sectors = list(sectors - removecols)
135
+
136
+ # placeholder = {}
137
+ # for val in sectors:
138
+ # temp = df[val].astype(float) > threshold
139
+ # temp = temp.astype(str)
140
+ # placeholder[val] = dict(temp.value_counts())
141
+
142
+ # count_df = pd.DataFrame.from_dict(placeholder)
143
+ # count_df = count_df.T
144
+ # count_df = count_df.reset_index()
145
+ # placeholder = []
146
+ # for i in range(len(count_df)):
147
+ # placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'False'])
148
+ # placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'True'])
149
+
150
+ # count_df = pd.DataFrame(placeholder, columns = ['sector','count','truth_value'])
151
+ # fig = px.bar(count_df, x='sector', y='count',
152
+ # color='truth_value',
153
+ # height=400)
154
+ # st.write("")
155
+ # st.plotly_chart(fig)
156
+
157
+ # df['Validation'] = 'No'
158
+ # df['Sector1'] = 'Blank'
159
+ # df['Sector2'] = 'Blank'
160
+ # df['Sector3'] = 'Blank'
161
+ # df['Sector4'] = 'Blank'
162
+ # df['Sector5'] = 'Blank'
163
+ # df_xlsx = to_excel(df,sectors)
164
+ # st.download_button(label='📥 Download Current Result',
165
+ # data=df_xlsx ,
166
+ # file_name= 'file_sector.xlsx')
appStore/policyaction.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
+
5
+ #import needed libraries
6
+ import seaborn as sns
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+ from utils.policyaction_classifier import load_policyactionClassifier, policyaction_classification
12
+ import logging
13
+ logger = logging.getLogger(__name__)
14
+ from utils.config import get_classifier_params
15
+ from utils.preprocessing import paraLengthCheck
16
+ from io import BytesIO
17
+ import xlsxwriter
18
+ import plotly.express as px
19
+
20
+
21
+ # Declare all the necessary variables
22
+ classifier_identifier = 'policyaction'
23
+ params = get_classifier_params(classifier_identifier)
24
+
25
@st.cache_data
def to_excel(df):
    """
    Export the dataframe to an in-memory .xlsx workbook (sheet 'Sheet1', header
    row, no index column). Five manual-review columns, pre-filled with 'No',
    are appended and each one gets a 'No' / 'Yes' / 'Discard' drop-down.

    Params
    --------
    df: dataframe to export. It is copied first, so the caller's dataframe is
        not modified by the export.

    Return: bytes of the generated workbook.
    """
    validation_columns = ['Target Validation', 'Netzero Validation',
                          'GHG Validation', 'Adapt-Mitig Validation',
                          'Sector']
    # a cached function must not mutate its argument: work on a copy
    df = df.copy()
    for name in validation_columns:
        df[name] = 'No'

    n_rows = len(df)
    output = BytesIO()
    # the context manager finalises the workbook on exit; ExcelWriter.save()
    # is no longer available in pandas 2.x
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']
        # with no data rows there is nothing to validate
        if n_rows > 0:
            column_names = list(df.columns)
            for name in validation_columns:
                # locate the column by name instead of relying on fixed
                # spreadsheet letters (the original ranges 'M2:L..' to
                # 'P2:L..' all ended in column L)
                col = column_names.index(name)
                # zero-based (first_row, first_col, last_row, last_col);
                # row 0 is the header, data fills rows 1 .. n_rows
                worksheet.data_validation(1, col, n_rows, col,
                                          {'validate': 'list',
                                           'source': ['No', 'Yes', 'Discard']})
    return output.getvalue()
56
+
57
def app():
    """
    Run the policy/action classifier on the dataframe stored in the streamlit
    session under 'key1' and write the classified dataframe back to the same
    key. Does nothing when 'key1' is not present in the session.
    """
    ### Main app code ###
    with st.container():
        if 'key1' not in st.session_state:
            return

        paragraphs = st.session_state.key1

        # the loaded model is shared through the session under
        # '<classifier_identifier>_classifier'
        session_key = '{}_classifier'.format(classifier_identifier)
        st.session_state[session_key] = load_policyactionClassifier(
            classifier_name=params['model_name'])

        # message prepared for documents with many target paragraphs;
        # it is not displayed anywhere in this function
        many_targets = sum(paragraphs['Target Label'] == 'TARGET') > 100
        warning_msg = (": This might take sometime, please sit back and relax."
                       if many_targets else "")

        st.session_state.key1 = policyaction_classification(
            haystack_doc=paragraphs,
            threshold=params['threshold'])
76
+
77
+
78
+
79
def action_display():
    """
    Show the paragraphs whose 'Policy-Action Label' contains 'Action'.

    Reads the dataframe stored in the streamlit session under 'key1'. When at
    least one paragraph qualifies, the first (up to) five hits are written out
    with their page, sector, indicator and adapt-mitig labels, the full hit
    table is displayed, and a download button for the Excel export of the
    whole dataframe is placed in the sidebar. Otherwise an info message is
    shown. Does nothing when 'key1' is not present in the session.
    """
    if 'key1' not in st.session_state:
        return

    df = st.session_state.key1

    # a boolean mask is used instead of a temporary 'Action_check' column, so
    # the session dataframe is never left with a stray helper column (the
    # original only removed it when hits were found) and the displayed table
    # does not contain it either
    is_action = df['Policy-Action Label'].apply(
        lambda x: 'Action' in x).astype(bool)
    hits = df[is_action]
    count_action = len(hits)

    if count_action == 0:
        st.info("🤔 No Actions found")
        return

    st.write("")
    st.markdown("###### Top few Action Classified paragraph/text results from list of {} classified paragraphs ######".format(count_action))
    st.markdown("""<hr style="height:10px;border:none;color:#097969;background-color:#097969;" /> """, unsafe_allow_html=True)

    result_header = ('**Result {}** : `page {}`, `Sector: {}`, '
                     '`Indicators: {}`, `Adapt-Mitig :{}`')
    for i in range(min(5, count_action)):
        # the page number reflects the page that contains the main paragraph
        # according to split limit, the overlapping part can be on a separate page
        row = hits.iloc[i]
        st.write(result_header.format(i+1,
                                      row['page'], row['Sector Label'],
                                      row['Indicator Label'],
                                      row['Adapt-Mitig Label']))
        st.write("\t Text: \t{}".format(row['text'].replace("\n", " ")))

    # positional access above does not depend on the index, so it is enough
    # to renumber the rows once before showing the table
    hits = hits.reset_index(drop =True)
    st.write('----------------')
    st.write('Explore the data')
    st.write(hits)
    df_xlsx = to_excel(df)

    with st.sidebar:
        st.write('-------------')
        st.download_button(label='📥 Download Result',
                           data=df_xlsx ,
                           file_name= 'cpu_analysis.xlsx')
158
+
159
+
160
def policy_display():
    """
    Show the paragraphs whose 'Policy-Action Label' contains
    'Policies & Plans'.

    Reads the dataframe stored in the streamlit session under 'key1'. When at
    least one paragraph qualifies, the first (up to) five hits are written out
    with their page, sector, indicator and adapt-mitig labels, the full hit
    table is displayed, and a download button for the Excel export of the
    whole dataframe is placed in the sidebar. Otherwise an info message is
    shown. Does nothing when 'key1' is not present in the session.
    """
    if 'key1' not in st.session_state:
        return

    df = st.session_state.key1

    # a boolean mask is used instead of a temporary 'Policy_check' column, so
    # the session dataframe is never left with a stray helper column (the
    # original only removed it when hits were found) and the displayed table
    # does not contain it either
    is_policy = df['Policy-Action Label'].apply(
        lambda x: 'Policies & Plans' in x).astype(bool)
    hits = df[is_policy]
    count_policy = len(hits)

    if count_policy == 0:
        st.info("🤔 No Policy/Plans found")
        return

    st.write("")
    st.markdown("###### Top few Policy/Plans Classified paragraph/text results from list of {} classified paragraphs ######".format(count_policy))
    st.markdown("""<hr style="height:10px;border:none;color:#097969;background-color:#097969;" /> """, unsafe_allow_html=True)

    result_header = ('**Result {}** : `page {}`, `Sector: {}`, '
                     '`Indicators: {}`, `Adapt-Mitig :{}`')
    for i in range(min(5, count_policy)):
        # the page number reflects the page that contains the main paragraph
        # according to split limit, the overlapping part can be on a separate page
        row = hits.iloc[i]
        st.write(result_header.format(i+1,
                                      row['page'], row['Sector Label'],
                                      row['Indicator Label'],
                                      row['Adapt-Mitig Label']))
        st.write("\t Text: \t{}".format(row['text'].replace("\n", " ")))

    # positional access above does not depend on the index, so it is enough
    # to renumber the rows once before showing the table
    hits = hits.reset_index(drop =True)
    st.write('----------------')
    st.write('Explore the data')
    st.write(hits)
    df_xlsx = to_excel(df)

    with st.sidebar:
        st.write('-------------')
        st.download_button(label='📥 Download Result',
                           data=df_xlsx ,
                           file_name= 'cpu_analysis.xlsx')
appStore/target.py CHANGED
@@ -102,13 +102,7 @@ def target_display():
102
  if 'key1' in st.session_state:
103
  df = st.session_state.key1
104
 
105
- _lab_dict = {
106
- 'LABEL_0':'NEGATIVE',
107
- 'LABEL_1':'NOT GHG',
108
- 'LABEL_2':'GHG',
109
- 'NA':'NA',
110
- }
111
- df['GHG Label'] = df['GHG Label'].apply(lambda i: _lab_dict[i])
112
  hits = df[df['Target Label'] == 'TARGET']
113
  # hits['GHG Label'] = hits['GHG Label'].apply(lambda i: _lab_dict[i])
114
  range_val = min(5,len(hits))
 
102
  if 'key1' in st.session_state:
103
  df = st.session_state.key1
104
 
105
+
 
 
 
 
 
 
106
  hits = df[df['Target Label'] == 'TARGET']
107
  # hits['GHG Label'] = hits['GHG Label'].apply(lambda i: _lab_dict[i])
108
  range_val = min(5,len(hits))
paramconfig.cfg CHANGED
@@ -56,4 +56,24 @@ REMOVE_PUNC = 0
56
  SPLIT_LENGTH = 60
57
  SPLIT_OVERLAP = 10
58
  RESPECT_SENTENCE_BOUNDARY = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  TOP_KEY = 10
 
56
  SPLIT_LENGTH = 60
57
  SPLIT_OVERLAP = 10
58
  RESPECT_SENTENCE_BOUNDARY = 1
59
+ TOP_KEY = 10
60
+
61
+ [policyaction]
62
+ THRESHOLD = 0.50
63
+ MODEL = ppsingh/action-policy-plans-classifier
64
+ SPLIT_BY = word
65
+ REMOVE_PUNC = 0
66
+ SPLIT_LENGTH = 60
67
+ SPLIT_OVERLAP = 10
68
+ RESPECT_SENTENCE_BOUNDARY = 1
69
+ TOP_KEY = 10
70
+
71
+ [indicator]
72
+ THRESHOLD = 0.50
73
+ MODEL = ilaria-oneofftech/ikitracs_mitigation
74
+ SPLIT_BY = word
75
+ REMOVE_PUNC = 0
76
+ SPLIT_LENGTH = 60
77
+ SPLIT_OVERLAP = 10
78
+ RESPECT_SENTENCE_BOUNDARY = 1
79
  TOP_KEY = 10
requirements.txt CHANGED
@@ -15,5 +15,6 @@ markdown==3.4.1
15
  summa==1.2.0
16
  plotly
17
  xlsxwriter
 
18
  streamlit-aggrid
19
  python-docx
 
15
  summa==1.2.0
16
  plotly
17
  xlsxwriter
18
+ altair==4.0
19
  streamlit-aggrid
20
  python-docx
utils/adapmit_classifier.py CHANGED
@@ -67,13 +67,13 @@ def adapmit_classification(haystack_doc:pd.DataFrame,
67
  """
68
  logging.info("Working on Adaptation-Mitigation Identification")
69
  haystack_doc['Adapt-Mitig Label'] = 'NA'
70
- df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
71
- df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
72
 
73
  if not classifier_model:
74
  classifier_model = st.session_state['adapmit_classifier']
75
 
76
- predictions = classifier_model(list(df1.text))
77
  # converting the predictions to desired format
78
  list_ = []
79
  for i in range(len(predictions)):
@@ -93,7 +93,7 @@ def adapmit_classification(haystack_doc:pd.DataFrame,
93
  else None for i in categories}, axis=1)
94
  truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
95
  list(x['Adapt-Mitig Label'] -{None}),axis=1)
96
- df1['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
97
- df = pd.concat([df,df1])
98
 
99
- return df
 
67
  """
68
  logging.info("Working on Adaptation-Mitigation Identification")
69
  haystack_doc['Adapt-Mitig Label'] = 'NA'
70
+ # df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
71
+ # df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
72
 
73
  if not classifier_model:
74
  classifier_model = st.session_state['adapmit_classifier']
75
 
76
+ predictions = classifier_model(list(haystack_doc.text))
77
  # converting the predictions to desired format
78
  list_ = []
79
  for i in range(len(predictions)):
 
93
  else None for i in categories}, axis=1)
94
  truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
95
  list(x['Adapt-Mitig Label'] -{None}),axis=1)
96
+ haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
97
+ #df = pd.concat([df,df1])
98
 
99
+ return haystack_doc
utils/ghg_classifier.py CHANGED
@@ -10,9 +10,12 @@ from transformers import pipeline
10
 
11
  # Labels dictionary ###
12
  _lab_dict = {
13
- 'NEGATIVE':'NO GHG TARGET',
14
- 'TARGET':'GHG TARGET',
15
- }
 
 
 
16
 
17
  @st.cache_resource
18
  def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
@@ -82,6 +85,7 @@ def ghg_classification(haystack_doc:pd.DataFrame,
82
  labels_= [(l[0]['label'],l[0]['score']) for l in results]
83
  temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
84
  df = pd.concat([df,temp])
 
85
  df = df.reset_index(drop =True)
86
  df.index += 1
87
 
 
10
 
11
  # Labels dictionary ###
12
  _lab_dict = {
13
+ 'LABEL_0':'NEGATIVE',
14
+ 'LABEL_1':'NOT GHG',
15
+ 'LABEL_2':'GHG',
16
+ 'NA':'NA',
17
+ }
18
+
19
 
20
  @st.cache_resource
21
  def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
 
85
  labels_= [(l[0]['label'],l[0]['score']) for l in results]
86
  temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
87
  df = pd.concat([df,temp])
88
+ df['GHG Label'] = df['GHG Label'].apply(lambda i: _lab_dict[i])
89
  df = df.reset_index(drop =True)
90
  df.index += 1
91
 
utils/indicator_classifier.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
+
12
@st.cache_resource
def load_indicatorClassifier(config_file:str = None, classifier_name:str = None):
    """
    Build the transformers "text-classification" pipeline used as indicator
    classifier. Either the model name or a config file has to be supplied.

    Params
    --------
    config_file: path of the config file; the model name is read from the
        'MODEL' entry of its 'indicator' section.
    classifier_name: name/path of the model; when given it takes priority and
        the config file is not read.

    Return: the pipeline object, or None (after logging a warning) when
        neither argument is provided.
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        classifier_name = getconfig(config_file).get('indicator','MODEL')

    logging.info("Loading indicator classifier")
    # The model is multilabel, so a plain transformers pipeline is used here
    # (per the original author's note, Haystack's DocumentClassifier does not
    # support multilabel). 'sigmoid' is requested explicitly; otherwise the
    # pipeline would apply softmax, which is not wanted for multilabel output.
    pipeline_options = {
        'model': classifier_name,
        'return_all_scores': True,
        'function_to_apply': "sigmoid",
    }
    return pipeline("text-classification", **pipeline_options)
51
+
52
+
53
@st.cache_data
def indicator_classification(haystack_doc:pd.DataFrame,
                    threshold:float = 0.5,
                    classifier_model:pipeline= None
                    )->DataFrame:
    """
    Multilabel indicator classification of the paragraphs that already carry
    at least one Policy-Action label.

    Params
    ---------
    haystack_doc: dataframe with (at least) the columns 'text' and
        'Policy-Action Label'. It is not modified.
    threshold: a label is kept when its score, rounded to two decimals, is
        greater than or equal to this value.
    classifier_model: classifier to use; when not passed, the model stored in
        the streamlit session under 'indicator_classifier' is used.

    Returns
    ----------
    df: dataframe with the additional column 'Indicator Label'. Rows with an
        empty 'Policy-Action Label' come first and hold 'NA'; the classified
        rows follow and hold the list of labels that passed the threshold.
        The original index values are kept.
    """
    logging.info("Working on Indicator Identification")
    # True for paragraphs with a non-empty Policy-Action label; a mask is used
    # instead of a temporary 'PA_check' column so the input stays untouched
    has_label = haystack_doc['Policy-Action Label'].apply(
        lambda x: len(x) != 0).astype(bool)

    # explicit copies avoid assigning into a filtered view of the input
    df1 = haystack_doc[has_label].copy()
    df = haystack_doc[~has_label].copy()
    df['Indicator Label'] = 'NA'
    df1['Indicator Label'] = 'NA'

    # the model is only needed (and only called) when there is text to classify
    if not df1.empty:
        if not classifier_model:
            classifier_model = st.session_state['indicator_classifier']

        predictions = classifier_model(list(df1.text))

        # one {label: score} mapping per paragraph
        records = [{item['label']: item['score'] for item in prediction}
                   for prediction in predictions]
        passed = DataFrame(records).round(2).astype(float) >= threshold
        # per paragraph, the labels whose score passed the threshold
        df1['Indicator Label'] = [list(passed.columns[flags])
                                  for flags in passed.to_numpy()]

    df = pd.concat([df,df1])
    return df
utils/policyaction_classifier.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
+ ## Labels dictionary ###
12
+ _lab_dict = {
13
+ 'NEGATIVE':'NO TARGET INFO',
14
+ 'TARGET':'TARGET',
15
+ }
16
+
17
@st.cache_resource
def load_policyactionClassifier(config_file:str = None, classifier_name:str = None):
    """
    Build the transformers "text-classification" pipeline used as
    policy/action classifier. Either the model name or a config file has to
    be supplied.

    Params
    --------
    config_file: path of the config file; the model name is read from the
        'MODEL' entry of its 'policyaction' section.
    classifier_name: name/path of the model; when given it takes priority and
        the config file is not read.

    Return: the pipeline object, or None (after logging a warning) when
        neither argument is provided.
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        classifier_name = getconfig(config_file).get('policyaction','MODEL')

    logging.info("Loading classifier")

    # all label scores are returned and passed through a sigmoid
    pipeline_options = {
        'model': classifier_name,
        'return_all_scores': True,
        'function_to_apply': "sigmoid",
    }
    return pipeline("text-classification", **pipeline_options)
48
+
49
+
50
@st.cache_data
def policyaction_classification(haystack_doc:pd.DataFrame,
                    threshold:float = 0.5,
                    classifier_model:pipeline= None
                    )->DataFrame:
    """
    Multilabel policy/action classification of every paragraph in the
    dataframe.

    Params
    ---------
    haystack_doc: dataframe with (at least) the column 'text'. The column
        'Policy-Action Label' is added to it in place.
    threshold: a label is kept when its score, rounded to two decimals, is
        greater than or equal to this value.
    classifier_model: classifier to use; when not passed, the model stored in
        the streamlit session under 'policyaction_classifier' is used.

    Returns
    ----------
    haystack_doc: the same dataframe, where 'Policy-Action Label' holds for
        each paragraph the list of labels that passed the threshold (an empty
        list when none did).
    """
    logging.info("Working on Policy/Action. Extraction")
    haystack_doc['Policy-Action Label'] = 'NA'

    # nothing to classify: return the (empty) frame with the new column
    if haystack_doc.empty:
        return haystack_doc

    if not classifier_model:
        classifier_model = st.session_state['policyaction_classifier']

    predictions = classifier_model(list(haystack_doc.text))

    # one {label: score} mapping per paragraph
    records = [{item['label']: item['score'] for item in prediction}
               for prediction in predictions]
    passed = DataFrame(records).round(2).astype(float) >= threshold
    # per paragraph, the labels whose score passed the threshold
    haystack_doc['Policy-Action Label'] = [list(passed.columns[flags])
                                           for flags in passed.to_numpy()]

    return haystack_doc
utils/sector_classifier.py CHANGED
@@ -76,12 +76,12 @@ def sector_classification(haystack_doc:pd.DataFrame,
76
  """
77
  logging.info("Working on Sector Identification")
78
  haystack_doc['Sector Label'] = 'NA'
79
- df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
80
- df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
81
  if not classifier_model:
82
  classifier_model = st.session_state['sector_classifier']
83
 
84
- predictions = classifier_model(list(df1.text))
85
 
86
  list_ = []
87
  for i in range(len(predictions)):
@@ -101,6 +101,6 @@ def sector_classification(haystack_doc:pd.DataFrame,
101
  None for i in categories}, axis=1)
102
  truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
103
  -{None}),axis=1)
104
- df1['Sector Label'] = list(truth_df['Sector Label'])
105
- df = pd.concat([df,df1])
106
- return df
 
76
  """
77
  logging.info("Working on Sector Identification")
78
  haystack_doc['Sector Label'] = 'NA'
79
+ # df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
80
+ # df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
81
  if not classifier_model:
82
  classifier_model = st.session_state['sector_classifier']
83
 
84
+ predictions = classifier_model(list(haystack_doc.text))
85
 
86
  list_ = []
87
  for i in range(len(predictions)):
 
101
  None for i in categories}, axis=1)
102
  truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
103
  -{None}),axis=1)
104
+ haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
105
+ # df = pd.concat([df,df1])
106
+ return haystack_doc