Commit · 031e5e20
Parent(s):
Duplicate from ppsingh/cpu-demo
- .gitattributes +35 -0
- .gitignore +1 -0
- .vscode/launch.json +20 -0
- README.md +13 -0
- app.py +20 -0
- appStore/__init__.py +1 -0
- appStore/adapmit.py +212 -0
- appStore/info.py +67 -0
- appStore/multiapp.py +67 -0
- appStore/netzero.py +206 -0
- appStore/sector.py +211 -0
- appStore/target.py +211 -0
- docStore/img/dsc_giz.png +0 -0
- docStore/img/ndc.png +0 -0
- docStore/img/paris.png +0 -0
- docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt +737 -0
- docStore/sample/Seychelles-revised_first_ndc-EN.pdf +0 -0
- docStore/sample/South Africa_s Low Emission Development Strategy.pdf +3 -0
- docStore/sample/files.json +4 -0
- packages.txt +4 -0
- paramconfig.cfg +39 -0
- requirements.txt +19 -0
- style.css +180 -0
- utils/__init__.py +1 -0
- utils/adapmit_classifier.py +136 -0
- utils/config.py +31 -0
- utils/netzero_classifier.py +137 -0
- utils/preprocessing.py +275 -0
- utils/sector_classifier.py +146 -0
- utils/target_classifier.py +138 -0
- utils/uploadAndExample.py +33 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
docStore/sample/South[[:space:]]Africa_s[[:space:]]Low[[:space:]]Emission[[:space:]]Development[[:space:]]Strategy.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__
.vscode/launch.json
ADDED
@@ -0,0 +1,20 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Streamlit",
            "type": "python",
            "request": "launch",
            "program": ".venv/bin/streamlit",
            "args": [
                "run",
                "app.py"
            ],
            "console": "integratedTerminal",
            "justMyCode": false
        }
    ]
}
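The launch configuration above runs the Space through the VS Code debugger by pointing at the streamlit executable inside a local .venv and passing "run app.py" as arguments. Outside the debugger, the same entry point can be started from a terminal with the standard Streamlit CLI: streamlit run app.py.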
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Cpu Demo
emoji: 🦀
colorFrom: blue
colorTo: pink
sdk: streamlit
sdk_version: 1.19.0
app_file: app.py
pinned: false
duplicated_from: ppsingh/cpu-demo
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,20 @@
import appStore.target as target_extraction
import appStore.netzero as netzero
import appStore.sector as sector
import appStore.adapmit as adapmit
# import appStore.info as info
from appStore.multiapp import MultiApp
import streamlit as st

st.set_page_config(page_title = 'Climate Policy Intelligence',
                   initial_sidebar_state='expanded', layout="wide")

app = MultiApp()

# app.add_app("About","house", info.app)
app.add_app("Economy-Wide Target Extraction","gear",target_extraction.app)
app.add_app("NetZero Target Extraction","gear", netzero.app)
app.add_app("Sector Classification","gear", sector.app)
app.add_app("Adaptation-Mitigation","gear", adapmit.app)

app.run()
appStore/__init__.py
ADDED
@@ -0,0 +1 @@
# creating appstore package
appStore/adapmit.py
ADDED
@@ -0,0 +1,212 @@
# set path
import glob, os, sys
sys.path.append('../utils')

#import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
# from st_aggrid import AgGrid
# from st_aggrid.shared import ColumnsAutoSizeMode
from utils.adapmit_classifier import adapmit_classification
from utils.adapmit_classifier import runAdapMitPreprocessingPipeline, load_adapmitClassifier
# from utils.keyword_extraction import textrank
import logging
logger = logging.getLogger(__name__)
from utils.config import get_classifier_params
from utils.preprocessing import paraLengthCheck
from io import BytesIO
import xlsxwriter
import plotly.express as px

# Declare all the necessary variables
classifier_identifier = 'adapmit'
params = get_classifier_params(classifier_identifier)

@st.cache_data
def to_excel(df):
    len_df = len(df)
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, index=False, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    worksheet.data_validation('E2:E{}'.format(len_df),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    worksheet.data_validation('F2:F{}'.format(len_df),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    worksheet.data_validation('G2:G{}'.format(len_df),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    writer.save()
    processed_data = output.getvalue()
    return processed_data

def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Adaptation-Mitigation Classification </h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The **Adaptation-Mitigation Classification** app is an easy-to-use interface built \
            in Streamlit for analyzing policy documents for \
            Classification of the paragraphs/texts in the document *If it \
            belongs to 'Adaptation' and 'Mitigation' category or not. The paragraph \
            can belong to both category too. \
            - developed by GIZ Data Service Center, GFA, IKI Tracs, \
            SV Klima and SPA. \n
            """)
        st.write("""**Document Processing:** The Uploaded/Selected document is \
            automatically cleaned and split into paragraphs with a maximum \
            length of 60 words using a Haystack preprocessing pipeline. The \
            length of 60 is an empirical value which should reflect the length \
            of a “context” and should limit the paragraph length deviation. \
            However, since we want to respect the sentence boundary the limit \
            can breach and hence this limit of 60 is tentative. \n
            """)

        st.write("")

    ### Main app code ###
    with st.container():
        if st.button("RUN Adaptation-Mitigation Classification"):
            if 'key4' not in st.session_state:
                st.session_state['key4'] = None

            if 'filepath' in st.session_state:
                file_name = st.session_state['filename']
                file_path = st.session_state['filepath']

                all_documents = runAdapMitPreprocessingPipeline(file_name= file_name,
                                file_path= file_path, split_by= params['split_by'],
                                split_length= params['split_length'],
                                split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
                                split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
                classifier = load_adapmitClassifier(classifier_name=params['model_name'])
                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
                verified_paralist = paraLengthCheck(all_documents['paraList'], 100)
                if len(verified_paralist) > 100:
                    warning_msg = ": This might take sometime, please sit back and relax."
                else:
                    warning_msg = ""

                # # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
                df = adapmit_classification(haystack_doc=verified_paralist,
                                            threshold= params['threshold'])

                threshold= params['threshold']
                truth_df = df.drop(['text'],axis=1)
                truth_df = truth_df.astype(float) >= threshold
                truth_df = truth_df.astype(str)
                categories = list(truth_df.columns)

                placeholder = {}
                for val in categories:
                    placeholder[val] = dict(truth_df[val].value_counts())
                count_df = pd.DataFrame.from_dict(placeholder)
                count_df = count_df.T
                count_df = count_df.reset_index()
                # st.write(count_df)
                placeholder = []
                for i in range(len(count_df)):
                    placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
                    placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
                count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
                # st.write("Total Paragraphs: {}".format(len(df)))
                fig = px.bar(count_df, y='category', x='count',
                             color='truth_value',orientation='h', height =200)
                c1, c2 = st.columns([1,1])
                with c1:
                    st.plotly_chart(fig,use_container_width= True)

                truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
                truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
                # st.write(truth_df)
                df = pd.concat([df,truth_df['labels']],axis=1)
                st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
                df = df.sort_values(by = ['Mitigation'], ascending=False)
                for i in range(3):
                    if df.iloc[i]['Mitigation'] >= 0.50:
                        st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
                        st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))

                st.markdown("###### Top few 'Adaptation' related paragraph/text ######")
                df = df.sort_values(by = ['Adaptation'], ascending=False)
                for i in range(3):
                    if df.iloc[i]['Adaptation'] > 0.5:
                        st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Adaptation']))
                        st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
                # st.write(df[['text','labels']])
                df['Validation'] = 'No'
                df['Val-Mitigation'] = 'No'
                df['Val-Adaptation'] = 'No'
                df_xlsx = to_excel(df)
                st.download_button(label='📥 Download Current Result',
                                   data=df_xlsx ,
                                   file_name= 'file_adaptation-mitigation.xlsx')
                # st.session_state.key4 =

                # category =set(df.columns)
                # removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
                # category = list(category - removecols)

            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")

    # # Creating truth value dataframe
    # if 'key4' in st.session_state:
    #     if st.session_state.key4 is not None:
    #         df = st.session_state.key4
    #         st.markdown("###### Select the threshold for classifier ######")
    #         c4, c5 = st.columns([1,1])

    #         with c4:
    #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
    #                                   step=0.01, value=0.5,
    #                                   help = "Keep High Value if want refined result, low if dont want to miss anything" )
    #         category =set(df.columns)
    #         removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
    #         category = list(category - removecols)

    #         placeholder = {}
    #         for val in category:
    #             temp = df[val].astype(float) > threshold
    #             temp = temp.astype(str)
    #             placeholder[val] = dict(temp.value_counts())

    #         count_df = pd.DataFrame.from_dict(placeholder)
    #         count_df = count_df.T
    #         count_df = count_df.reset_index()
    #         placeholder = []
    #         for i in range(len(count_df)):
    #             placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'False'])
    #             placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'True'])

    #         count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
    #         fig = px.bar(count_df, x='category', y='count',
    #                      color='truth_value',
    #                      height=400)
    #         st.write("")
    #         st.plotly_chart(fig)

    #         df['Validation'] = 'No'
    #         df['Val-Mitigation'] = 'No'
    #         df['Val-Adaptation'] = 'No'
    #         df_xlsx = to_excel(df)
    #         st.download_button(label='📥 Download Current Result',
    #                            data=df_xlsx ,
    #                            file_name= 'file_adaptation-mitigation.xlsx')
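The "Document Processing" text shown in this app describes splitting documents into roughly 60-word, sentence-boundary-respecting paragraphs with a Haystack preprocessing pipeline. That pipeline lives in utils/preprocessing.py, which is part of this commit but not expanded in this view. Purely as an illustration of the described settings (not the committed implementation), a minimal sketch using the Haystack 1.x PreProcessor could look like the following; the concrete values normally come from paramconfig.cfg, and the overlap value here is an assumption:

# Illustrative sketch only: the real pipeline is defined in utils/preprocessing.py,
# and the split settings are read from paramconfig.cfg via utils/config.py.
from haystack.nodes import PreProcessor

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    split_by="word",                       # split on word count, as described above
    split_length=60,                       # ~60 words per paragraph (empirical value)
    split_overlap=10,                      # assumed overlap; the actual value sits in paramconfig.cfg
    split_respect_sentence_boundary=True,  # lets the 60-word limit be breached at sentence ends
)

# docs would be the output of a converter (e.g. PDF/text to Haystack Documents):
# paragraphs = preprocessor.process(docs)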
appStore/info.py
ADDED
@@ -0,0 +1,67 @@
import streamlit as st
import os
from PIL import Image
_ROOT = os.path.abspath(os.path.dirname(__file__))
def get_data(path):
    return os.path.join(_ROOT, 'data', path)

def app():

    with open('style.css') as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

    st.markdown("<h2 style='text-align: center; \
                color: black;'> Climate Policy Understanding App</h2>",
                unsafe_allow_html=True)

    st.markdown("<div style='text-align: center; \
                color: grey;'>Climate Policy Understanding App is an open-source\
                digital tool which aims to assist policy analysts and \
                other users in extracting and filtering relevant \
                information from public documents.</div>",
                unsafe_allow_html=True)
    footer = """
        <div class="footer-custom">
            Guidance & Feedback - <a>Nadja Taeger</a> |<a>Marie Hertel</a> | <a>Cecile Schneider</a> |
            Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
            <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |

        </div>
    """
    st.markdown(footer, unsafe_allow_html=True)

    c1, c2, c3 = st.columns([8,1,12])
    with c1:
        image = Image.open('docStore/img/ndc.png')
        st.image(image)
    with c3:
        st.markdown('<div style="text-align: justify;">The manual extraction \
            of relevant information from text documents is a \
            time-consuming task for any policy analysts. As the amount and length of \
            public policy documents in relation to sustainable development (such as \
            National Development Plans and Nationally Determined Contributions) \
            continuously increases, a major challenge for policy action tracking – the \
            evaluation of stated goals and targets and their actual implementation on \
            the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
            Language Processing (NLP) methods can help in shortening and easing this \
            task for policy analysts.</div><br>',
            unsafe_allow_html=True)

    intro = """
        <div style="text-align: justify;">

        For this purpose, IKI Tracs, SV KLIMA, SPA and Data Service Center (Deutsche Gesellschaft für Internationale \
        Zusammenarbeit (GIZ) GmbH) are collaborating since 2022 in the development \
        of an AI-powered open-source web application that helps find and extract \
        relevant information from public policy documents faster to facilitate \
        evidence-based decision-making processes in sustainable development and beyond.

        </div>
        <br>
    """
    st.markdown(intro, unsafe_allow_html=True)
    image2 = Image.open('docStore/img/paris.png')
    st.image(image2)
appStore/multiapp.py
ADDED
@@ -0,0 +1,67 @@
"""Frameworks for running multiple Streamlit applications as a single app.
"""
import streamlit as st
from PIL import Image
from utils.uploadAndExample import add_upload

class MultiApp:
    """Framework for combining multiple streamlit applications.
    Usage:
        def foo():
            st.title("Hello Foo")
        def bar():
            st.title("Hello Bar")
        app = MultiApp()
        app.add_app("Foo", foo)
        app.add_app("Bar", bar)
        app.run()
    It is also possible keep each application in a separate file.
        import foo
        import bar
        app = MultiApp()
        app.add_app("Foo", foo.app)
        app.add_app("Bar", bar.app)
        app.run()
    """
    def __init__(self):
        self.apps = []

    def add_app(self,title,icon, func):
        """Adds a new application.
        Parameters
        ----------
        func:
            the python function to render this app.
        title:
            title of the app. Appears in the dropdown in the sidebar.
        """
        self.apps.append({
            "title": title,
            "icon": icon,
            "function": func
        })

    def run(self):

        st.sidebar.write(format_func=lambda app: app['title'])
        #image = Image.open('docStore/img/dsc_giz.png')
        #st.sidebar.image(image, width =200)

        with st.sidebar:
            selected = st.selectbox("Select the Task to perform", [page["title"] for page in self.apps],)
            st.markdown("---")

        for index, item in enumerate(self.apps):
            if item["title"] == selected:
                self.apps[index]["function"]()
                break

        choice = st.sidebar.radio(label = 'Select the Document',
                                  help = 'You can upload the document \
                                  or else you can try a example document',
                                  options = ('Upload Document', 'Try Example'),
                                  horizontal = True)
        add_upload(choice)
appStore/netzero.py
ADDED
@@ -0,0 +1,206 @@
# set path
import glob, os, sys;
sys.path.append('../utils')

#import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
# from st_aggrid import AgGrid
# from st_aggrid.shared import ColumnsAutoSizeMode
from utils.netzero_classifier import netzero_classification
from utils.netzero_classifier import runNetZeroPreprocessingPipeline, load_netzeroClassifier
# from utils.keyword_extraction import textrank
import logging
logger = logging.getLogger(__name__)
from utils.config import get_classifier_params
from io import BytesIO
import xlsxwriter
import plotly.express as px

# Declare all the necessary variables
classifier_identifier = 'netzero'
params = get_classifier_params(classifier_identifier)

# Labels dictionary ###
_lab_dict = {
    'NEGATIVE':'NO NETZERO TARGET',
    'NETZERO':'NETZERO TARGET',
}

@st.cache_data
def to_excel(df):
    len_df = len(df)
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, index=False, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    worksheet.data_validation('E2:E{}'.format(len_df),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    writer.save()
    processed_data = output.getvalue()
    return processed_data

def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> NetZero Target Extraction </h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The **NetZero Extraction** app is an easy-to-use interface built \
            in Streamlit for analyzing policy documents for \
            Classification of the paragraphs/texts in the document *If it \
            contains any Net-Zero target related information* - \
            developed by GIZ Data Service Center, GFA, IKI Tracs, \
            SV Klima and SPA. \n
            """)
        st.write("""**Document Processing:** The Uploaded/Selected document is \
            automatically cleaned and split into paragraphs with a maximum \
            length of 60 words using a Haystack preprocessing pipeline. The \
            length of 60 is an empirical value which should reflect the length \
            of a “context” and should limit the paragraph length deviation. \
            However, since we want to respect the sentence boundary the limit \
            can breach and hence this limit of 60 is tentative. \n
            """)

        st.write("")

    ### Main app code ###
    with st.container():
        if st.button("RUN NetZero Related Paragraph Extractions"):
            if 'key2' not in st.session_state:
                st.session_state['key2'] = None

            if 'filepath' in st.session_state:
                file_name = st.session_state['filename']
                file_path = st.session_state['filepath']

                # Do the preprocessing of the PDF

                all_documents = runNetZeroPreprocessingPipeline(file_name= file_name,
                                file_path= file_path, split_by= params['split_by'],
                                split_length= params['split_length'],
                                split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
                                split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])

                # st.dataframe(all_documents['documents'])

                # Load the classifier model

                classifier = load_netzeroClassifier(classifier_name=params['model_name'])
                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

                if len(all_documents['documents']) > 100:
                    warning_msg = ": This might take sometime, please sit back and relax."
                else:
                    warning_msg = ""

                # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
                # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):

                df = netzero_classification(haystack_doc=all_documents['documents'],
                                            threshold= params['threshold'])
                st.session_state.key2 = df
                hits = df[df['Target Label'] == 'NETZERO']
                range_val = min(5,len(hits))
                if range_val !=0:
                    count_df = df['Target Label'].value_counts()
                    count_df = count_df.rename('count')
                    count_df = count_df.rename_axis('Target Label').reset_index()
                    count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])

                    fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
                    c1, c2 = st.columns([1,1])
                    with c1:
                        st.plotly_chart(fig,use_container_width= True)

                    hits = hits.sort_values(by=['Relevancy'], ascending=False)
                    st.write("")
                    st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
                    range_val = min(5,len(hits))
                    for i in range(range_val):
                        # the page number reflects the page that contains the main paragraph
                        # according to split limit, the overlapping part can be on a separate page
                        st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
                        st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
                else:
                    st.info("🤔 No Netzero target found")
                df['Validation'] = 'No'
                df_xlsx = to_excel(df)
                st.download_button(label='📥 Download Current Result',
                                   data=df_xlsx ,
                                   file_name= 'file_target.xlsx')

            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")

    # # Creating truth value dataframe
    # if 'key2' in st.session_state:
    #     if st.session_state.key2 is not None:
    #         df = st.session_state.key2
    #         st.markdown("###### Select the threshold for classifier ######")
    #         c1, c2 = st.columns([1,1])

    #         netzero_df = df[df['Target Label'] == 'NETZERO'].reset_index(drop = True)
    #         if len(netzero_df) >0:
    #             with c1:
    #                 threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
    #                                       step=0.01, value=0.5,
    #                                       help = "Keep High Value if want refined result, low if dont want to miss anything" )

    #             # creating the dataframe for value counts of Labels, along with 'title' of Labels
    #             temp = df[df['Relevancy']>threshold]
    #             count_df = temp['Target Label'].value_counts()
    #             count_df = count_df.rename('count')
    #             count_df = count_df.rename_axis('Target Label').reset_index()
    #             count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])

    #             plt.rcParams['font.size'] = 25
    #             colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
    #             # plot
    #             fig, ax = plt.subplots()
    #             ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
    #                    wedgeprops={"linewidth": 1, "edgecolor": "white"},
    #                    textprops={'fontsize': 14},
    #                    frame=False,labels =list(count_df.Label_def),
    #                    labeldistance=1.2)
    #             st.markdown("#### Anything related to NetZero Targets? ####")

    #             c4, c5, c6 = st.columns([1,2,2])

    #             with c5:
    #                 st.pyplot(fig)
    #             with c6:
    #                 st.write(count_df[['Label_def','count']])

    #             st.write("")

    #             st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")

    #             st.dataframe(netzero_df.head())
    #         else:
    #             st.write("🤔 No Results found")

    #         df['Validation'] = 'No'
    #         df_xlsx = to_excel(df)
    #         st.download_button(label='📥 Download Current Result',
    #                            data=df_xlsx ,
    #                            file_name= 'file_netzero.xlsx')
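Each appStore module pulls its splitter and classifier settings through get_classifier_params(classifier_identifier), defined in utils/config.py and backed by paramconfig.cfg; both files are part of this commit but are not expanded in this view. As a rough illustration of that pattern rather than the actual helper (section and key names below are assumptions), reading per-classifier sections with the standard-library configparser might look like this:

# Hypothetical sketch of a config reader; the real helper is utils/config.py
# and the real sections/keys live in paramconfig.cfg (not shown in this view).
import configparser

def get_classifier_params_sketch(section, path="paramconfig.cfg"):
    config = configparser.ConfigParser()
    config.read(path)
    return {
        "model_name": config.get(section, "MODEL"),                        # assumed key name
        "split_by": config.get(section, "SPLIT_BY"),                       # assumed key name
        "split_length": config.getint(section, "SPLIT_LENGTH"),            # assumed key name
        "split_overlap": config.getint(section, "SPLIT_OVERLAP"),          # assumed key name
        "split_respect_sentence_boundary": config.getboolean(
            section, "RESPECT_SENTENCE_BOUNDARY"),                         # assumed key name
        "remove_punc": config.getboolean(section, "REMOVE_PUNC"),          # assumed key name
        "threshold": config.getfloat(section, "THRESHOLD"),                # assumed key name
    }

# e.g. params = get_classifier_params_sketch('netzero')

The dictionary keys mirror the params[...] lookups made by the app modules above; only the config key spellings are guesses.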
appStore/sector.py
ADDED
@@ -0,0 +1,211 @@
# set path
import glob, os, sys;
sys.path.append('../utils')

#import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
# from st_aggrid import AgGrid
# from st_aggrid.shared import ColumnsAutoSizeMode
from utils.sector_classifier import sector_classification
from utils.sector_classifier import runSectorPreprocessingPipeline, load_sectorClassifier
# from utils.keyword_extraction import textrank
import logging
logger = logging.getLogger(__name__)
from utils.config import get_classifier_params
from utils.preprocessing import paraLengthCheck
from io import BytesIO
import xlsxwriter
import plotly.express as px

# Declare all the necessary variables
classifier_identifier = 'sector'
params = get_classifier_params(classifier_identifier)

@st.cache_data
def to_excel(df,sectorlist):
    len_df = len(df)
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, index=False, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    worksheet.data_validation('S2:S{}'.format(len_df),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    worksheet.data_validation('X2:X{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('T2:T{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('U2:U{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('V2:V{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('W2:W{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    writer.save()
    processed_data = output.getvalue()
    return processed_data

def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Sector Classification </h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The **Sector Classification** app is an easy-to-use interface built \
            in Streamlit for analyzing policy documents for \
            Classification of the paragraphs/texts in the document *If it \
            belongs to particular sector or not*. The paragraph can belong to multiple sectors - \
            developed by GIZ Data Service Center, GFA, IKI Tracs, \
            SV Klima and SPA. \n
            """)
        st.write("""**Document Processing:** The Uploaded/Selected document is \
            automatically cleaned and split into paragraphs with a maximum \
            length of 60 words using a Haystack preprocessing pipeline. The \
            length of 60 is an empirical value which should reflect the length \
            of a “context” and should limit the paragraph length deviation. \
            However, since we want to respect the sentence boundary the limit \
            can breach and hence this limit of 60 is tentative. \n
            """)

        st.write("")

    ### Main app code ###
    with st.container():
        if st.button("RUN Sector Classification"):
            if 'key' not in st.session_state:
                st.session_state['key'] = None

            if 'filepath' in st.session_state:
                file_name = st.session_state['filename']
                file_path = st.session_state['filepath']

                all_documents = runSectorPreprocessingPipeline(file_name= file_name,
                                file_path= file_path, split_by= params['split_by'],
                                split_length= params['split_length'],
                                split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
                                split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
                # st.write(all_documents['documents'])
                classifier = load_sectorClassifier(classifier_name=params['model_name'])
                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
                verified_paralist = paraLengthCheck(all_documents['paraList'], 100)
                if len(verified_paralist) > 100:
                    warning_msg = ": This might take sometime, please sit back and relax."
                else:
                    warning_msg = ""

                # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
                # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):

                df = sector_classification(haystack_doc=verified_paralist,
                                           threshold= params['threshold'])
                # st.write(df)
                threshold= params['threshold']
                truth_df = df.drop(['text'],axis=1)
                truth_df = truth_df.astype(float) >= threshold
                truth_df = truth_df.astype(str)
                categories = list(truth_df.columns)

                placeholder = {}
                for val in categories:
                    placeholder[val] = dict(truth_df[val].value_counts())
                count_df = pd.DataFrame.from_dict(placeholder)
                count_df = count_df.T
                count_df = count_df.reset_index()
                # st.write(count_df)
                placeholder = []
                for i in range(len(count_df)):
                    placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
                    placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
                count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
                # st.write("Total Paragraphs: {}".format(len(df)))
                fig = px.bar(count_df, x='category', y='count',
                             color='truth_value')
                # c1, c2 = st.columns([1,1])
                # with c1:
                st.plotly_chart(fig,use_container_width= True)

                truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
                truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
                # st.write(truth_df)
                df = pd.concat([df,truth_df['labels']],axis=1)
                df['Validation'] = 'No'
                df['Sector1'] = 'Blank'
                df['Sector2'] = 'Blank'
                df['Sector3'] = 'Blank'
                df['Sector4'] = 'Blank'
                df['Sector5'] = 'Blank'
                df_xlsx = to_excel(df,categories)
                st.download_button(label='📥 Download Current Result',
                                   data=df_xlsx ,
                                   file_name= 'file_sector.xlsx')
            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")

    # # Creating truth value dataframe
    # if 'key' in st.session_state:
    #     if st.session_state.key is not None:
    #         df = st.session_state.key
    #         st.markdown("###### Select the threshold for classifier ######")
    #         c4, c5 = st.columns([1,1])

    #         with c4:
    #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
    #                                   step=0.01, value=0.5,
    #                                   help = "Keep High Value if want refined result, low if dont want to miss anything" )
    #         sectors =set(df.columns)
    #         removecols = {'Validation','Sector1','Sector2','Sector3','Sector4',
    #                       'Sector5','text'}
    #         sectors = list(sectors - removecols)

    #         placeholder = {}
    #         for val in sectors:
    #             temp = df[val].astype(float) > threshold
    #             temp = temp.astype(str)
    #             placeholder[val] = dict(temp.value_counts())

    #         count_df = pd.DataFrame.from_dict(placeholder)
    #         count_df = count_df.T
    #         count_df = count_df.reset_index()
    #         placeholder = []
    #         for i in range(len(count_df)):
    #             placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'False'])
    #             placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'True'])

    #         count_df = pd.DataFrame(placeholder, columns = ['sector','count','truth_value'])
    #         fig = px.bar(count_df, x='sector', y='count',
    #                      color='truth_value',
    #                      height=400)
    #         st.write("")
    #         st.plotly_chart(fig)

    #         df['Validation'] = 'No'
    #         df['Sector1'] = 'Blank'
    #         df['Sector2'] = 'Blank'
    #         df['Sector3'] = 'Blank'
    #         df['Sector4'] = 'Blank'
    #         df['Sector5'] = 'Blank'
    #         df_xlsx = to_excel(df,sectors)
    #         st.download_button(label='📥 Download Current Result',
    #                            data=df_xlsx ,
    #                            file_name= 'file_sector.xlsx')
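Both sector.py and adapmit.py run the preprocessed paragraphs through paraLengthCheck(all_documents['paraList'], 100) before classification. That helper lives in utils/preprocessing.py, which this view does not expand; as an assumption about its behaviour rather than the committed implementation, a simple length guard that splits anything longer than the given word limit could look like the following sketch (operating on plain strings for simplicity, whereas the real paragraph list may carry page metadata):

# Assumed illustration of a paragraph-length guard; the committed version is
# utils/preprocessing.paraLengthCheck and may differ in detail.
def para_length_check_sketch(paragraphs, max_words=100):
    """Return the paragraphs, splitting any entry longer than max_words into chunks."""
    verified = []
    for para in paragraphs:
        words = para.split()
        if len(words) <= max_words:
            verified.append(para)
        else:
            # break an over-long paragraph into consecutive max_words-sized chunks
            for start in range(0, len(words), max_words):
                verified.append(" ".join(words[start:start + max_words]))
    return verified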
appStore/target.py
ADDED
@@ -0,0 +1,211 @@
# set path
import glob, os, sys;
sys.path.append('../utils')

#import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
# from st_aggrid import AgGrid
# from st_aggrid.shared import ColumnsAutoSizeMode
from utils.target_classifier import target_classification
from utils.target_classifier import runTargetPreprocessingPipeline, load_targetClassifier
# from utils.keyword_extraction import textrank
import logging
logger = logging.getLogger(__name__)
from utils.config import get_classifier_params
from io import BytesIO
import xlsxwriter
import plotly.express as px

# Declare all the necessary variables
classifier_identifier = 'target'
params = get_classifier_params(classifier_identifier)

## Labels dictionary ###
_lab_dict = {
    'LABEL_0':'NO TARGET INFO',
    'LABEL_1':'ECONOMY-WIDE TARGET',
}

@st.cache_data
def to_excel(df):
    len_df = len(df)
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, index=False, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    worksheet.data_validation('E2:E{}'.format(len_df),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    writer.save()
    processed_data = output.getvalue()
    return processed_data

def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Targets Extraction </h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The **Target Extraction** app is an easy-to-use interface built \
            in Streamlit for analyzing policy documents for \
            Classification of the paragraphs/texts in the document *If it \
            contains any Economy-Wide Targets related information* - \
            developed by GIZ Data Service Center, GFA, IKI Tracs, \
            SV Klima and SPA. \n
            """)
        st.write("""**Document Processing:** The Uploaded/Selected document is \
            automatically cleaned and split into paragraphs with a maximum \
            length of 60 words using a Haystack preprocessing pipeline. The \
            length of 60 is an empirical value which should reflect the length \
            of a “context” and should limit the paragraph length deviation. \
            However, since we want to respect the sentence boundary the limit \
            can breach and hence this limit of 60 is tentative. \n
            """)

        st.write("")

    ### Main app code ###
    with st.container():
        if st.button("RUN Target Related Paragraph Extractions"):
            if 'key1' not in st.session_state:
                st.session_state['key1'] = None

            if 'filepath' in st.session_state:
                file_name = st.session_state['filename']
                file_path = st.session_state['filepath']

                all_documents = runTargetPreprocessingPipeline(file_name= file_name,
                                file_path= file_path, split_by= params['split_by'],
                                split_length= params['split_length'],
                                split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
                                split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
                # st.write(all_documents['documents'])

                #load Classifier
                classifier = load_targetClassifier(classifier_name=params['model_name'])
                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
                if len(all_documents['documents']) > 100:
                    warning_msg = ": This might take sometime, please sit back and relax."
                else:
                    warning_msg = ""

                # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
                # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):

                df = target_classification(haystack_doc=all_documents['documents'],
                                           threshold= params['threshold'])
                st.session_state.key1 = df
                # temp = df[df['Relevancy']>threshold]
                hits = df[df['Target Label'] == 'LABEL_1']
                range_val = min(5,len(hits))
                if range_val !=0:
                    count_df = df['Target Label'].value_counts()
                    count_df = count_df.rename('count')
                    count_df = count_df.rename_axis('Target Label').reset_index()
                    count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])

                    fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
                    c1, c2 = st.columns([1,1])
                    with c1:
                        st.plotly_chart(fig,use_container_width= True)

                    hits = hits.sort_values(by=['Relevancy'], ascending=False)
                    st.write("")
                    st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
                    range_val = min(5,len(hits))
                    for i in range(range_val):
                        # the page number reflects the page that contains the main paragraph
                        # according to split limit, the overlapping part can be on a separate page
                        st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
                        st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))

                else:
                    st.info("🤔 No Economy Wide Target found")
                df['Validation'] = 'No'
                df_xlsx = to_excel(df)
                st.download_button(label='📥 Download Current Result',
                                   data=df_xlsx ,
                                   file_name= 'file_target.xlsx')

            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")

    # # Creating truth value dataframe
    # if 'key1' in st.session_state:
    #     if st.session_state.key1 is not None:
    #         df = st.session_state.key1
    #         st.markdown("###### Select the threshold for classifier ######")
    #         c1, c2 = st.columns([1,1])

    #         with c1:
    #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
    #                                   step=0.01, value=0.5,
    #                                   help = "Keep High Value if want refined result, low if dont want to miss anything" )
    #         sectors =set(df.columns)
    #         removecols = {'Validation','Sectors','text'}
    #         sectors = list(sectors - removecols)

    #         # creating the dataframe for value counts of Labels, along with 'title' of Labels
    #         temp = df[df['Relevancy']>threshold]
    #         count_df = temp['Target Label'].value_counts()
    #         count_df = count_df.rename('count')
    #         count_df = count_df.rename_axis('Target Label').reset_index()
    #         count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])

    #         plt.rcParams['font.size'] = 25
    #         colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
    #         # plot
    #         fig, ax = plt.subplots()
    #         ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
    #                wedgeprops={"linewidth": 1, "edgecolor": "white"},
    #                textprops={'fontsize': 14},
    #                frame=False,labels =list(count_df.Label_def),
    #                labeldistance=1.2)
    #         st.markdown("#### Anything related to Targets? ####")

    #         c4, c5, c6 = st.columns([1,2,2])

    #         with c5:
    #             st.pyplot(fig)
    #         with c6:
    #             st.write(count_df[['Label_def','count']])

    #         st.write("")
    #         st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
    #         st.dataframe(df[df['Target Label'] == 'LABEL_1'].reset_index(drop = True))

    #         df['Validation'] = 'No'
    #         df_xlsx = to_excel(df)
    #         st.download_button(label='📥 Download Current Result',
    #                            data=df_xlsx ,
    #                            file_name= 'file_target.xlsx')
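All four to_excel helpers above hard-code the spreadsheet ranges for the review drop-downs ('E2:E…', 'S2:S…', and so on), which silently assumes a fixed column order in the exported dataframe. As a design note rather than part of the commit, the range could instead be derived from the dataframe itself with xlsxwriter's column utilities; the helper below is purely illustrative:

# Illustrative alternative (not part of this commit): derive the validation
# range from the dataframe instead of hard-coding the column letter.
from xlsxwriter.utility import xl_col_to_name

def validation_range(df, column, n_rows):
    col_letter = xl_col_to_name(df.columns.get_loc(column))  # 0-based index -> 'E', 'S', ...
    return '{0}2:{0}{1}'.format(col_letter, n_rows + 1)       # +1 because row 1 holds the header

# e.g. worksheet.data_validation(validation_range(df, 'Validation', len(df)),
#                                {'validate': 'list', 'source': ['No', 'Yes', 'Discard']})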
docStore/img/dsc_giz.png
ADDED
docStore/img/ndc.png
ADDED
docStore/img/paris.png
ADDED
docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt
ADDED
@@ -0,0 +1,737 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Ethiopia 2030: The Pathway to Prosperity
|
2 |
+
Ten Years Perspective Development Plan (2021 � 2030)
|
3 |
+
1. Baselines and Assumptions
|
4 |
+
2. Strategic pillars
|
5 |
+
3. Departures
|
6 |
+
4. Macroeconomic goals
|
7 |
+
5. Implications of the COVID-19 pandemic and necessary mitigation measures
|
8 |
+
6. Potentials/capabilities
|
9 |
+
7. Focus areas
|
10 |
+
7.1. Productive sectors
|
11 |
+
7.2. Services sector
|
12 |
+
7.3. Enabling sectors
|
13 |
+
8. Balanced and competitive development (nationally, regionally and locally)
|
14 |
+
9. Monitoring and Evaluation
|
15 |
+
Content
|
16 |
+
1. Baselines and Assumptions
|
17 |
+
Poverty Reduction (%)
|
18 |
+
Key performances of previous years
|
19 |
+
45.5 44.2
|
20 |
+
38.7
|
21 |
+
29.6
|
22 |
+
23.5
|
23 |
+
19
|
24 |
+
0
|
25 |
+
5
|
26 |
+
10
|
27 |
+
15
|
28 |
+
20
|
29 |
+
25
|
30 |
+
30
|
31 |
+
35
|
32 |
+
40
|
33 |
+
45
|
34 |
+
50
|
35 |
+
1994 2000 2005 2011 2016 2020
|
36 |
+
Percent
|
37 |
+
Year
|
38 |
+
Proportion of people living below poverty line
|
39 |
+
10.5
|
40 |
+
8.8
|
41 |
+
10.1
|
42 |
+
7.7
|
43 |
+
9
|
44 |
+
5.19-6.20
|
45 |
+
0 2 4 6 8 10 12
|
46 |
+
GTP I: 2011-2015
|
47 |
+
GTP II: 2015/16
|
48 |
+
GTP II: 2016/17
|
49 |
+
GTP II: 2017/18
|
50 |
+
GTP II: 2018/19
|
51 |
+
GTP II: 2019/20 (projection, with
|
52 |
+
COVID-19)
|
53 |
+
GDP growth rate (%)
|
54 |
+
1. Baselines and Assumptions
|
55 |
+
Share of economic sectors in GDP (%) Merchandise export as % of GDP
|
56 |
+
8.66
|
57 |
+
7.33
|
58 |
+
6.57
|
59 |
+
5.93
|
60 |
+
4.91
|
61 |
+
3.86 3.56 3.37
|
62 |
+
2.77
|
63 |
+
0
|
64 |
+
1
|
65 |
+
2
|
66 |
+
3
|
67 |
+
4
|
68 |
+
5
|
69 |
+
6
|
70 |
+
7
|
71 |
+
8
|
72 |
+
9
|
73 |
+
10
|
74 |
+
Percent
|
75 |
+
Year
|
76 |
+
46.9
|
77 |
+
45
|
78 |
+
43.5
|
79 |
+
41.4
|
80 |
+
39.5
|
81 |
+
37.1 35.9
|
82 |
+
34.5
|
83 |
+
32.8
|
84 |
+
13.4
|
85 |
+
15
|
86 |
+
17.3
|
87 |
+
18.8
|
88 |
+
21
|
89 |
+
23.5
|
90 |
+
25.7 26.9 27.8
|
91 |
+
4.7 4.8 5 5.3 5.6 6.1 6.9 6.8 6.8
|
92 |
+
7.1
|
93 |
+
8.6
|
94 |
+
10.7 12
|
95 |
+
14.2
|
96 |
+
16.2
|
97 |
+
17.8 19.1 20.1
|
98 |
+
39.8 40.1 39.2 39.8 39.4 38.4 38.6 39.4
|
99 |
+
0
|
100 |
+
5
|
101 |
+
10
|
102 |
+
15
|
103 |
+
20
|
104 |
+
25
|
105 |
+
30
|
106 |
+
35
|
107 |
+
40
|
108 |
+
45
|
109 |
+
50
|
110 |
+
2010/11 2011/12 2012/13 2013/14 2014/15 2015/16 2016/17 2017/18 2018/19
|
111 |
+
Percent
|
112 |
+
Agriculture Industry Manufacturing Construction Services
|
113 |
+
1. Baselines and Assumptions
|
114 |
+
[Chart] Labour force participation by sector (2013): agriculture 73%, industry 7%, services 20%.
|
[Chart] Urban labour force participation by sector (2013): agriculture 7%, industry 22%, services 71%.
|
128 |
+
1. Baselines and Assumptions
|
129 |
+
High and increasing Unemployment Rate
|
• Urban unemployment rate = 19.1% in 2018
|
• Youth unemployment rate = 25.3% (male = 18.6%, female = 30.9%)
|
• Rural unemployment rate = 2% in 2013
|
• Declining per capita rural land creating disguised unemployment
|
[Chart] Number of unemployed people in urban areas by sex and age group (15-19, 20-24, 25-29 yr.), 2014 and 2018; labelled values: 402,869 and 471,535.
|
143 |
+
1. Baselines and Assumptions
|
144 |
+
Challenges
|
145 |
+
1. Macroeconomic imbalances
|
146 |
+
?Sustained high inflation
|
147 |
+
?High and rising unemployment especially
|
148 |
+
in urban areas
|
149 |
+
?High and rising debt burden
|
150 |
+
?Chronic foreign currency shortage
|
151 |
+
?Sluggish (though encouraging) rate of
|
152 |
+
structural change
|
153 |
+
2. Vulnerability to shocks (COVID-19, Climate
|
154 |
+
changes, Desert Locust infestation, etc)
|
155 |
+
3. Poor quality and high inequity in
|
156 |
+
infrastructure projects
|
157 |
+
4. Poor quality services in health and
|
158 |
+
education
|
159 |
+
� High repetition and dropout rates from school
|
160 |
+
1. Baselines and Assumptions
|
161 |
+
� Poor quality of growth and slow
|
162 |
+
structural change
|
163 |
+
� Excessive aid and loan
|
164 |
+
dependence for financing
|
165 |
+
infrastructural and construction
|
166 |
+
investments
|
167 |
+
� Limited success in expanding
|
168 |
+
manufacturing and modern
|
169 |
+
agriculture which have high job
|
170 |
+
creation potentials
|
171 |
+
� Weak institutional capacity as
|
172 |
+
the main culprit of all failures
|
173 |
+
? Provision of quality services
|
174 |
+
(electricity, water, telephone,
|
175 |
+
internet)
|
176 |
+
? Creation of enough jobs and
|
177 |
+
improved living standards
|
178 |
+
? Generation of reliable foreign
|
179 |
+
exchange revenue and debt-sustainable
|
180 |
+
national economic
|
181 |
+
capacity
|
182 |
+
? Completion of development
|
183 |
+
projects and investment plans
|
184 |
+
under public-private
|
185 |
+
partnerships
|
186 |
+
� Low reward for merit, productivity and effort
|
187 |
+
while low disincentive for laziness, wastefulness
|
188 |
+
and corruption
|
189 |
+
� Slow institutional change and transformation in:
|
190 |
+
? Government policies
|
191 |
+
? Investor attitude
|
192 |
+
? Youth behaviour
|
193 |
+
? Role of the intellectuals
|
194 |
+
� The need for sustained increase in production
|
195 |
+
and productivity
|
196 |
+
� The need to set a common national vision to
|
197 |
+
achieve major successes with consensus and
|
198 |
+
popular legitimacy
|
199 |
+
Major areas of failure in the economy
|
240 |
+
2. Departures
|
241 |
+
1. Emphasis on quality of economic growth
|
242 |
+
2. Participation and coordination of sectors in the planning process
|
243 |
+
3. Sectoral linkages and multi-sectoral development focus
|
244 |
+
4. Preparation of national development corridors based on development potentials
|
245 |
+
5. Focus on solving institutional bottlenecks
|
246 |
+
6. The ongoing home grown economic reform programme as a sprinting board
|
247 |
+
7. Emphasis on resilience building, innovation and entrepreneurship
|
248 |
+
3. Strategic pillars
|
249 |
+
1. Ensure quality growth
|
250 |
+
2. Improve productivity and competitiveness
|
251 |
+
3. Undertake institutional transformation
|
252 |
+
4. Ensure private sector's leadership in the economy
|
253 |
+
5. Ensure equitable participation of women and children
|
254 |
+
6. Build climate resilient green economy
|
255 |
+
3. Strategic pillars
|
256 |
+
� Increasing export revenues and substituting imports by
|
257 |
+
reducing production costs
|
258 |
+
� Availing quality and massive infrastructure
|
259 |
+
? Linking infrastructural development with development corridors
|
260 |
+
� Producing required human resources with quality
|
261 |
+
� Producing enough and quality human resources
|
262 |
+
� Prioritizing innovative production systems
|
263 |
+
� Linking incentives with export revenue and job creation
|
264 |
+
performances
|
265 |
+
� Modernizing and enhancing the logistic system
|
266 |
+
� Creating technological competences needed for long-term
|
267 |
+
growth
|
268 |
+
� The economic growth should ensure:
|
269 |
+
? Participation of all citizens and equitable utilization of the
|
270 |
+
growth proceeds
|
271 |
+
? Improved standard of living of every citizen
|
272 |
+
? Reduced poverty in all indicators
|
273 |
+
? Reduced inflation and unemployment
|
274 |
+
� The economic growth should lead to increased
|
275 |
+
aggregate supply
|
276 |
+
� Focus on modern agriculture, manufacturing and
|
277 |
+
mining
|
278 |
+
� Emphasis on exploiting the sources of growth through
|
279 |
+
structural change
|
280 |
+
1. Ensuring quality economic growth 2. Raising production and productivity
|
281 |
+
3. Strategic pillars
|
282 |
+
� Build democratic and judicial institutions that ensure elite bargain,
|
283 |
+
national consensus, common vision and government legitimacy
|
284 |
+
� Build private sector and competition friendly bureaucracy
|
285 |
+
� Coordinate with parents, the society and teachers to make
|
286 |
+
educational institutions centers of excellence and virtuous citizens
|
287 |
+
� Coordinate with parents as well as social and religious leaders to
|
288 |
+
encourage religious institutions and their teachings contribute
|
289 |
+
towards poverty reduction efforts
|
290 |
+
� Prepare policies, strategies and legal frameworks for achieving
|
291 |
+
prosperity
|
292 |
+
� Increased focus on innovation and research
|
293 |
+
� Creating strong social security system
|
294 |
+
3. Institutional Transformation 4. Private sector's leadership in the economy
|
295 |
+
� Create conducive investment climate and incentivize
|
296 |
+
domestic investors in key sectors
|
297 |
+
� Build strong and market-led public-private partnerships in
|
298 |
+
order to ensure the establishment of inclusive and
|
299 |
+
pragmatic market economy
|
300 |
+
� Enhance access and quality of infrastructure to attract
|
301 |
+
quality foreign direct investment
|
302 |
+
� Identify new sources of growth, empower and stimulate
|
303 |
+
the private sector, and supplement the private sector in
|
304 |
+
strategic areas
|
305 |
+
� Emphasis for public-private partnership on problem
|
306 |
+
solving innovations and research activities
|
307 |
+
3. Strategic pillars
|
308 |
+
� Ensure gender equity in economic and social
|
309 |
+
sectors
|
310 |
+
? Participation of women at all levels of education
|
311 |
+
? Asset ownership of women
|
312 |
+
� Ensure fair participation of women and youth in
|
313 |
+
leadership and decision making positions
|
314 |
+
� Create awareness among citizens about the role of
|
315 |
+
women and youth in the country's overall
|
316 |
+
development
|
317 |
+
� Increase basin development efforts to fight land
|
318 |
+
degradation and to reduce pollutions
|
319 |
+
� Improve productivity and reduce GHG emissions
|
320 |
+
� Increase forest protection and development
|
321 |
+
� Increase production of electricity from renewable
|
322 |
+
sources for domestic use and for export
|
323 |
+
� Focus on modern and energy saving technologies
|
324 |
+
5. Equitable participation of women and children 6. Climate resilient green economy
|
325 |
+
4. Macroeconomic Goals
|
326 |
+
Assumptions
|
327 |
+
? Requirement to significantly reduce
|
328 |
+
poverty
|
329 |
+
? Available national potentials
|
330 |
+
? Potential for investment in the economy
|
331 |
+
? Existing potentials in each sector
|
332 |
+
? Low productivity that needs to be
|
333 |
+
improved
|
334 |
+
� Make Ethiopia a middle income
|
335 |
+
economy by 2022
|
336 |
+
� Raise per capita income to USD 1,115
|
337 |
+
in 2022
|
338 |
+
? Threshold for middle-income is USD 1,026
|
339 |
+
? Plus human development index and
|
340 |
+
economic vulnerability index
|
341 |
+
� Raise per capita income to USD 2,220
|
342 |
+
by 2030
|
343 |
+
Sectoral growth Targets (2021-2030)
|
344 |
+
Assured middle-income potential
|
[Chart] Average growth target: 10.2% (axis: percentage of population below poverty line)
|
350 |
+
4. Macroeconomic Goals
|
351 |
+
Structural change
|
352 |
+
Financing Gaps
|
353 |
+
Reduce urban unemployment to less than 9%
|
354 |
+
?1.36 million new jobs need to be
|
355 |
+
created per annum
|
356 |
+
Sectoral composition of GDP (%) and labour force participation
|
Economic Sectors: performance 2011, 2015, 2018/19 and target 2030
|
Agriculture: 2011 = 45; 2015 = 39.7; 2018/19 = 32.8; 2030 target = 22.0
|
Industry: 2011 = 15.1; 2015 = 21.2; 2018/19 = 27.6; 2030 target = 35.9
|
Manufacturing: 2011 = 4.7; 2015 = 5.5; 2018/19 = 6.8; 2030 target = 17.2
|
Services: 2011 = 39.9; 2015 = 39; 2018/19 = 39.4; 2030 target = 42.1
|
365 |
+
5. Implications of the COVID-19 pandemic and necessary mitigation measures
|
366 |
+
� GDP growth for 2019/20 fiscal year is projected to be lower than its target of 9.0% by between 2.81
|
367 |
+
and 3.80 percentage points (equivalent to 58.3 - 78.8 billion birr) due to COVID-19 pandemic
|
368 |
+
� If the current scenario continues, next year's GDP growth could decline by 2.8 percentage points
|
369 |
+
� Returning the economy to its high growth trajectory requires focusing on sectors with high
|
370 |
+
productivity and job creation potentials
|
371 |
+
� Public investment should focus on empowering the private sector
|
372 |
+
� Promoting both domestic and foreign investments with the right set of incentives (merit based)
|
373 |
+
� Modernizing production systems and improving uptake of technology
|
374 |
+
� Conducting demand analysis for export commodities to remedy for the declining trend in exports
|
375 |
+
and foreign exchange earnings.
|
376 |
+
6. Potentials
|
377 |
+
� Endowment of various natural resources contributing to the growth potential
|
378 |
+
� Huge unutilized arable land creates great potential for the success of the plan
|
379 |
+
� Endowment of gemstones, ornamental, energy, metals, and metallic minerals
|
380 |
+
� Gold, coal, iron ore, potash, tantalum, marble, petroleum and other natural resources
|
381 |
+
Natural
|
382 |
+
Resources
|
383 |
+
� Large youth population and potential for demographic dividend
|
384 |
+
� Cumulative capacity in education and health
|
385 |
+
� Positive attitude and noble culture of reaching agreement among citizens
|
386 |
+
Human
|
387 |
+
capital
|
388 |
+
6. Potentials
|
389 |
+
Built physical and material capitals
|
390 |
+
?Transport and communication
|
391 |
+
? Irrigation infrastructures for modern agriculture
|
392 |
+
?Industrial Parks
|
393 |
+
?Mega energy infrastructures
|
394 |
+
Physical
|
395 |
+
capital
|
396 |
+
Unexploited
|
397 |
+
growth
|
398 |
+
potentials
|
399 |
+
� Utilizing the tourism potential through modernization
|
400 |
+
� Using the mining subsector as a source of input as well as a competitive industry in its
|
401 |
+
own right
|
402 |
+
6. Potentials
|
403 |
+
� Solving supply side bottlenecks to satisfy the existing demand
|
404 |
+
� Improving international acceptance and reliable partnerships
|
405 |
+
? The 'medemer'/synergy philosophy
|
406 |
+
? The ongoing political reform measures
|
407 |
+
? The Homegrown Economic Reform programme
|
408 |
+
� Increased finance from partners and multilateral institutions
|
409 |
+
? Increased availability of foreign exchange
|
410 |
+
? Reduced debt stress for the short to medium term
|
411 |
+
? Increased potential for development
|
412 |
+
Increased
|
413 |
+
demand as
|
414 |
+
potential
|
415 |
+
Political Capital
|
416 |
+
Continental
|
417 |
+
and regional
|
418 |
+
integrations
|
419 |
+
� Regional and continental economic integration agreements
|
420 |
+
� International and continental free trade agreements
|
421 |
+
6. Potentials
|
422 |
+
Low
|
423 |
+
technology as
|
424 |
+
a potential
|
425 |
+
� Undeniably low status of technological development
|
426 |
+
� International mobility and spillover effect of technology
|
427 |
+
� Potential for development and catching up by filling the technological gaps
|
428 |
+
� Doubling crop productivity from the current 24-36 quintals per hectare will result
|
429 |
+
in 7% increase in crop production
|
430 |
+
� Raise the production efficiency of manufacturing from the current 50% to 80%
|
431 |
+
7. Focus Areas
|
432 |
+
7.1. Productive sectors: agriculture, manufacturing, mining
|
433 |
+
7.2. Service sector: tourism
|
434 |
+
7.3. Enabling sectors: energy, transport, sustainable finance,
|
435 |
+
innovation and technology, urban development, irrigation,
|
436 |
+
human capital development
|
437 |
+
7.1. Productive sectors
|
438 |
+
Agriculture Objectives
|
439 |
+
1. Free agriculture from rain dependence
|
440 |
+
2. Agricultural mechanization services
|
441 |
+
3. Contract farming, cluster approach and
|
442 |
+
land consolidation
|
443 |
+
4. Livestock, animal feed and animal health
|
444 |
+
5. Horticulture (irrigation and urban farming)
|
445 |
+
6. Private sector participation
|
446 |
+
7. Institutional implementation capacity
|
447 |
+
8. Climate resilient sustainable agricultural
|
448 |
+
development
|
449 |
+
1. Improve income and livelihood options for farming and pastoral
|
450 |
+
communities through increased productivity and competitiveness
|
451 |
+
2. Modernize agriculture and ensure national food and nutrition security
|
452 |
+
3. Raise export of agricultural output and substitute imports
|
453 |
+
4. Make agriculture a viable and profitable enterprise through value addition
|
454 |
+
5. Create rural employment opportunities
|
455 |
+
6. Enhance livestock health access and quality
|
456 |
+
7. Preserve animal genetic resources and increase pastoral research
|
457 |
+
8. Improve the development of animal feed and access to markets
|
458 |
+
9. Develop livestock specific extension package for each livestock type
|
459 |
+
Focus Areas
|
460 |
+
7.1. Productive sector
|
461 |
+
Manufacturing Industry
|
462 |
+
Objectives
|
463 |
+
1. Production of quality and competitive food, textile, housing and
|
464 |
+
pharmaceutical products for export and domestic markets
|
465 |
+
2. Production and productivity of existing manufacturing industries
|
466 |
+
3. Utilization of locally available inputs
|
467 |
+
4. Value chains, linkages and interdependencies
|
468 |
+
5. Linkages between large scale metallurgical and engineering,
|
469 |
+
chemical and pharmaceutical industries with other industries
|
470 |
+
6. Job creation, cluster approaches and expanding small and medium
|
471 |
+
scale manufacturing
|
472 |
+
7. Private sector participation and partnership
|
473 |
+
1. Establish basis for domestic industrialization
|
474 |
+
2. Value addition through enhanced inter-sectoral
|
475 |
+
linkages
|
476 |
+
3. Enhance productivity through private sector
|
477 |
+
leadership and supportive role of the
|
478 |
+
government
|
479 |
+
? Create job opportunities for the youth leaving
|
480 |
+
agriculture and concentrating in urban areas
|
481 |
+
? Make exportable commodities internationally
|
482 |
+
competitive
|
483 |
+
? Ensure structural change
|
484 |
+
Focus areas
|
485 |
+
7.1. Productive sectors
|
486 |
+
Mining
|
487 |
+
Objectives
|
488 |
+
� Foreign exchange earning and
|
489 |
+
domestic revenues
|
490 |
+
� Increased investment in mining
|
491 |
+
� Participation of manufacturing
|
492 |
+
industries that add value
|
493 |
+
� Job creation
|
494 |
+
� Add value for improved contribution of the subsector
|
495 |
+
� Increase inter-sectoral linkages to raise raw material inputs to other
|
496 |
+
sectors
|
497 |
+
� Make mining a competent subsector and induce structural change
|
498 |
+
� Increase human resource and technological capabilities through
|
499 |
+
research and trainings
|
500 |
+
� Raise foreign exchange revenue from mining through increased
|
501 |
+
exploration and production
|
502 |
+
� Improve traditional mining production and marketing systems
|
503 |
+
� Improve the country's geological information
|
504 |
+
Focus areas
|
505 |
+
7.2. Service sector
|
506 |
+
Tourism
|
507 |
+
Objectives
|
508 |
+
� Identification and developing destinations
|
509 |
+
� Infrastructure
|
510 |
+
� Competitiveness
|
511 |
+
?improve existing destinations
|
512 |
+
?develop new destinations
|
513 |
+
? diversify service and raise quality
|
514 |
+
� Market linkages, branding, and promotion
|
515 |
+
� Technology, research and development
|
516 |
+
� Preservation, maintenance and proper
|
517 |
+
utilization of heritage resources
|
518 |
+
� Expand job opportunities
|
519 |
+
� Raise incomes
|
520 |
+
� Build information management
|
521 |
+
systems
|
522 |
+
� Increase implementation capacity
|
523 |
+
Focus areas
|
524 |
+
7.3. Enabling sectors
|
525 |
+
Urban development
|
526 |
+
Objectives
|
527 |
+
? Prioritize productive sectors in job creation and enterprise
|
528 |
+
development plans
|
529 |
+
? Rapid development and equity goals in land provision system
|
530 |
+
? Participation of indigenous people in land redevelopment and
|
531 |
+
expansion
|
532 |
+
? Urban land registration and cadaster system, modern
|
533 |
+
property valuation
|
534 |
+
? Greenery and public spaces as well as waste disposal and
|
535 |
+
management in urban planning and implementation
|
536 |
+
? Housing development and financing options to reduce
|
537 |
+
housing shortages
|
538 |
+
? Integrated infrastructure and services provision
|
539 |
+
? Role of private sector in infrastructure development and
|
540 |
+
service provision
|
541 |
+
� Expand micro and small-scale
|
542 |
+
enterprises to reduce urban
|
543 |
+
unemployment
|
544 |
+
� Develop and avail urban land based on
|
545 |
+
demand, equity and cost effectiveness
|
546 |
+
� Make quality housing accessible both in
|
547 |
+
rural and urban areas
|
548 |
+
� Develop quality and integrated
|
549 |
+
infrastructure as well as service
|
550 |
+
provision in towns
|
551 |
+
� Improve financial management and
|
552 |
+
resource utilization in urban areas
|
553 |
+
Focus areas
|
554 |
+
7.3. Enabling sectors
|
555 |
+
Innovation and Technology
|
556 |
+
Objectives
|
557 |
+
? Access to innovation and
|
558 |
+
technological information
|
559 |
+
? Developing a digital economy
|
560 |
+
? Productivity enhancement and
|
561 |
+
competitiveness
|
562 |
+
? Build a digital economy
|
563 |
+
? Develop national scientific research and technological
|
564 |
+
capabilities
|
565 |
+
? Support problem solving research and development of
|
566 |
+
technologies necessary for raising production,
|
567 |
+
productivity and service provision
|
568 |
+
? Create jobs and capital that are based on technology
|
569 |
+
? Develop technological and data security protection
|
570 |
+
systems
|
571 |
+
Focus areas
|
572 |
+
7.3. Enabling sectors
|
573 |
+
Sustainable finance
|
574 |
+
Objectives
|
575 |
+
� Access to modern finance and saving culture in rural
|
576 |
+
areas
|
577 |
+
� Support to the private sector and corporations to
|
578 |
+
reinvest profits in productive sectors
|
579 |
+
� Role of private financial institutions in manufacturing
|
580 |
+
and agriculture
|
581 |
+
� Digital revenue collection system
|
582 |
+
� Tax equity (contraband, tax evasion, and bringing the
|
583 |
+
underground economy to the tax system)
|
584 |
+
� Domestic and foreign strategic partnerships
|
585 |
+
� Transform financing from short term to long-term,
|
586 |
+
sustainable and quality sources
|
587 |
+
� Ensure financing quality based on sectoral prioritization
|
588 |
+
and reduction of wastage
|
589 |
+
� Increase the number of domestic saving institutions both
|
590 |
+
in rural and urban areas
|
591 |
+
� Support domestic finance with foreign exchange capacity
|
592 |
+
and foreign direct investment
|
593 |
+
� Modernize domestic revenue collection system
|
594 |
+
� Raise voluntary tax payment attitude
|
595 |
+
� Bring the informal sector to the formal tax system
|
596 |
+
Focus areas
|
597 |
+
7.3. Enabling sectors
|
598 |
+
Transport
|
599 |
+
Objectives
|
600 |
+
� Access to infrastructure
|
601 |
+
� Implementation capacity
|
602 |
+
� Participation of the private sector and the general
|
603 |
+
public
|
604 |
+
� Financing capacity
|
605 |
+
� Ensure equitable access to transport infrastructure and
|
606 |
+
services
|
607 |
+
� Improve transport safety
|
608 |
+
� Make logistics services fast and reliable
|
609 |
+
� Build transport infrastructure and service that is
|
610 |
+
resilient to climate change
|
611 |
+
Focus areas
|
612 |
+
7.3. Enabling sectors
|
613 |
+
Energy
|
614 |
+
Objectives
|
615 |
+
? Equity in access to electricity services
|
616 |
+
? Energy access and quality
|
617 |
+
? Alternative sources of energy
|
618 |
+
? Reliability of electricity infrastructure
|
619 |
+
? Investment and income in energy subsector
|
620 |
+
� Ensure equitable access to transport
|
621 |
+
infrastructure and services
|
622 |
+
� Improve transport safety
|
623 |
+
� Make logistics services fast and reliable
|
624 |
+
� Build transport infrastructure and service that is
|
625 |
+
resilient to climate change
|
626 |
+
Focus areas
|
627 |
+
7.3. Enabling sectors
|
628 |
+
Irrigation
|
629 |
+
Objectives
|
630 |
+
? Medium and large scale irrigation infrastructure
|
631 |
+
? Job creation
|
632 |
+
? Share of government expenditure and alternative
|
633 |
+
financing options
|
634 |
+
? Institutional capacity and human resource
|
635 |
+
development
|
636 |
+
? Improve agricultural output and productivity
|
637 |
+
? Reduce government spending and enhance
|
638 |
+
institutional capacity and human resources
|
639 |
+
development
|
640 |
+
? Ensure the inclusion of all genders and
|
641 |
+
disabled citizens
|
642 |
+
? Develop alternative financing options for
|
643 |
+
irrigation development
|
644 |
+
Focus areas
|
645 |
+
7.3. Enabling sectors
|
646 |
+
Human capital development
|
647 |
+
Objectives
|
648 |
+
� Make education and training inclusive and equitable by
|
649 |
+
harmonizing the system with ability, need and capacity
|
650 |
+
� Develop capacity of educational institutions (teacher capacity,
|
651 |
+
inputs and technology)
|
652 |
+
� Establish education and training quality assurance system
|
653 |
+
� Avail free and compulsory education for pre-primary to junior
|
654 |
+
secondary levels and free education at the senior secondary levels
|
655 |
+
equitably
|
656 |
+
� Ensure the relevance of education and training system and
|
657 |
+
synchronize education policy with economic and social
|
658 |
+
development needs
|
659 |
+
� Make the education and training policy compatible with the
|
660 |
+
nation's contemporary capacities as well as global and regional
|
661 |
+
market opportunities
|
662 |
+
� Enhance commitment, capability and responsibility of citizens
|
663 |
+
? Ensure equitable and quality health services
|
664 |
+
? Raise average life expectancy
|
665 |
+
? Achieve universal health coverage through
|
666 |
+
proactive and prevention health system
|
667 |
+
? Curtail preventable maternal and child deaths
|
668 |
+
? Reduce incidences of contagious and noncontagious
|
669 |
+
related diseases and deaths
|
670 |
+
? Build capacity for health tourism through
|
671 |
+
increased treatment capabilities
|
672 |
+
? Create a healthy society that is free from
|
673 |
+
addictions and use technology for supporting
|
674 |
+
knowledge led economic development
|
675 |
+
Focus areas
|
676 |
+
8. Nationally, regionally and locally balanced and competitive development
|
677 |
+
1. Lack of synchronization of investment with
|
678 |
+
resource potentials and development needs
|
679 |
+
2. Poor alignment of federal, regional and
|
680 |
+
district level investment plans with the
|
681 |
+
national development goals and envisioned
|
682 |
+
settlement patterns
|
683 |
+
3. Poor regional coordination due to low
|
684 |
+
consideration for trans-regional and
|
685 |
+
spatial issues in development plans of
|
686 |
+
regional states
|
687 |
+
4. Inter-regional and intra-regional
|
688 |
+
disparities in infrastructural development
|
689 |
+
and access to services
|
690 |
+
Challenges
|
691 |
+
8. Nationally, regionally and locally balanced and competitive development
|
692 |
+
1. Ensure that the investment flow and
|
693 |
+
infrastructural development plans fairly go hand in
|
694 |
+
hand with resource potential and development
|
695 |
+
needs
|
696 |
+
?Developing underutilized natural resources
|
697 |
+
?Equitable distribution and access to
|
698 |
+
infrastructure
|
699 |
+
?Sustainable environmental protection
|
700 |
+
2. Ensure the inclusion of pastoral and agro-pastoral
|
701 |
+
areas in the development
|
702 |
+
?Focused infrastructural development in pastoral
|
703 |
+
areas such as education and health sector input
|
704 |
+
provision as well as governance
|
705 |
+
?Market linkages with other areas and the central
|
706 |
+
markets
|
707 |
+
?Improve rural finance (credit and insurance) to
|
708 |
+
encourage fattening, milk processing, leather
|
709 |
+
production and irrigation agriculture
|
710 |
+
Focus areas
|
711 |
+
9. Monitoring and Evaluation
|
712 |
+
10 Years Perspective
|
713 |
+
Plan KPIs
|
714 |
+
Federal Implementing
|
715 |
+
Institutions
|
716 |
+
Planning and
|
717 |
+
Development Commission
|
718 |
+
Generate Data (Census,
|
719 |
+
Sample and administrative
|
720 |
+
data)
|
721 |
+
Annual Reports
|
722 |
+
Dialogue forums
|
723 |
+
(Civic Organizations, professional
|
724 |
+
associations, development partners,
|
725 |
+
intellectuals)
|
726 |
+
Central Statistical Agency
|
727 |
+
Database
|
728 |
+
National
|
729 |
+
Information Portal
|
730 |
+
National Statistics
|
731 |
+
Development Strategic
|
732 |
+
plan
|
733 |
+
Evaluation Reports
|
734 |
+
Prime Minister's Office
|
735 |
+
House of People's
|
736 |
+
Representatives
|
737 |
+
Thank you!
|
docStore/sample/Seychelles-revised_first_ndc-EN.pdf
ADDED
Binary file (372 kB). View file
|
|
docStore/sample/South Africa_s Low Emission Development Strategy.pdf
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd18bff36fff79b97c5a343912f1296ea2d9d5481cf92c2887774fb4f2800418
|
3 |
+
size 1503168
|
docStore/sample/files.json
ADDED
@@ -0,0 +1,4 @@
1 |
+
{"Ethiopia: 10 Year Development Plan":"docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt",
|
2 |
+
"Seychells:Revised NDC":"docStore/sample/Seychelles-revised_first_ndc-EN.pdf",
|
3 |
+
"South Africa:Low Emission strategy":"docStore/sample/South Africa_s Low Emission Development Strategy.pdf"
|
4 |
+
}
|
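A minimal usage sketch (not part of the repo, names are illustrative): the mapping above can be loaded with the standard json module to populate an example-document picker, with the keys as display labels and the values as file paths.

import json

# keys are display labels, values are file paths inside docStore/sample
with open('docStore/sample/files.json') as f:
    samples = json.load(f)

print(list(samples.keys()))  # e.g. ['Ethiopia: 10 Year Development Plan', ...]
|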
packages.txt
ADDED
@@ -0,0 +1,4 @@
1 |
+
poppler-utils
|
2 |
+
xpdf
|
3 |
+
tesseract-ocr
|
4 |
+
libtesseract-dev
|
paramconfig.cfg
ADDED
@@ -0,0 +1,39 @@
1 |
+
[target]
|
2 |
+
THRESHOLD = 0.50
|
3 |
+
MODEL = mtyrrell/ikitracs_economywide
|
4 |
+
SPLIT_BY = word
|
5 |
+
REMOVE_PUNC = 0
|
6 |
+
SPLIT_LENGTH = 60
|
7 |
+
SPLIT_OVERLAP = 10
|
8 |
+
RESPECT_SENTENCE_BOUNDARY = 1
|
9 |
+
TOP_KEY = 10
|
10 |
+
|
11 |
+
[netzero]
|
12 |
+
THRESHOLD = 0.50
|
13 |
+
MODEL = ilaria-oneofftech/ikitracks_netzero
|
14 |
+
SPLIT_BY = word
|
15 |
+
REMOVE_PUNC = 0
|
16 |
+
SPLIT_LENGTH = 60
|
17 |
+
SPLIT_OVERLAP = 10
|
18 |
+
RESPECT_SENTENCE_BOUNDARY = 1
|
19 |
+
TOP_KEY = 10
|
20 |
+
|
21 |
+
[sector]
|
22 |
+
THRESHOLD = 0.50
|
23 |
+
MODEL = ppsingh/bert-multilabel-sector-classifier
|
24 |
+
SPLIT_BY = word
|
25 |
+
REMOVE_PUNC = 0
|
26 |
+
SPLIT_LENGTH = 60
|
27 |
+
SPLIT_OVERLAP = 10
|
28 |
+
RESPECT_SENTENCE_BOUNDARY = 1
|
29 |
+
TOP_KEY = 10
|
30 |
+
|
31 |
+
[adapmit]
|
32 |
+
THRESHOLD = 0.50
|
33 |
+
MODEL = ppsingh/mpnet-adaptation_mitigation-classifier
|
34 |
+
SPLIT_BY = word
|
35 |
+
REMOVE_PUNC = 0
|
36 |
+
SPLIT_LENGTH = 60
|
37 |
+
SPLIT_OVERLAP = 10
|
38 |
+
RESPECT_SENTENCE_BOUNDARY = 1
|
39 |
+
TOP_KEY = 10
|
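For reference, a minimal sketch of reading one of these sections with the standard-library configparser, which is the same mechanism the getconfig/get_classifier_params helpers in utils/config.py below wrap; the commented values come from the [target] section above, and the script is assumed to run from the repository root so that paramconfig.cfg resolves.

import configparser

config = configparser.ConfigParser()
config.read('paramconfig.cfg')

model = config.get('target', 'MODEL')                      # 'mtyrrell/ikitracs_economywide'
split_length = int(config.get('target', 'SPLIT_LENGTH'))   # 60
threshold = float(config.get('target', 'THRESHOLD'))       # 0.5
|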
requirements.txt
ADDED
@@ -0,0 +1,19 @@
1 |
+
farm-haystack == 1.16
|
2 |
+
farm-haystack[ocr,pdf]==1.16.0
|
3 |
+
spacy==3.2.0
|
4 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
|
5 |
+
matplotlib==3.5.1
|
6 |
+
nltk==3.7
|
7 |
+
numpy==1.22.1
|
8 |
+
pandas==1.4.0
|
9 |
+
pdfplumber==0.6.2
|
10 |
+
Pillow==9.1.1
|
11 |
+
seaborn==0.11.2
|
12 |
+
transformers==4.25.1
|
13 |
+
st-annotated-text==3.0.0
|
14 |
+
markdown==3.4.1
|
15 |
+
summa==1.2.0
|
16 |
+
plotly
|
17 |
+
xlsxwriter
|
18 |
+
streamlit-aggrid
|
19 |
+
python-docx
|
style.css
ADDED
@@ -0,0 +1,180 @@
1 |
+
|
2 |
+
.row-widget.stTextInput > div:first-of-type {
|
3 |
+
background: #fff;
|
4 |
+
display: flex;
|
5 |
+
border: 1px solid #dfe1e5;
|
6 |
+
box-shadow: none;
|
7 |
+
border-radius: 24px;
|
8 |
+
height: 50px;
|
9 |
+
width: auto;
|
10 |
+
margin: 10px auto 30px;
|
11 |
+
}
|
12 |
+
|
13 |
+
.row-widget.stTextInput > div:first-of-type:hover,
|
14 |
+
.row-widget.stTextInput > div:first-of-type:focus {
|
15 |
+
box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
|
16 |
+
}
|
17 |
+
|
18 |
+
.row-widget.stTextInput .st-bq {
|
19 |
+
background-color: #fff;
|
20 |
+
}
|
21 |
+
|
22 |
+
.row-widget.stTextInput > label {
|
23 |
+
color: #b3b3b3;
|
24 |
+
}
|
25 |
+
|
26 |
+
.row-widget.stButton > button {
|
27 |
+
border-radius: 24px;
|
28 |
+
background-color: #B6C9B1;
|
29 |
+
color: #fff;
|
30 |
+
border: none;
|
31 |
+
padding: 6px 20px;
|
32 |
+
float: right;
|
33 |
+
background-image: none;
|
34 |
+
}
|
35 |
+
|
36 |
+
.row-widget.stButton > button:hover {
|
37 |
+
box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
|
38 |
+
}
|
39 |
+
|
40 |
+
.row-widget.stButton > button:focus {
|
41 |
+
border: none;
|
42 |
+
color: #fff;
|
43 |
+
}
|
44 |
+
|
45 |
+
.footer-custom {
|
46 |
+
position: fixed;
|
47 |
+
bottom: 0;
|
48 |
+
width: 100%;
|
49 |
+
color: var(--text-color);
|
50 |
+
max-width: 698px;
|
51 |
+
font-size: 14px;
|
52 |
+
height: 50px;
|
53 |
+
padding: 10px 0;
|
54 |
+
z-index: 50;
|
55 |
+
}
|
56 |
+
|
57 |
+
.main {
|
58 |
+
padding: 20px;
|
59 |
+
}
|
60 |
+
|
61 |
+
footer {
|
62 |
+
display: none !important;
|
63 |
+
}
|
64 |
+
|
65 |
+
.footer-custom a {
|
66 |
+
color: var(--text-color);
|
67 |
+
}
|
68 |
+
|
69 |
+
#wikipedia-assistant {
|
70 |
+
font-size: 36px;
|
71 |
+
}
|
72 |
+
|
73 |
+
.generated-answer p {
|
74 |
+
font-size: 16px;
|
75 |
+
font-weight: bold;
|
76 |
+
}
|
77 |
+
|
78 |
+
.react-json-view {
|
79 |
+
margin: 40px 0 80px;
|
80 |
+
}
|
81 |
+
|
82 |
+
.tooltip {
|
83 |
+
text-align: center;
|
84 |
+
line-height: 20px;
|
85 |
+
display: table-caption;
|
86 |
+
font-size: 10px;
|
87 |
+
border-radius: 50%;
|
88 |
+
height: 20px;
|
89 |
+
width: 20px;
|
90 |
+
position: relative;
|
91 |
+
cursor: pointer;
|
92 |
+
color:#000;
|
93 |
+
}
|
94 |
+
|
95 |
+
.tooltip .tooltiptext {
|
96 |
+
visibility: hidden;
|
97 |
+
width: 280px;
|
98 |
+
text-align: center;
|
99 |
+
border-radius: 6px;
|
100 |
+
padding: 10px;
|
101 |
+
position: absolute;
|
102 |
+
z-index: 1;
|
103 |
+
top: 25px;
|
104 |
+
left: 50%;
|
105 |
+
margin-left: -140px;
|
106 |
+
font-size: 14px;
|
107 |
+
background-color: #fff;
|
108 |
+
border: 1px solid #ccc;
|
109 |
+
box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
|
110 |
+
color: #000;
|
111 |
+
}
|
112 |
+
|
113 |
+
.tooltip:hover .tooltiptext {
|
114 |
+
visibility: visible;
|
115 |
+
}
|
116 |
+
|
117 |
+
.sentence-wrapper {
|
118 |
+
border-left: 4px solid #ffc423;
|
119 |
+
padding-left: 20px;
|
120 |
+
margin-bottom: 40px;
|
121 |
+
}
|
122 |
+
|
123 |
+
#context {
|
124 |
+
padding: 2rem 0 1rem;
|
125 |
+
}
|
126 |
+
|
127 |
+
hr {
|
128 |
+
margin: 2em 0 1em;
|
129 |
+
}
|
130 |
+
|
131 |
+
|
132 |
+
.technical-details-info {
|
133 |
+
margin-bottom: 100px;
|
134 |
+
}
|
135 |
+
|
136 |
+
.loader-wrapper {
|
137 |
+
display: flex;
|
138 |
+
align-items: center;
|
139 |
+
background-color: rgba(250, 202, 43, 0.2);
|
140 |
+
padding: 15px 20px;
|
141 |
+
border-radius: 6px;
|
142 |
+
}
|
143 |
+
|
144 |
+
.loader-wrapper p {
|
145 |
+
margin-bottom: 0;
|
146 |
+
margin-left: 20px;
|
147 |
+
}
|
148 |
+
|
149 |
+
.loader {
|
150 |
+
width: 30px;
|
151 |
+
height: 30px;
|
152 |
+
border: dotted 5px #868686;
|
153 |
+
border-radius: 100%;
|
154 |
+
animation: spin 1s linear infinite;
|
155 |
+
}
|
156 |
+
|
157 |
+
.loader-note {
|
158 |
+
font-size: 14px;
|
159 |
+
color: #b3b3b3;
|
160 |
+
margin-left: 5px;
|
161 |
+
}
|
162 |
+
|
163 |
+
@keyframes spin {
|
164 |
+
0% {
|
165 |
+
transform: rotate(0deg) scale(0.8);
|
166 |
+
border-top-color: transparent;
|
167 |
+
border-right-color: transparent;
|
168 |
+
}
|
169 |
+
50% { transform: rotate(180deg) scale(1.2);
|
170 |
+
border-color: #949494;
|
171 |
+
border-top-color: transparent;
|
172 |
+
border-right-color: transparent;
|
173 |
+
}
|
174 |
+
100% { transform: rotate(360deg) scale(0.8);
|
175 |
+
border-color: #bbbbbb;
|
176 |
+
border-top-color: transparent;
|
177 |
+
border-right-color: transparent;
|
178 |
+
}
|
179 |
+
}
|
180 |
+
|
utils/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
# adding for package implementation
|
utils/adapmit_classifier.py
ADDED
@@ -0,0 +1,136 @@
1 |
+
from haystack.schema import Document
|
2 |
+
from typing import List, Tuple
|
3 |
+
from typing_extensions import Literal
|
4 |
+
import logging
|
5 |
+
import pandas as pd
|
6 |
+
from pandas import DataFrame, Series
|
7 |
+
from utils.config import getconfig
|
8 |
+
from utils.preprocessing import processingpipeline
|
9 |
+
import streamlit as st
|
10 |
+
from haystack.nodes import TransformersDocumentClassifier
|
11 |
+
from transformers import pipeline
|
12 |
+
|
13 |
+
@st.cache_resource
|
14 |
+
def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
|
15 |
+
"""
|
16 |
+
loads the document classifier using haystack, where the name/path of model
|
17 |
+
in the HF hub (as a string) is used to fetch the model object. Either configfile or
|
18 |
+
model should be passed.
|
19 |
+
1. https://docs.haystack.deepset.ai/reference/document-classifier-api
|
20 |
+
2. https://docs.haystack.deepset.ai/docs/document_classifier
|
21 |
+
Params
|
22 |
+
--------
|
23 |
+
config_file: config file path from which to read the model name
|
24 |
+
classifier_name: if modelname is passed, it takes a priority if not \
|
25 |
+
found then will look for configfile, else raise error.
|
26 |
+
Return: document classifier model
|
27 |
+
"""
|
28 |
+
if not classifier_name:
|
29 |
+
if not config_file:
|
30 |
+
logging.warning("Pass either model name or config file")
|
31 |
+
return
|
32 |
+
else:
|
33 |
+
config = getconfig(config_file)
|
34 |
+
classifier_name = config.get('adapmit','MODEL')
|
35 |
+
|
36 |
+
logging.info("Loading Adaptation Mitigation classifier")
|
37 |
+
# doc_classifier = TransformersDocumentClassifier(
|
38 |
+
# model_name_or_path=classifier_name,
|
39 |
+
# task="text-classification",
|
40 |
+
# top_k = None)
|
41 |
+
doc_classifier = pipeline("text-classification",
|
42 |
+
model=classifier_name,
|
43 |
+
return_all_scores=True,
|
44 |
+
function_to_apply= "sigmoid")
|
45 |
+
|
46 |
+
|
47 |
+
return doc_classifier
|
48 |
+
|
49 |
+
|
50 |
+
def runAdapMitPreprocessingPipeline(file_name:str, file_path:str,
|
51 |
+
split_by: Literal["sentence", "word"] = 'sentence',
|
52 |
+
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
53 |
+
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
54 |
+
"""
|
55 |
+
creates the pipeline and runs the preprocessing pipeline,
|
56 |
+
the params for pipeline are fetched from paramconfig
|
57 |
+
Params
|
58 |
+
------------
|
59 |
+
file_name: filename, in case of streamlit application use
|
60 |
+
st.session_state['filename']
|
61 |
+
file_path: filepath, in case of streamlit application use st.session_state['filepath']
|
62 |
+
split_by: document splitting strategy either as word or sentence
|
63 |
+
split_length: when synthetically creating the paragraphs from the document,
|
64 |
+
it defines the length of paragraph.
|
65 |
+
split_respect_sentence_boundary: Used when using 'word' strategy for
|
66 |
+
splitting of text.
|
67 |
+
split_overlap: Number of words or sentences that overlap when creating
|
68 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
69 |
+
when read in together with others. Therefore the overlap is used.
|
70 |
+
remove_punc: to remove all Punctuation including ',' and '.' or not
|
71 |
+
Return
|
72 |
+
--------------
|
73 |
+
List[Document]: When preprocessing pipeline is run, the output dictionary
|
74 |
+
has four objects. For the Haystack implementation of the classification we,
|
75 |
+
need to use the List of Haystack Document, which can be fetched by
|
76 |
+
key = 'documents' on output.
|
77 |
+
"""
|
78 |
+
|
79 |
+
adapmit_processing_pipeline = processingpipeline()
|
80 |
+
|
81 |
+
output_adapmit_pre = adapmit_processing_pipeline.run(file_paths = file_path,
|
82 |
+
params= {"FileConverter": {"file_path": file_path, \
|
83 |
+
"file_name": file_name},
|
84 |
+
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
85 |
+
"split_by": split_by, \
|
86 |
+
"split_length":split_length,\
|
87 |
+
"split_overlap": split_overlap, \
|
88 |
+
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
89 |
+
|
90 |
+
return output_adapmit_pre
|
91 |
+
|
92 |
+
|
93 |
+
@st.cache_data
|
94 |
+
def adapmit_classification(haystack_doc:List[Document],
|
95 |
+
threshold:float = 0.5,
|
96 |
+
classifier_model:pipeline= None
|
97 |
+
)->Tuple[DataFrame,Series]:
|
98 |
+
"""
|
99 |
+
Text-Classification on the list of texts provided. Classifier provides the
|
100 |
+
most appropriate label for each text. These labels indicate whether the text
|
101 |
+
relates to climate change adaptation and/or mitigation.
|
102 |
+
Params
|
103 |
+
---------
|
104 |
+
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
105 |
+
contains the list of paragraphs in different format,here the list of
|
106 |
+
Haystack Documents is used.
|
107 |
+
threshold: threshold value for the model to keep the results from classifier
|
108 |
+
classifiermodel: you can pass the classifier model directly,which takes priority
|
109 |
+
however if not then looks for model in streamlit session.
|
110 |
+
In case of streamlit avoid passing the model directly.
|
111 |
+
Returns
|
112 |
+
----------
|
113 |
+
df: Dataframe with the text and one score column per label
|
114 |
+
x: Series object with the unique SDG covered in the document uploaded and
|
115 |
+
the number of times it is covered/discussed/count_of_paragraphs.
|
116 |
+
"""
|
117 |
+
logging.info("Working on Adaptation-Mitigation Identification")
|
118 |
+
if not classifier_model:
|
119 |
+
classifier_model = st.session_state['adapmit_classifier']
|
120 |
+
|
121 |
+
predictions = classifier_model(haystack_doc)
|
122 |
+
# converting the predictions to desired format
|
123 |
+
list_ = []
|
124 |
+
for i in range(len(predictions)):
|
125 |
+
|
126 |
+
temp = predictions[i]
|
127 |
+
placeholder = {}
|
128 |
+
for j in range(len(temp)):
|
129 |
+
placeholder[temp[j]['label']] = temp[j]['score']
|
130 |
+
list_.append(placeholder)
|
131 |
+
labels_ = [{**{'text':haystack_doc[l]},**list_[l]} for l in range(len(predictions))]
|
132 |
+
# labels_= [{**l.meta['classification']['details'],**{'text':l.content}} for l in results]
|
133 |
+
df = DataFrame.from_dict(labels_)
|
134 |
+
df = df.round(2)
|
135 |
+
|
136 |
+
return df
|
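A hedged usage sketch (not part of the repo): load_adapmitClassifier builds a plain transformers text-classification pipeline, so it can also be exercised outside the Streamlit app; the example texts are made up, and the st.cache_resource decorator is expected to simply log a warning when no Streamlit runtime is present.

from utils.adapmit_classifier import load_adapmitClassifier

# reads the [adapmit] section of paramconfig.cfg for the model name
classifier = load_adapmitClassifier(config_file='paramconfig.cfg')

paras = ["Expand drought-resistant crop varieties in arid regions.",
         "Replace coal-fired generation with renewable energy."]
preds = classifier(paras)
# preds[i] is a list of {'label': ..., 'score': ...} dicts, one per label,
# because the pipeline is created with return_all_scores=True and sigmoid scores.
# adapmit_classification() above wraps this call and tabulates the scores into a DataFrame.
print(preds[0])
|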
utils/config.py
ADDED
@@ -0,0 +1,31 @@
1 |
+
import configparser
|
2 |
+
import logging
|
3 |
+
|
4 |
+
def getconfig(configfile_path:str):
|
5 |
+
"""
|
6 |
+
configfile_path: file path of .cfg file
|
7 |
+
"""
|
8 |
+
|
9 |
+
config = configparser.ConfigParser()
|
10 |
+
|
11 |
+
try:
|
12 |
+
config.read_file(open(configfile_path))
|
13 |
+
return config
|
14 |
+
except:
|
15 |
+
logging.warning("config file not found")
|
16 |
+
|
17 |
+
|
18 |
+
# Declare all the necessary variables
|
19 |
+
def get_classifier_params(model_name):
|
20 |
+
config = getconfig('paramconfig.cfg')
|
21 |
+
params = {}
|
22 |
+
params['model_name'] = config.get(model_name,'MODEL')
|
23 |
+
params['split_by'] = config.get(model_name,'SPLIT_BY')
|
24 |
+
params['split_length'] = int(config.get(model_name,'SPLIT_LENGTH'))
|
25 |
+
params['split_overlap'] = int(config.get(model_name,'SPLIT_OVERLAP'))
|
26 |
+
params['remove_punc'] = bool(int(config.get(model_name,'REMOVE_PUNC')))
|
27 |
+
params['split_respect_sentence_boundary'] = bool(int(config.get(model_name,'RESPECT_SENTENCE_BOUNDARY')))
|
28 |
+
params['threshold'] = float(config.get(model_name,'THRESHOLD'))
|
29 |
+
params['top_n'] = int(config.get(model_name,'TOP_KEY'))
|
30 |
+
|
31 |
+
return params
|
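A minimal sketch (not part of the repo) of the helper in use; the section name must match one of the sections in paramconfig.cfg ('target', 'netzero', 'sector' or 'adapmit'), and the commented values come from the [netzero] section shown earlier.

from utils.config import get_classifier_params

params = get_classifier_params('netzero')
print(params['model_name'])                                                  # 'ilaria-oneofftech/ikitracks_netzero'
print(params['split_by'], params['split_length'], params['split_overlap'])   # word 60 10
|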
utils/netzero_classifier.py
ADDED
@@ -0,0 +1,137 @@
1 |
+
from haystack.nodes import TransformersDocumentClassifier
|
2 |
+
from haystack.schema import Document
|
3 |
+
from typing import List, Tuple
|
4 |
+
from typing_extensions import Literal
|
5 |
+
import logging
|
6 |
+
import pandas as pd
|
7 |
+
from pandas import DataFrame, Series
|
8 |
+
from utils.config import getconfig
|
9 |
+
from utils.preprocessing import processingpipeline
|
10 |
+
import streamlit as st
|
11 |
+
|
12 |
+
# Labels dictionary ###
|
13 |
+
_lab_dict = {
|
14 |
+
'NEGATIVE':'NO NETZERO TARGET',
|
15 |
+
'NETZERO':'NETZERO TARGET',
|
16 |
+
}
|
17 |
+
|
18 |
+
@st.cache_resource
|
19 |
+
def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
|
20 |
+
"""
|
21 |
+
loads the document classifier using haystack, where the name/path of model
|
22 |
+
in the HF hub (as a string) is used to fetch the model object. Either configfile or
|
23 |
+
model should be passed.
|
24 |
+
1. https://docs.haystack.deepset.ai/reference/document-classifier-api
|
25 |
+
2. https://docs.haystack.deepset.ai/docs/document_classifier
|
26 |
+
Params
|
27 |
+
--------
|
28 |
+
config_file: config file path from which to read the model name
|
29 |
+
classifier_name: if modelname is passed, it takes a priority if not \
|
30 |
+
found then will look for configfile, else raise error.
|
31 |
+
Return: document classifier model
|
32 |
+
"""
|
33 |
+
if not classifier_name:
|
34 |
+
if not config_file:
|
35 |
+
logging.warning("Pass either model name or config file")
|
36 |
+
return
|
37 |
+
else:
|
38 |
+
config = getconfig(config_file)
|
39 |
+
classifier_name = config.get('netzero','MODEL')
|
40 |
+
|
41 |
+
logging.info("Loading netzero classifier")
|
42 |
+
doc_classifier = TransformersDocumentClassifier(
|
43 |
+
model_name_or_path=classifier_name,
|
44 |
+
task="text-classification")
|
45 |
+
|
46 |
+
return doc_classifier
|
47 |
+
|
48 |
+
|
49 |
+
def runNetZeroPreprocessingPipeline(file_name:str, file_path:str,
|
50 |
+
split_by: Literal["sentence", "word"] = 'sentence',
|
51 |
+
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
52 |
+
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
53 |
+
"""
|
54 |
+
creates the pipeline and runs the preprocessing pipeline,
|
55 |
+
the params for pipeline are fetched from paramconfig
|
56 |
+
Params
|
57 |
+
------------
|
58 |
+
file_name: filename, in case of streamlit application use
|
59 |
+
st.session_state['filename']
|
60 |
+
file_path: filepath, in case of streamlit application use st.session_state['filepath']
|
61 |
+
split_by: document splitting strategy either as word or sentence
|
62 |
+
split_length: when synthetically creating the paragraphs from the document,
|
63 |
+
it defines the length of paragraph.
|
64 |
+
split_respect_sentence_boundary: Used when using 'word' strategy for
|
65 |
+
splitting of text.
|
66 |
+
split_overlap: Number of words or sentences that overlap when creating
|
67 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
68 |
+
when read in together with others. Therefore the overlap is used.
|
69 |
+
remove_punc: to remove all Punctuation including ',' and '.' or not
|
70 |
+
Return
|
71 |
+
--------------
|
72 |
+
List[Document]: When preprocessing pipeline is run, the output dictionary
|
73 |
+
has four objects. For the Haystack implementation of the classification we,
|
74 |
+
need to use the List of Haystack Document, which can be fetched by
|
75 |
+
key = 'documents' on output.
|
76 |
+
"""
|
77 |
+
|
78 |
+
netzero_processing_pipeline = processingpipeline()
|
79 |
+
|
80 |
+
output_netzero_pre = netzero_processing_pipeline.run(file_paths = file_path,
|
81 |
+
params= {"FileConverter": {"file_path": file_path, \
|
82 |
+
"file_name": file_name},
|
83 |
+
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
84 |
+
"split_by": split_by, \
|
85 |
+
"split_length":split_length,\
|
86 |
+
"split_overlap": split_overlap, \
|
87 |
+
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
88 |
+
|
89 |
+
return output_netzero_pre
|
90 |
+
|
91 |
+
|
92 |
+
@st.cache_data
|
93 |
+
def netzero_classification(haystack_doc:List[Document],
|
94 |
+
threshold:float = 0.8,
|
95 |
+
classifier_model:TransformersDocumentClassifier= None
|
96 |
+
)->Tuple[DataFrame,Series]:
|
97 |
+
"""
|
98 |
+
Text-Classification on the list of texts provided. Classifier provides the
|
99 |
+
most appropriate label for each text. These labels indicate whether the text
|
100 |
+
contains a net-zero target or not.
|
101 |
+
Params
|
102 |
+
---------
|
103 |
+
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
104 |
+
contains the list of paragraphs in different format,here the list of
|
105 |
+
Haystack Documents is used.
|
106 |
+
threshold: threshold value for the model to keep the results from classifier
|
107 |
+
classifiermodel: you can pass the classifier model directly,which takes priority
|
108 |
+
however if not then looks for model in streamlit session.
|
109 |
+
In case of streamlit avoid passing the model directly.
|
110 |
+
Returns
|
111 |
+
----------
|
112 |
+
df: Dataframe with columns ['Target Label', 'Relevancy', 'page', 'text', 'Label_def']
|
113 |
+
x: Series object with the unique SDG covered in the document uploaded and
|
114 |
+
the number of times it is covered/discussed/count_of_paragraphs.
|
115 |
+
"""
|
116 |
+
logging.info("Working on Netzero Extraction")
|
117 |
+
if not classifier_model:
|
118 |
+
classifier_model = st.session_state['netzero_classifier']
|
119 |
+
|
120 |
+
results = classifier_model.predict(haystack_doc)
|
121 |
+
labels_= [(l.meta['classification']['label'],
|
122 |
+
l.meta['classification']['score'],l.meta['page'],l.content,) for l in results]
|
123 |
+
|
124 |
+
df = DataFrame(labels_, columns=["Target Label","Relevancy", "page","text"])
|
125 |
+
|
126 |
+
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
|
127 |
+
df.index += 1
|
128 |
+
# df =df[df['Relevancy']>threshold]
|
129 |
+
df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
|
130 |
+
|
131 |
+
# creating the dataframe for value counts of Labels, along with 'title' of Labels
|
132 |
+
# count_df = df['Target Label'].value_counts()
|
133 |
+
# count_df = count_df.rename('count')
|
134 |
+
# count_df = count_df.rename_axis('Target Label').reset_index()
|
135 |
+
# count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
|
136 |
+
|
137 |
+
return df
|
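A hedged usage sketch (not part of the repo): the netzero classifier is a Haystack TransformersDocumentClassifier, so it can be called directly on Haystack Documents; the example text is made up, and the 'page' metadata that netzero_classification() later reads is normally added by the preprocessing pipeline, so it is set by hand here.

from haystack.schema import Document
from utils.netzero_classifier import load_netzeroClassifier

classifier = load_netzeroClassifier(config_file='paramconfig.cfg')
docs = [Document(content="We aim to reach net-zero emissions by 2050.", meta={"page": 1})]

results = classifier.predict(docs)
# each returned Document carries meta['classification'] with 'label' and 'score';
# netzero_classification() above tabulates these, plus meta['page'], into a DataFrame
print(results[0].meta['classification']['label'])
|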
utils/preprocessing.py
ADDED
@@ -0,0 +1,275 @@
1 |
+
from haystack.nodes.base import BaseComponent
|
2 |
+
from haystack.schema import Document
|
3 |
+
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
|
4 |
+
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
|
5 |
+
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
|
6 |
+
from typing_extensions import Literal
|
7 |
+
import pandas as pd
|
8 |
+
import logging
|
9 |
+
import re
|
10 |
+
import string
|
11 |
+
from haystack.pipelines import Pipeline
|
12 |
+
|
13 |
+
def useOCR(file_path: str)-> Text:
|
14 |
+
"""
|
15 |
+
Converts image PDFs into text, using farm-haystack[OCR]
|
16 |
+
|
17 |
+
Params
|
18 |
+
----------
|
19 |
+
file_path: file path of the uploaded file, returned by add_upload function in
|
20 |
+
uploadAndExample.py
|
21 |
+
|
22 |
+
Returns the text file as string.
|
23 |
+
"""
|
24 |
+
|
25 |
+
|
26 |
+
converter = PDFToTextOCRConverter(remove_numeric_tables=True,
|
27 |
+
valid_languages=["eng"])
|
28 |
+
docs = converter.convert(file_path=file_path, meta=None)
|
29 |
+
return docs[0].content
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
class FileConverter(BaseComponent):
|
35 |
+
"""
|
36 |
+
Wrapper class to convert uploaded document into text by calling appropriate
|
37 |
+
Converter class, will use internally haystack PDFToTextOCR in case of image
|
38 |
+
pdf. Cannot use the FileClassifier from haystack as it doesn't have any
|
39 |
+
label/output class for image.
|
40 |
+
|
41 |
+
1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
|
42 |
+
2. https://docs.haystack.deepset.ai/docs/file_converters
|
43 |
+
3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
|
44 |
+
4. https://docs.haystack.deepset.ai/reference/file-converters-api
|
45 |
+
|
46 |
+
|
47 |
+
"""
|
48 |
+
|
49 |
+
outgoing_edges = 1
|
50 |
+
|
51 |
+
def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
|
52 |
+
id_hash_keys: Optional[List[str]] = None,
|
53 |
+
) -> Tuple[dict,str]:
|
54 |
+
""" this is required method to invoke the component in
|
55 |
+
the pipeline implementation.
|
56 |
+
|
57 |
+
Params
|
58 |
+
----------
|
59 |
+
file_name: name of file
|
60 |
+
file_path: file path of the uploaded file, returned by add_upload function in
|
61 |
+
uploadAndExample.py
|
62 |
+
|
63 |
+
See the links provided in Class docstring/description to see other params
|
64 |
+
|
65 |
+
Return
|
66 |
+
---------
|
67 |
+
output: dictionary, with key as identifier and value could be anything
|
68 |
+
we need to return. In this case it is the List of Haystack Documents
|
69 |
+
|
70 |
+
output_1: As there is only one outgoing edge, we pass 'output_1' string
|
71 |
+
"""
|
72 |
+
try:
|
73 |
+
if file_name.endswith('.pdf'):
|
74 |
+
converter = PDFToTextConverter(remove_numeric_tables=True)
|
75 |
+
if file_name.endswith('.txt'):
|
76 |
+
converter = TextConverter(remove_numeric_tables=True)
|
77 |
+
if file_name.endswith('.docx'):
|
78 |
+
converter = DocxToTextConverter()
|
79 |
+
except Exception as e:
|
80 |
+
logging.error(e)
|
81 |
+
return
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
documents = []
|
86 |
+
|
87 |
+
|
88 |
+
# encoding is empty, probably should be utf-8
|
89 |
+
document = converter.convert(
|
90 |
+
file_path=file_path, meta=None,
|
91 |
+
encoding=encoding, id_hash_keys=id_hash_keys
|
92 |
+
)[0]
|
93 |
+
|
94 |
+
text = document.content
|
95 |
+
|
96 |
+
# in case of scanned/images only PDF the content might contain only
|
97 |
+
# the page separator (\f or \x0c). We check whether that is the case and
|
98 |
+
# use the OCR to get the text.
|
99 |
+
filtered = re.sub(r'\x0c', '', text)
|
100 |
+
|
101 |
+
if filtered == "":
|
102 |
+
logging.info("Using OCR")
|
103 |
+
text = useOCR(file_path)
|
104 |
+
|
105 |
+
documents.append(Document(content=text,
|
106 |
+
meta={"name": file_name},
|
107 |
+
id_hash_keys=id_hash_keys))
|
108 |
+
|
109 |
+
logging.info('file conversion successful')
|
110 |
+
output = {'documents': documents}
|
111 |
+
return output, 'output_1'
|
112 |
+
|
113 |
+
def run_batch():
|
114 |
+
"""
|
115 |
+
we dont have requirement to process the multiple files in one go
|
116 |
+
therefore nothing here, however to use the custom node we need to have
|
117 |
+
this method for the class.
|
118 |
+
"""
|
119 |
+
|
120 |
+
return
|
121 |
+
|
122 |
+
|
123 |
+
def basic(s:str, remove_punc:bool = False):
|
124 |
+
|
125 |
+
"""
|
126 |
+
Performs basic cleaning of text.
|
127 |
+
|
128 |
+
Params
|
129 |
+
----------
|
130 |
+
s: string to be processed
|
131 |
+
removePunc: to remove all Punctuation including ',' and '.' or not
|
132 |
+
|
133 |
+
Returns: processed string: see comments in the source code for more info
|
134 |
+
"""
|
135 |
+
|
136 |
+
# Remove URLs
|
137 |
+
s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
|
138 |
+
s = re.sub(r"http\S+", " ", s)
|
139 |
+
|
140 |
+
# Remove new line characters
|
141 |
+
s = re.sub('\n', ' ', s)
|
142 |
+
|
143 |
+
# Remove punctuations
|
144 |
+
if remove_punc == True:
|
145 |
+
translator = str.maketrans(' ', ' ', string.punctuation)
|
146 |
+
s = s.translate(translator)
|
147 |
+
# Remove distracting single quotes and dotted pattern
|
148 |
+
s = re.sub("\'", " ", s)
|
149 |
+
s = s.replace("..","")
|
150 |
+
|
151 |
+
return s.strip()
|
152 |
+
|
153 |
+
def paraLengthCheck(paraList, max_len = 512):
|
154 |
+
new_para_list = []
|
155 |
+
for passage in paraList:
|
156 |
+
if len(passage.split()) > max_len:
|
157 |
+
iterations = int(len(passage.split())/max_len)
|
158 |
+
# # st.write("Splitting")
|
159 |
+
for i in range(iterations):
|
160 |
+
temp = " ".join(passage.split()[max_len*i:max_len*(i+1)])
|
161 |
+
new_para_list.append(temp)
|
162 |
+
temp = " ".join(passage.split()[max_len*(i+1):])
|
163 |
+
new_para_list.append(temp)
|
164 |
+
else:
|
165 |
+
new_para_list.append(passage)
|
166 |
+
|
167 |
+
return new_para_list
|
168 |
+
|
169 |
+
class UdfPreProcessor(BaseComponent):
|
170 |
+
"""
|
171 |
+
class to preprocess the document returned by FileConverter. It will check
|
172 |
+
for splitting strategy and splits the document by word or sentences and then
|
173 |
+
synthetically create the paragraphs.
|
174 |
+
|
175 |
+
1. https://docs.haystack.deepset.ai/docs/preprocessor
|
176 |
+
2. https://docs.haystack.deepset.ai/reference/preprocessor-api
|
177 |
+
3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
|
178 |
+
|
179 |
+
"""
|
180 |
+
outgoing_edges = 1
|
181 |
+
|
182 |
+
def run(self, documents:List[Document], remove_punc:bool=False,
|
183 |
+
split_by: Literal["sentence", "word"] = 'sentence',
|
184 |
+
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
185 |
+
split_overlap:int = 0):
|
186 |
+
|
187 |
+
""" this is required method to invoke the component in
|
188 |
+
the pipeline implementation.
|
189 |
+
|
190 |
+
Params
|
191 |
+
----------
|
192 |
+
documents: documents from the output dictionary returned by Fileconverter
|
193 |
+
remove_punc: to remove all Punctuation including ',' and '.' or not
|
194 |
+
split_by: document splitting strategy either as word or sentence
|
195 |
+
split_length: when synthetically creating the paragrpahs from document,
|
196 |
+
it defines the length of paragraph.
|
197 |
+
split_respect_sentence_boundary: Used when using 'word' strategy for
|
198 |
+
splititng of text.
|
199 |
+
split_overlap: Number of words or sentences that overlap when creating
|
200 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
201 |
+
when read in together with others. Therefore the overlap is used.
|
202 |
+
|
203 |
+
Return
|
204 |
+
---------
|
205 |
+
output: dictionary, with key as identifier and value could be anything
|
206 |
+
we need to return. In this case the output will contain 4 objects
|
207 |
+
the paragraphs text list as List, Haystack document, Dataframe and
|
208 |
+
one raw text file.
|
209 |
+
|
210 |
+
output_1: As there is only one outgoing edge, we pass 'output_1' string
|
211 |
+
|
212 |
+
"""
|
213 |
+
|
214 |
+
if split_by == 'sentence':
|
215 |
+
split_respect_sentence_boundary = False
|
216 |
+
|
217 |
+
else:
|
218 |
+
split_respect_sentence_boundary = split_respect_sentence_boundary
|
219 |
+
|
220 |
+
preprocessor = PreProcessor(
|
221 |
+
clean_empty_lines=True,
|
222 |
+
clean_whitespace=True,
|
223 |
+
clean_header_footer=True,
|
224 |
+
split_by=split_by,
|
225 |
+
split_length=split_length,
|
226 |
+
split_respect_sentence_boundary= split_respect_sentence_boundary,
|
227 |
+
split_overlap=split_overlap,
|
228 |
+
|
229 |
+
# will add page number only in case of PDF not for text/docx file.
|
230 |
+
add_page_number=True
|
231 |
+
)
|
232 |
+
|
233 |
+
for i in documents:
|
234 |
+
# # basic cleaning before passing it to preprocessor.
|
235 |
+
# i = basic(i)
|
236 |
+
docs_processed = preprocessor.process([i])
|
237 |
+
for item in docs_processed:
|
238 |
+
item.content = basic(item.content, remove_punc= remove_punc)
|
239 |
+
|
240 |
+
df = pd.DataFrame(docs_processed)
|
241 |
+
all_text = " ".join(df.content.to_list())
|
242 |
+
para_list = df.content.to_list()
|
243 |
+
logging.info('document split into {} paragraphs'.format(len(para_list)))
|
244 |
+
output = {'documents': docs_processed,
|
245 |
+
'dataframe': df,
|
246 |
+
'text': all_text,
|
247 |
+
'paraList': para_list
|
248 |
+
}
|
249 |
+
return output, "output_1"
|
250 |
+
def run_batch():
|
251 |
+
"""
|
252 |
+
we dont have requirement to process the multiple files in one go
|
253 |
+
therefore nothing here, however to use the custom node we need to have
|
254 |
+
this method for the class.
|
255 |
+
"""
|
256 |
+
return
|
257 |
+
|
258 |
+
def processingpipeline():
|
259 |
+
"""
|
260 |
+
Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
|
261 |
+
from utils.preprocessing
|
262 |
+
|
263 |
+
"""
|
264 |
+
|
265 |
+
preprocessing_pipeline = Pipeline()
|
266 |
+
file_converter = FileConverter()
|
267 |
+
custom_preprocessor = UdfPreProcessor()
|
268 |
+
|
269 |
+
preprocessing_pipeline.add_node(component=file_converter,
|
270 |
+
name="FileConverter", inputs=["File"])
|
271 |
+
preprocessing_pipeline.add_node(component = custom_preprocessor,
|
272 |
+
name ='UdfPreProcessor', inputs=["FileConverter"])
|
273 |
+
|
274 |
+
return preprocessing_pipeline
|
275 |
+
|
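A minimal usage sketch of this preprocessing pipeline outside the Streamlit app. Assumptions: 'sample.pdf' is a placeholder path, and the split parameters shown are illustrative defaults rather than values read from paramconfig.cfg.

    from utils.preprocessing import processingpipeline

    # Build the two-node pipeline (FileConverter -> UdfPreProcessor) and run it
    # on a single local file; the placeholder path stands in for an uploaded file.
    pipe = processingpipeline()
    result = pipe.run(file_paths="sample.pdf",
                      params={"FileConverter": {"file_path": "sample.pdf",
                                                "file_name": "sample.pdf"},
                              "UdfPreProcessor": {"remove_punc": False,
                                                  "split_by": "sentence",
                                                  "split_length": 2,
                                                  "split_overlap": 0}})

    # The UdfPreProcessor output dictionary carries four objects.
    paragraphs = result["paraList"]     # list of paragraph strings
    documents = result["documents"]     # Haystack Documents for the classifiers
    print("split into {} paragraphs".format(len(paragraphs)))
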
utils/sector_classifier.py
ADDED
@@ -0,0 +1,146 @@
from haystack.schema import Document
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from haystack.nodes import TransformersDocumentClassifier
from transformers import pipeline


# ## Labels dictionary ###
# _lab_dict = {
#     'NEGATIVE': 'NO NETZERO TARGET',
#     'NETZERO': 'NETZERO TARGET',
# }

@st.cache_resource
def load_sectorClassifier(config_file: str = None, classifier_name: str = None):
    """
    Loads the document classifier using haystack; the name/path of the model on
    the HF hub (as a string) is used to fetch the model object. Either the config
    file or the model name should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if a model name is passed it takes priority; if not
                     provided, the config file is used, else a warning is logged.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('sector', 'MODEL')

    logging.info("Loading sector classifier")
    # We use the transformers pipeline here because the model is multilabel and
    # the DocumentClassifier from Haystack doesn't support multilabel.
    # In the pipeline we use 'sigmoid' to explicitly tell it to behave as
    # multilabel; otherwise it automatically uses softmax, which is not desired.
    # doc_classifier = TransformersDocumentClassifier(
    #                     model_name_or_path=classifier_name,
    #                     task="text-classification",
    #                     top_k=None)

    doc_classifier = pipeline("text-classification",
                              model=classifier_name,
                              return_all_scores=True,
                              function_to_apply="sigmoid")

    return doc_classifier


def runSectorPreprocessingPipeline(file_name: str, file_path: str,
        split_by: Literal["sentence", "word"] = 'sentence',
        split_length: int = 2, split_respect_sentence_boundary: bool = False,
        split_overlap: int = 0, remove_punc: bool = False) -> List[Document]:
    """
    Creates and runs the preprocessing pipeline; the params for the pipeline
    are fetched from paramconfig.
    Params
    ------------
    file_name: filename, in case of the streamlit application use
               st.session_state['filename']
    file_path: filepath, in case of the streamlit application use
               st.session_state['filepath']
    split_by: document splitting strategy, either by word or sentence
    split_length: when synthetically creating the paragraphs from the document,
                  it defines the length of a paragraph.
    split_respect_sentence_boundary: used with the 'word' strategy for
                                     splitting the text.
    split_overlap: number of words or sentences that overlap when creating
                   the paragraphs. This is done because one sentence or 'some
                   words' only make sense when read together with others;
                   therefore the overlap is used.
    remove_punc: whether to remove all punctuation, including ',' and '.', or not
    Return
    --------------
    List[Document]: when the preprocessing pipeline is run, the output dictionary
                    has four objects. For the Haystack implementation of sector
                    classification we need the list of Haystack Documents, which
                    can be fetched with key = 'documents' on the output.
    """

    sector_processing_pipeline = processingpipeline()

    output_sector_pre = sector_processing_pipeline.run(file_paths=file_path,
                            params={"FileConverter": {"file_path": file_path,
                                                      "file_name": file_name},
                                    "UdfPreProcessor": {"remove_punc": remove_punc,
                                                        "split_by": split_by,
                                                        "split_length": split_length,
                                                        "split_overlap": split_overlap,
                                                        "split_respect_sentence_boundary": split_respect_sentence_boundary}})

    return output_sector_pre


@st.cache_data
def sector_classification(haystack_doc: List[Document],
                          threshold: float = 0.8,
                          classifier_model: TransformersDocumentClassifier = None
                          ) -> Tuple[DataFrame, Series]:
    """
    Text classification on the list of texts provided. The classifier provides
    the most appropriate sector label(s) for each text.
    Params
    ---------
    haystack_doc: list of paragraphs. The output of the preprocessing pipeline
                  contains the list of paragraphs in different formats; here the
                  paragraph list is used.
    threshold: threshold value for the model to keep the results from the classifier
    classifier_model: you can pass the classifier model directly, which takes
                      priority; if not, the model is looked up in the streamlit
                      session. In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe with a 'text' column and one (rounded) sigmoid score column
        per sector label returned by the classifier.
    """
    logging.info("Working on Sector Identification")
    if not classifier_model:
        classifier_model = st.session_state['sector_classifier']

    predictions = classifier_model(haystack_doc)
    list_ = []
    for i in range(len(predictions)):
        temp = predictions[i]
        placeholder = {}
        for j in range(len(temp)):
            placeholder[temp[j]['label']] = temp[j]['score']
        list_.append(placeholder)
    labels_ = [{**{'text': haystack_doc[l]}, **list_[l]} for l in range(len(predictions))]
    # labels_ = [{**l.meta['classification']['details'], **{'text': l.content}} for l in results]
    df = DataFrame.from_dict(labels_)
    df = df.round(2)

    return df
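A rough sketch of how an app page might tie these helpers together inside the Streamlit app. Assumptions: paramconfig.cfg contains a [sector] section with a MODEL entry (as load_sectorClassifier expects), add_upload has already stored 'filename' and 'filepath' in the session state, and the exact wiring in appStore/sector.py may differ.

    import streamlit as st
    from utils.preprocessing import paraLengthCheck
    from utils.sector_classifier import (load_sectorClassifier,
                                         runSectorPreprocessingPipeline,
                                         sector_classification)

    # Load the multilabel sector model once and keep it in the session,
    # as the docstring above recommends.
    if 'sector_classifier' not in st.session_state:
        st.session_state['sector_classifier'] = load_sectorClassifier(
            config_file="paramconfig.cfg")

    # Preprocess the file stored by add_upload (see uploadAndExample.py).
    pre = runSectorPreprocessingPipeline(
        file_name=st.session_state['filename'],
        file_path=st.session_state['filepath'])

    # Cap paragraph length so the transformer does not silently truncate.
    paragraphs = paraLengthCheck(pre['paraList'])

    # One 'text' column plus one sigmoid score column per sector label.
    df = sector_classification(haystack_doc=paragraphs)
    st.dataframe(df)
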
utils/target_classifier.py
ADDED
@@ -0,0 +1,138 @@
from haystack.nodes import TransformersDocumentClassifier
from haystack.schema import Document
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st

## Labels dictionary ###
_lab_dict = {
    'LABEL_0': 'NO TARGET INFO',
    'LABEL_1': 'ECONOMY-WIDE TARGET',
}

@st.cache_resource
def load_targetClassifier(config_file: str = None, classifier_name: str = None):
    """
    Loads the document classifier using haystack; the name/path of the model on
    the HF hub (as a string) is used to fetch the model object. Either the config
    file or the model name should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier
    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if a model name is passed it takes priority; if not
                     provided, the config file is used, else a warning is logged.
    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('target', 'MODEL')

    logging.info("Loading classifier")
    doc_classifier = TransformersDocumentClassifier(
                        model_name_or_path=classifier_name,
                        task="text-classification")

    return doc_classifier


def runTargetPreprocessingPipeline(file_name: str, file_path: str,
        split_by: Literal["sentence", "word"] = 'sentence',
        split_length: int = 2, split_respect_sentence_boundary: bool = False,
        split_overlap: int = 0, remove_punc: bool = False) -> List[Document]:
    """
    Creates and runs the preprocessing pipeline; the params for the pipeline
    are fetched from paramconfig.
    Params
    ------------
    file_name: filename, in case of the streamlit application use
               st.session_state['filename']
    file_path: filepath, in case of the streamlit application use
               st.session_state['filepath']
    split_by: document splitting strategy, either by word or sentence
    split_length: when synthetically creating the paragraphs from the document,
                  it defines the length of a paragraph.
    split_respect_sentence_boundary: used with the 'word' strategy for
                                     splitting the text.
    split_overlap: number of words or sentences that overlap when creating
                   the paragraphs. This is done because one sentence or 'some
                   words' only make sense when read together with others;
                   therefore the overlap is used.
    remove_punc: whether to remove all punctuation, including ',' and '.', or not
    Return
    --------------
    List[Document]: when the preprocessing pipeline is run, the output dictionary
                    has four objects. For the Haystack implementation of target
                    classification we need the list of Haystack Documents, which
                    can be fetched with key = 'documents' on the output.
    """

    target_processing_pipeline = processingpipeline()

    output_target_pre = target_processing_pipeline.run(file_paths=file_path,
                            params={"FileConverter": {"file_path": file_path,
                                                      "file_name": file_name},
                                    "UdfPreProcessor": {"remove_punc": remove_punc,
                                                        "split_by": split_by,
                                                        "split_length": split_length,
                                                        "split_overlap": split_overlap,
                                                        "split_respect_sentence_boundary": split_respect_sentence_boundary}})

    return output_target_pre


@st.cache_data
def target_classification(haystack_doc: List[Document],
                          threshold: float = 0.8,
                          classifier_model: TransformersDocumentClassifier = None
                          ) -> Tuple[DataFrame, Series]:
    """
    Text classification on the list of texts provided. The classifier provides
    the most appropriate label for each text, indicating whether it carries
    economy-wide target information or not (see _lab_dict above).
    Params
    ---------
    haystack_doc: list of haystack Documents. The output of the preprocessing
                  pipeline contains the list of paragraphs in different formats;
                  here the list of Haystack Documents is used.
    threshold: threshold value for the model to keep the results from the classifier
    classifier_model: you can pass the classifier model directly, which takes
                      priority; if not, the model is looked up in the streamlit
                      session. In case of streamlit avoid passing the model directly.
    Returns
    ----------
    df: Dataframe with columns ['Target Label', 'Relevancy', 'page', 'text',
        'Label_def'], sorted by relevancy.
    """
    logging.info("Working on Target Extraction")
    if not classifier_model:
        classifier_model = st.session_state['target_classifier']

    results = classifier_model.predict(haystack_doc)
    labels_ = [(l.meta['classification']['label'],
                l.meta['classification']['score'], l.meta['page'], l.content,) for l in results]

    df = DataFrame(labels_, columns=["Target Label", "Relevancy", "page", "text"])

    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    df.index += 1
    # df = df[df['Relevancy'] > threshold]
    df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])

    # creating the dataframe for value counts of labels, along with the 'title' of labels
    # count_df = df['Target Label'].value_counts()
    # count_df = count_df.rename('count')
    # count_df = count_df.rename_axis('Target Label').reset_index()
    # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])

    return df
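A rough sketch of the corresponding flow for target extraction. Assumptions: paramconfig.cfg contains a [target] section with a MODEL entry (as load_targetClassifier expects), add_upload has already populated the session state, and the exact wiring in appStore/target.py may differ.

    import streamlit as st
    from utils.target_classifier import (load_targetClassifier,
                                         runTargetPreprocessingPipeline,
                                         target_classification)

    # Load the target model once and keep it in the session.
    if 'target_classifier' not in st.session_state:
        st.session_state['target_classifier'] = load_targetClassifier(
            config_file="paramconfig.cfg")

    pre = runTargetPreprocessingPipeline(
        file_name=st.session_state['filename'],
        file_path=st.session_state['filepath'])

    # TransformersDocumentClassifier.predict needs Haystack Documents; the
    # 'page' metadata added by the preprocessor ends up in the result table.
    df = target_classification(haystack_doc=pre['documents'])
    st.dataframe(df[['Target Label', 'Label_def', 'Relevancy', 'page', 'text']])
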
utils/uploadAndExample.py
ADDED
@@ -0,0 +1,33 @@
import streamlit as st
import tempfile
import json


def add_upload(choice):
    """
    Provides the user with the choice to either 'Upload Document' or 'Try Example'.
    Based on the user's choice it runs the streamlit processes and saves the path
    and name of the 'file' to the streamlit session_state, which can be fetched later.
    """

    if choice == 'Upload Document':
        uploaded_file = st.sidebar.file_uploader('Upload the File',
                                                 type=['pdf', 'docx', 'txt'])
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
                bytes_data = uploaded_file.getvalue()
                temp.write(bytes_data)
                st.session_state['filename'] = uploaded_file.name
                st.session_state['filepath'] = temp.name

    else:
        # listing the options
        with open('docStore/sample/files.json', 'r') as json_file:
            files = json.load(json_file)

        option = st.sidebar.selectbox('Select the example document',
                                      list(files.keys()))
        file_name = file_path = files[option]
        st.session_state['filename'] = file_name
        st.session_state['filepath'] = file_path
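A minimal sketch of how an app page might call add_upload from its sidebar; the radio labels mirror the two choices handled above, and everything beyond that is illustrative rather than taken from the appStore modules.

    import streamlit as st
    from utils.uploadAndExample import add_upload

    # Let the user pick between uploading a file and using a bundled example;
    # add_upload stores 'filename' and 'filepath' in st.session_state.
    choice = st.sidebar.radio("Choose the input", ('Upload Document', 'Try Example'))
    add_upload(choice)

    if 'filepath' in st.session_state:
        st.write("Ready to analyse:", st.session_state['filename'])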