ppsingh committed on
Commit
463d9a8
·
1 Parent(s): 5cc5ff1

Update utils/sector_classifier.py

Browse files
Files changed (1) hide show
  1. utils/sector_classifier.py +107 -108
utils/sector_classifier.py CHANGED
@@ -1,108 +1,107 @@
1
- from haystack.schema import Document
2
- from typing import List, Tuple
3
- from typing_extensions import Literal
4
- import logging
5
- import pandas as pd
6
- from pandas import DataFrame, Series
7
- from utils.config import getconfig
8
- from utils.preprocessing import processingpipeline
9
- import streamlit as st
10
- from haystack.nodes import TransformersDocumentClassifier
11
- from transformers import pipeline
12
-
13
-
14
@st.cache_resource
def load_sectorClassifier(config_file: str = None, classifier_name: str = None):
    """
    Create the multi-label sector classifier as a transformers
    "text-classification" pipeline. The model is referenced by its
    name/path (string) and resolved by the pipeline factory.

    Params
    --------
    config_file: path of the config file. Only consulted when
        classifier_name is not given; the model name is then read via
        getconfig(config_file).get('sector', 'MODEL').
    classifier_name: model name/path. Takes priority over config_file.

    Return: the classification pipeline, or None (after logging a warning)
        when neither classifier_name nor config_file is supplied.
    """
    # Nothing to resolve the model from: warn and hand back None.
    if not classifier_name and not config_file:
        logging.warning("Pass either model name or config file")
        return None

    # Fall back to the config file only when no explicit name was passed.
    if not classifier_name:
        classifier_name = getconfig(config_file).get('sector', 'MODEL')

    logging.info("Loading sector classifier")

    # The model is multi-label and the Haystack DocumentClassifier does not
    # support that, so the plain transformers pipeline is used instead.
    # 'sigmoid' is requested explicitly so every label is scored on its own;
    # without it the pipeline would pick softmax, which is not wanted here.
    pipeline_options = {
        "model": classifier_name,
        "return_all_scores": True,
        "function_to_apply": "sigmoid",
    }
    return pipeline("text-classification", **pipeline_options)
53
-
54
-
55
@st.cache_data
def sector_classification(haystack_doc: pd.DataFrame,
                          threshold: float = 0.5,
                          classifier_model: pipeline = None
                          ) -> DataFrame:
    """
    Multi-label sector classification of the paragraphs marked as TARGET.

    Rows whose 'Target Label' is 'TARGET' are sent to the classifier; every
    label whose score (rounded to 2 decimals) is >= threshold is kept for
    that row. Rows whose 'Target Label' is 'NEGATIVE' are passed through
    with 'Sector Label' set to 'NA'. Rows with any other 'Target Label'
    value are not part of the returned frame.

    Params
    ---------
    haystack_doc: DataFrame with at least the columns 'text' and
        'Target Label'. NOTE: a 'Sector Label' column filled with 'NA' is
        added to this frame in place.
    threshold: minimum (rounded) score for a label to be kept.
    classifier_model: classifier callable; it is called with a list of texts
        and is expected to return, per text, a list of
        {'label': ..., 'score': ...} dicts. If not passed, the model stored
        in st.session_state['sector_classifier'] is used.

    Returns
    ----------
    df: DataFrame holding the NEGATIVE rows followed by the TARGET rows,
        where 'Sector Label' of each TARGET row is the list of labels that
        passed the threshold (in classifier column order; possibly empty).
    """
    logging.info("Working on Sector Identification")
    haystack_doc['Sector Label'] = 'NA'

    # Work on an explicit copy: labels are written to this frame below and
    # writing to a slice of haystack_doc would be a chained assignment.
    target_rows = haystack_doc[haystack_doc['Target Label'] == 'TARGET'].copy()
    negative_rows = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']

    # Nothing to classify: skip the model call and the label assignment.
    if target_rows.empty:
        return pd.concat([negative_rows, target_rows])

    if classifier_model is None:
        classifier_model = st.session_state['sector_classifier']

    predictions = classifier_model(list(target_rows.text))

    # One row per paragraph, one column per label, cell = score.
    scores = DataFrame(
        [{entry['label']: entry['score'] for entry in paragraph}
         for paragraph in predictions]
    )
    # Scores are rounded before the comparison (kept from the original
    # logic); a label missing for a paragraph yields NaN, which compares
    # False and is therefore dropped.
    passed = scores.round(2).astype(float) >= threshold
    categories = list(passed.columns)

    # Keep the label names in column order so the result is deterministic.
    target_rows['Sector Label'] = [
        [label for label, hit in zip(categories, flags) if hit]
        for flags in passed.to_numpy()
    ]
    return pd.concat([negative_rows, target_rows])
 
1
+ from haystack.schema import Document
2
+ from typing import List, Tuple
3
+ from typing_extensions import Literal
4
+ import logging
5
+ import pandas as pd
6
+ from pandas import DataFrame, Series
7
+ from utils.config import getconfig
8
+ from utils.preprocessing import processingpipeline
9
+ import streamlit as st
10
+ from transformers import pipeline
11
+
12
+
13
@st.cache_resource
def load_sectorClassifier(config_file: str = None, classifier_name: str = None):
    """
    Create the multi-label sector classifier as a transformers
    "text-classification" pipeline. The model is referenced by its
    name/path (string) and resolved by the pipeline factory.

    Params
    --------
    config_file: path of the config file. Only consulted when
        classifier_name is not given; the model name is then read via
        getconfig(config_file).get('sector', 'MODEL').
    classifier_name: model name/path. Takes priority over config_file.

    Return: the classification pipeline, or None (after logging a warning)
        when neither classifier_name nor config_file is supplied.
    """
    # Nothing to resolve the model from: warn and hand back None.
    if not classifier_name and not config_file:
        logging.warning("Pass either model name or config file")
        return None

    # Fall back to the config file only when no explicit name was passed.
    if not classifier_name:
        classifier_name = getconfig(config_file).get('sector', 'MODEL')

    logging.info("Loading sector classifier")

    # The model is multi-label and the Haystack DocumentClassifier does not
    # support that, so the plain transformers pipeline is used instead.
    # 'sigmoid' is requested explicitly so every label is scored on its own;
    # without it the pipeline would pick softmax, which is not wanted here.
    pipeline_options = {
        "model": classifier_name,
        "return_all_scores": True,
        "function_to_apply": "sigmoid",
    }
    return pipeline("text-classification", **pipeline_options)
52
+
53
+
54
@st.cache_data
def sector_classification(haystack_doc: pd.DataFrame,
                          threshold: float = 0.5,
                          classifier_model: pipeline = None
                          ) -> DataFrame:
    """
    Multi-label sector classification of the paragraphs marked as TARGET.

    Rows whose 'Target Label' is 'TARGET' are sent to the classifier; every
    label whose score (rounded to 2 decimals) is >= threshold is kept for
    that row. Rows whose 'Target Label' is 'NEGATIVE' are passed through
    with 'Sector Label' set to 'NA'. Rows with any other 'Target Label'
    value are not part of the returned frame.

    Params
    ---------
    haystack_doc: DataFrame with at least the columns 'text' and
        'Target Label'. NOTE: a 'Sector Label' column filled with 'NA' is
        added to this frame in place.
    threshold: minimum (rounded) score for a label to be kept.
    classifier_model: classifier callable; it is called with a list of texts
        and is expected to return, per text, a list of
        {'label': ..., 'score': ...} dicts. If not passed, the model stored
        in st.session_state['sector_classifier'] is used.

    Returns
    ----------
    df: DataFrame holding the NEGATIVE rows followed by the TARGET rows,
        where 'Sector Label' of each TARGET row is the list of labels that
        passed the threshold (in classifier column order; possibly empty).
    """
    logging.info("Working on Sector Identification")
    haystack_doc['Sector Label'] = 'NA'

    # Work on an explicit copy: labels are written to this frame below and
    # writing to a slice of haystack_doc would be a chained assignment.
    target_rows = haystack_doc[haystack_doc['Target Label'] == 'TARGET'].copy()
    negative_rows = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']

    # Nothing to classify: skip the model call and the label assignment.
    if target_rows.empty:
        return pd.concat([negative_rows, target_rows])

    if classifier_model is None:
        classifier_model = st.session_state['sector_classifier']

    predictions = classifier_model(list(target_rows.text))

    # One row per paragraph, one column per label, cell = score.
    scores = DataFrame(
        [{entry['label']: entry['score'] for entry in paragraph}
         for paragraph in predictions]
    )
    # Scores are rounded before the comparison (kept from the original
    # logic); a label missing for a paragraph yields NaN, which compares
    # False and is therefore dropped.
    passed = scores.round(2).astype(float) >= threshold
    categories = list(passed.columns)

    # Keep the label names in column order so the result is deterministic.
    target_rows['Sector Label'] = [
        [label for label, hit in zip(categories, flags) if hit]
        for flags in passed.to_numpy()
    ]
    return pd.concat([negative_rows, target_rows])