File size: 7,787 Bytes
0fdb130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def read_and_split_file(filename, chunk_size=1200, chunk_overlap=200):
    with open(filename, 'r') as f:
        text = f.read()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap,
        length_function = len, separators=[" ", ",", "\n"]
    )

    # st.write(f'Financial report char len: {len(text)}')
    texts = text_splitter.create_documents([text])
    return texts


def get_label_prediction(selected_predictor, texts):
    predicted_labels = []
    replies = []

        
    emdedding_model_name = predictors[selected_predictor]['embedding_model']
    emdedding_model = SentenceTransformer(emdedding_model_name)
    
    texts_str = [text.page_content for text in texts]
    embeddings = emdedding_model.encode(texts_str, show_progress_bar=True).tolist()
    
    # dataset = load_dataset(predictors[selected_predictor]['dataset_name'])    
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform([label.upper() for label in labels])

    input_size = predictors[selected_predictor]['embedding_dim']
    hidden_size = 256
    output_size = len(label_encoder.classes_)
    dropout_rate = 0.5
    batch_size = 8


    model = MLP(input_size, hidden_size, output_size, dropout_rate)
    load_model(model, predictors[selected_predictor]['mlp_model'])
    
    embeddings_tensor = torch.tensor(embeddings)
    
    data = TensorDataset(embeddings_tensor)
    dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
    
    with torch.no_grad():
        model.eval()
        for inputs in dataloader:
            # st.write(inputs[0])
            outputs = model(inputs[0])

            # _, predicted = torch.max(outputs, 1)
            
            probabilities = F.softmax(outputs, dim=1)
            predicted_indices = torch.argmax(probabilities, dim=1).tolist()
            predicted_labels_list = label_encoder.inverse_transform(predicted_indices)
            for pred_label in predicted_labels_list:
                predicted_labels.append(pred_label)
                # st.write(pred_label)
    
    predicted_labels_counter = Counter(predicted_labels)
    predicted_label = predicted_labels_counter.most_common(1)[0][0]
    return predicted_label





if __name__ == '__main__':
    # Comments and ideas to implement:
    # 1. Try sending list of inputs to the Inference API.
    


    from config import (
        labels, headers_inference_api, headers_inference_endpoint,
        # summarization_prompt_template,
        prompt_template,
        # task_explain_for_predictor_model,
        summarizers, predictors, summary_scores_template,
        summarization_system_msg, summarization_user_prompt, prediction_user_prompt, prediction_system_msg,
        # prediction_prompt, 
        chat_prompt, instruction_prompt
    )

    import streamlit as st
    from sys import exit
    from pprint import pprint
    from collections import Counter
    from itertools import zip_longest
    from random import choice
    import requests
    from re import sub
    from rouge import Rouge
    from time import sleep, perf_counter
    import os
    from textwrap import wrap
    from multiprocessing import Pool, freeze_support
    from tqdm import tqdm
    from stqdm import stqdm
    from langchain.document_loaders import TextLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.schema.document import Document
    # from langchain.schema import Document
    from langchain.chat_models import ChatOpenAI
    from langchain.llms import OpenAI
    from langchain.schema import AIMessage, HumanMessage, SystemMessage
    from langchain.prompts import PromptTemplate
    from datasets import Dataset, load_dataset
    from sklearn.preprocessing import LabelEncoder
    from test_models.train_classificator import MLP
    from safetensors.torch import load_model, save_model
    from sentence_transformers import SentenceTransformer
    from torch.utils.data import DataLoader, TensorDataset
    import torch.nn.functional as F
    import torch
    import torch.nn as nn
    import sys
    
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'test_models/')))
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'test_models/financial-roberta')))

    st.set_page_config(
        page_title="Financial advisor",
        page_icon="๐Ÿ’ณ๐Ÿ’ฐ",
        layout="wide",
    )
    # st.session_state.summarized = False


    



    with st.sidebar:
        "# How to use๐Ÿ”"

        
        """
        โœจThis is a holiday version of the web-UI with the magic ๐ŸŒ, allowing you to unwrap
        label predictions for a company based on its financial report text! ๐Ÿ“Šโœจ The prediction
        enchantment is performed using the sophisticated embedding classifier approach. ๐Ÿš€๐Ÿ”ฎ
        """


    center_style = "<h3 style='text-align: center; color: black;'>{} </h3>"
    st.markdown(center_style.format('Load the financial report'), unsafe_allow_html=True)


    upload_types = ['Text input', 'File upload']
    upload_captions = ['Paste the text', 'Upload a text file']
    upload_type = st.radio('Select how to upload the financial report', upload_types,
                        captions=upload_captions)


    match upload_type:
        case 'Text input':
            financial_report_text = st.text_area('Something', label_visibility='collapsed',
                                                placeholder='Financial report as TEXT')
            

        case 'File upload':
            uploaded_files = st.file_uploader("Choose a a text file", type=['.txt', '.docx'],
                            label_visibility='collapsed', accept_multiple_files=True)

            if not bool(uploaded_files):
                st.stop()

            financial_report_text = ''
            for uploaded_file in uploaded_files:
                if uploaded_file.name.endswith("docx"):
                    document = Document(uploaded_file)
                    document.save('./utils/texts/' + uploaded_file.name)
                    document = Document(uploaded_file.name)
                    financial_report_text += "".join([paragraph.text for paragraph in document.paragraphs]) + '\n'
                else:
                    financial_report_text += "".join([line.decode() for line in uploaded_file]) + '\n'

    # with open('./utils/texts/financial_report_text.txt', 'w') as file:
    #     file.write(financial_report_text)

    if st.button('Get label'):
        with st.spinner("Thinking..."):
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=3200, chunk_overlap=200,
                length_function = len, separators=[" ", ",", "\n"]
            )

            # st.write(f'Financial report char len: {len(financial_report_text)}')
            documents = text_splitter.create_documents([financial_report_text])
            # st.write(f'Num chunks: {len(documents)}')
            texts = [document.page_content for document in documents]
            # st.write(f'Each chunk char length: {[len(text) for text in texts]}')

            # predicted_label = get_label_prediction(texts)
            from test_models.create_setfit_model import model
            
            with torch.no_grad():
                model.model_head.eval()
                predicted_labels = model(texts)
                # st.write(predicted_labels)
            
            predicted_labels_counter = Counter(predicted_labels)
            predicted_label = predicted_labels_counter.most_common(1)[0][0]
            
            font_style = 'The predicted label is<span style="font-size: 32px"> **{}**</span>.'
            st.markdown(font_style.format(predicted_label), unsafe_allow_html=True)