import os
from enum import Enum

import pandas as pd
import streamlit as st
from langchain.output_parsers.enum import EnumOutputParser
from langchain_core.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, Field


# Optional LangSmith tracing (disabled)
# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# LANGCHAIN_API_KEY = st.secrets['LANGCHAIN_API_KEY']
# os.environ["LANGCHAIN_PROJECT"] = "UC2e2e"

# Azure OpenAI / LangChain configuration
OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
OPENAI_API_VERSION = "2024-08-01-preview"
OPENAI_MODEL = "gpt-4o-mini"
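# st.secrets reads from .streamlit/secrets.toml. A minimal sketch of the assumed
# layout (placeholder values; only OPENAI_API_KEY is required by this module,
# LANGCHAIN_API_KEY only if the tracing block above is re-enabled):
#
#   OPENAI_API_KEY = "<azure-openai-api-key>"
#   LANGCHAIN_API_KEY = "<langsmith-api-key>"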


# Function to read file contents
def read_file(file):
    """
    Reads a markdown file from the 'assets' directory and returns its content.

    :param file: The file name (without the .md extension) to read from 'assets'.
    :return: The file content as a string, or None if the file cannot be read.
    """
    fp = f"assets/{file}.md"
    try:
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None
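# Example: read_file("GSKGlossary") returns the text of assets/GSKGlossary.md,
# or None if that file is missing or unreadable.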

# Function to generate structured insights
def process_insight(chunk, topic, source):
    """
    Generates up to five MECE insights for a single text chunk and classifies each.

    :param chunk: The text chunk to analyse.
    :param topic: The topic the insights should relate to.
    :param source: "intl" selects the internal prompt templates; anything else selects the external ones.
    :return: A DataFrame with one row per generated insight.
    """
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        SystemMessage = read_file("intl_insight_system_message")
        UserMessage = read_file("intl_insight_user_message")
    else:
        SystemMessage = read_file("ext_insight_system_message")
        UserMessage = read_file("ext_insight_user_message")

    class Insights(BaseModel):
        completed: bool = Field(description="Set to True when you think the required number of insights has been produced")
        insight: str = Field(description="The MECE insight, returned as a plain string")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Insights)
    # A bare string in from_messages is treated as a human message template
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])

    chain = prompt | structured_llm

    new_insights = []
    insights_data = []

    while True:
        # Ask for the next insight, passing the insights gathered so far;
        # counter is the number of insights still wanted (max 5 per chunk)
        counter = 5 - len(new_insights)
        new_insight_response = chain.invoke({
            "chunk": chunk,
            "existing_insights": new_insights,
            "counter": counter,
            "GSKGlossary": GSKGlossary,
            "topic": topic,
        })
        classification = selectClass(new_insight_response.insight)

        # Record the new insight and its classification
        new_insights.append(new_insight_response.insight)
        insights_data.append({
            "classification": classification,
            "insight": new_insight_response.insight,
            "chunk": chunk,
        })

        # Stop once the model signals completion and at least 3 insights exist
        if new_insight_response.completed and len(new_insights) >= 3:
            return pd.DataFrame(insights_data)

        # Hard cap: never request more than 5 insights per chunk
        if len(new_insights) == 5:
            return pd.DataFrame(insights_data)

def selectClass(insight):
    """
    Classifies a single insight as impact, consultation, or awareness.

    :param insight: The insight text to classify.
    :return: The classification as a lowercase string.
    """
    classification_system_message = read_file("classification_system_message")
    classification_user_message = read_file("classification_user_message")

    class InsightClassification(Enum):
        IMPACT = "impact"
        CONSULTATION = "consultation"
        AWARENESS = "awareness"

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    parser = EnumOutputParser(enum=InsightClassification)
    system_message_template = SystemMessagePromptTemplate.from_template(classification_system_message)
    prompt = ChatPromptTemplate.from_messages(
        [system_message_template, classification_user_message]
    ).partial(options=parser.get_format_instructions())

    chain = prompt | llm | parser

    result = chain.invoke({"insight": insight})
    return result.value

def process_chunks(df_chunks, topic, source):
    """
    Runs process_insight over every chunk in a DataFrame and combines the results.

    :param df_chunks: DataFrame with a "ChunkText" column of text chunks.
    :param topic: The topic the insights should relate to.
    :param source: "intl" selects the internal prompt templates; anything else selects the external ones.
    :return: A combined DataFrame of insights from all chunks.
    """
    all_insights = []

    for chunk in df_chunks["ChunkText"]:
        insights_df = process_insight(chunk, topic, source)
        all_insights.append(insights_df)

    return pd.concat(all_insights, ignore_index=True)


def evaluation_llm(chunk, topic, source):
    """
    Asks the LLM whether a single chunk relates to the topic.

    :param chunk: The text chunk to evaluate.
    :param topic: The topic to judge relevance against.
    :param source: "intl" selects the internal prompt templates; anything else selects the external ones.
    :return: An Evaluate object with a boolean decision and a justification.
    """
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        SystemMessage = read_file("intl_eval_system_message")
        UserMessage = read_file("intl_eval_user_message")
    else:
        SystemMessage = read_file("ext_eval_system_message")
        UserMessage = read_file("ext_eval_user_message")

    class Evaluate(BaseModel):
        decision: bool = Field(description="True: the content of the document relates to the topic. False: it does not.")
        justification: str = Field(description="Justify your decision in a logical and structured way.")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Evaluate)

    # Create a chat prompt template combining system and human messages
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])

    chain = prompt | structured_llm

    return chain.invoke({
        "chunk": chunk,
        "topic": topic,
        "GSKGlossary": GSKGlossary
    })

def evaluation_process(df_chunks, topic, source):
    """
    Evaluates every chunk in the DataFrame for relevance to the topic.

    :param df_chunks: DataFrame with a "ChunkText" column of text chunks.
    :param topic: The topic to judge relevance against.
    :param source: "intl" selects the internal prompt templates; anything else selects the external ones.
    :return: The DataFrame with Decision and Justification columns added, the
             consensus value, and the per-value counts.
    """
    decisions = []
    justifications = []

    # Avoid re-inserting columns if they already exist
    df_chunks = df_chunks.drop(columns=["Decision", "Justification"], errors="ignore")

    for _, chunk in df_chunks.iterrows():
        result = evaluation_llm(chunk["ChunkText"], topic, source)
        decisions.append("True" if result.decision else "False")  # Convert bool to string
        justifications.append(result.justification)

    # Add new columns to the DataFrame
    df_chunks.insert(0, "Decision", decisions)
    df_chunks.insert(1, "Justification", justifications)

    # Count all True/False values for consensus and get most frequent value
    consensus_count = df_chunks["Decision"].value_counts()
    consensus_value = consensus_count.idxmax()  # Most frequently occurring value

    return df_chunks, consensus_value, consensus_count
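# Example (hypothetical data): for a DataFrame df with a "ChunkText" column,
#   df_out, consensus, counts = evaluation_process(df, "product labelling", "intl")
# returns df_out with Decision/Justification columns; consensus is the string
# "True" when most chunks were judged relevant.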


def process_compare(insight_df, sopChunk_df, topic):
    """
    Compares every SOP chunk against every insight and flags pairs needing review.

    :param insight_df: DataFrame with an "insight" column.
    :param sopChunk_df: DataFrame with a "ChunkText" column of SOP chunks.
    :param topic: The topic the comparison relates to.
    :return: A DataFrame with one row per (SOP chunk, insight) pair.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("compare_system_message")
    UserMessage = read_file("compare_user_message")

    # Define the structured output model
    class Compare(BaseModel):
        review: bool = Field(description="Indicates whether a review is needed")
        justification: str = Field(description="Justifies why a review is needed")

    # Initialize the LLM
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    # Create the structured output and prompt chain
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Compare)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    compare_data = []

    # Compare every SOP chunk against every insight
    for _, sopChunk_row in sopChunk_df.iterrows():
        sop_chunk_text = sopChunk_row["ChunkText"]  # Extract the ChunkText column
        for _, insight_row in insight_df.iterrows():
            insight_text = insight_row["insight"]  # Extract the insight column

            # Invoke the LLM with the extracted data
            compare_response = chain.invoke({
                "sopChunk": sop_chunk_text,
                "insight": insight_text,
                "topic": topic,
                "GSKGlossary": GSKGlossary
            })

            # Append the response to compare_data
            compare_data.append({
                "ReviewNeeded": compare_response.review,
                "Justification": compare_response.justification,
                "SOP": sop_chunk_text,
                "Insight": insight_text
            })

    # Return the comparisons as a single DataFrame
    return pd.DataFrame(compare_data)

def risk_score_process(compare_df, topic):
    """
    Assigns a risk level to every comparison produced by process_compare.

    :param compare_df: DataFrame produced by process_compare.
    :param topic: The topic the risk assessment relates to.
    :return: A DataFrame with one risk assessment per comparison row.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("risk_scoring_system_message")
    UserMessage = read_file("risk_scoring_user_message")

    # Define the Enum for predefined options
    class RiskClassification(str, Enum):
        HIGH = "high"
        MEDIUM = "medium"
        LOW = "low"

    # Define the Pydantic model for the structured output
    class Risk(BaseModel):
        risk_level: RiskClassification = Field(
            description="The selected risk classification option."
        )
        justification: str = Field(
            description="Justify the reason for choosing this risk classification."
        )
        advice: str = Field(
            description="Suggestions for changes to the standard operating procedure that could mitigate the risk."
        )

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Risk)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])

    chain = prompt | structured_llm 

    risk_data = []

    # Score each comparison row produced by process_compare
    for _, row in compare_df.iterrows():
        # Invoke the LLM with the extracted data
        risk_response = chain.invoke({
            "comparison": row["Justification"],
            "insight": row["Insight"],
            "SOPchunk": row["SOP"],
            "topic": topic,
        })

        # Append the response to risk_data
        risk_data.append({
            "RiskLevel": risk_response.risk_level,
            "Justification": risk_response.justification,
            "advice": risk_response.advice,
            "comparison": row["Justification"],
            "insight": row["Insight"],
            "SOPchunk": row["SOP"],
        })

    # Return the risk assessments as a single DataFrame
    return pd.DataFrame(risk_data)
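

# ---------------------------------------------------------------------------
# End-to-end usage sketch. This is an assumption for illustration, not part of
# the app's Streamlit flow: the "ChunkText" column name and the prompt files
# under assets/ come from the functions above; the DataFrames and topic below
# are placeholders, and running this requires valid Azure OpenAI credentials.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical regulatory chunks and SOP chunks
    reg_chunks = pd.DataFrame({"ChunkText": ["New labelling rules apply from 2025."]})
    sop_chunks = pd.DataFrame({"ChunkText": ["Labels are reviewed annually."]})
    topic = "product labelling"

    # 1. Keep only the chunks the model judges relevant to the topic
    evaluated, consensus, counts = evaluation_process(reg_chunks, topic, "intl")
    relevant = evaluated[evaluated["Decision"] == "True"]

    # 2. Generate classified insights from the relevant chunks
    insights = process_chunks(relevant, topic, "intl")

    # 3. Compare each SOP chunk against each insight
    comparisons = process_compare(insights, sop_chunks, topic)

    # 4. Score the risk of each flagged comparison
    risks = risk_score_process(comparisons, topic)
    print(risks)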