import os
from enum import Enum

import pandas as pd
import streamlit as st
from langchain.output_parsers.enum import EnumOutputParser
from langchain_core.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, Field


# Optional LangSmith tracing (disabled)
# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# LANGCHAIN_API_KEY = st.secrets['LANGCHAIN_API_KEY']
# os.environ["LANGCHAIN_PROJECT"] = "UC2e2e"

# Azure OpenAI / LangChain configuration
OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
OPENAI_API_VERSION = "2024-08-01-preview"
OPENAI_MODEL = "gpt-4o-mini"
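# st.secrets reads from .streamlit/secrets.toml. A minimal sketch of the assumed
# layout (placeholder values; only OPENAI_API_KEY is required by this module,
# LANGCHAIN_API_KEY only if the tracing block above is re-enabled):
#
#   OPENAI_API_KEY = "<azure-openai-api-key>"
#   LANGCHAIN_API_KEY = "<langsmith-api-key>"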


# Function to read file contents
def read_file(file):
    """
    Reads a markdown file from the 'assets' directory and returns its content.

    :param file: The file name (without the .md extension) to read from 'assets'.
    :return: The file content as a string, or None if the file cannot be read.
    """
    fp = f"assets/{file}.md"
    try:
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None
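# Example: read_file("GSKGlossary") returns the text of assets/GSKGlossary.md,
# or None if that file is missing or unreadable.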

# Function to generate structured insights
def process_insight(chunk, topic, source):
    """
    Generates up to five MECE insights for a single text chunk and classifies each.

    :param chunk: The text chunk to analyse.
    :param topic: The topic the insights should relate to.
    :param source: "intl" selects the internal prompt templates; anything else selects the external ones.
    :return: A DataFrame with one row per generated insight.
    """
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        SystemMessage = read_file("intl_insight_system_message")
        UserMessage = read_file("intl_insight_user_message")
    else:
        SystemMessage = read_file("ext_insight_system_message")
        UserMessage = read_file("ext_insight_user_message")

    class Insights(BaseModel):
        completed: bool = Field(description="Set to True when you think the required number of insights has been produced")
        insight: str = Field(description="The MECE insight, returned as a plain string")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Insights)
    # A bare string in from_messages is treated as a human message template
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])

    chain = prompt | structured_llm

    new_insights = []
    insights_data = []

    while True:
        # Ask for the next insight, passing the insights gathered so far;
        # counter is the number of insights still wanted (max 5 per chunk)
        counter = 5 - len(new_insights)
        new_insight_response = chain.invoke({
            "chunk": chunk,
            "existing_insights": new_insights,
            "counter": counter,
            "GSKGlossary": GSKGlossary,
            "topic": topic,
        })
        classification = selectClass(new_insight_response.insight)

        # Record the new insight and its classification
        new_insights.append(new_insight_response.insight)
        insights_data.append({
            "classification": classification,
            "insight": new_insight_response.insight,
            "chunk": chunk,
        })

        # Stop once the model signals completion and at least 3 insights exist
        if new_insight_response.completed and len(new_insights) >= 3:
            return pd.DataFrame(insights_data)

        # Hard cap: never request more than 5 insights per chunk
        if len(new_insights) == 5:
            return pd.DataFrame(insights_data)

def selectClass(insight):
    """
    Classifies a single insight as impact, consultation, or awareness.

    :param insight: The insight text to classify.
    :return: The classification as a lowercase string.
    """
    classification_system_message = read_file("classification_system_message")
    classification_user_message = read_file("classification_user_message")

    class InsightClassification(Enum):
        IMPACT = "impact"
        CONSULTATION = "consultation"
        AWARENESS = "awareness"

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    parser = EnumOutputParser(enum=InsightClassification)
    system_message_template = SystemMessagePromptTemplate.from_template(classification_system_message)
    prompt = ChatPromptTemplate.from_messages(
        [system_message_template, classification_user_message]
    ).partial(options=parser.get_format_instructions())

    chain = prompt | llm | parser

    result = chain.invoke({"insight": insight})
    return result.value

def process_chunks(df_chunks, topic, source):
    """
    Runs process_insight over every chunk in a DataFrame and combines the results.

    :param df_chunks: DataFrame with a "ChunkText" column of text chunks.
    :param topic: The topic the insights should relate to.
    :param source: "intl" selects the internal prompt templates; anything else selects the external ones.
    :return: A combined DataFrame of insights from all chunks.
    """
    all_insights = []

    for chunk in df_chunks["ChunkText"]:
        insights_df = process_insight(chunk, topic, source)
        all_insights.append(insights_df)

    return pd.concat(all_insights, ignore_index=True)


def evaluation_llm(chunk, topic, source):
    """
    Asks the LLM whether a single chunk relates to the topic.

    :param chunk: The text chunk to evaluate.
    :param topic: The topic to judge relevance against.
    :param source: "intl" selects the internal prompt templates; anything else selects the external ones.
    :return: An Evaluate object with a boolean decision and a justification.
    """
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        SystemMessage = read_file("intl_eval_system_message")
        UserMessage = read_file("intl_eval_user_message")
    else:
        SystemMessage = read_file("ext_eval_system_message")
        UserMessage = read_file("ext_eval_user_message")

    class Evaluate(BaseModel):
        decision: bool = Field(description="True: the content of the document relates to the topic. False: it does not.")
        justification: str = Field(description="Justify your decision in a logical and structured way.")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Evaluate)

    # Create a chat prompt template combining system and human messages
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])

    chain = prompt | structured_llm

    return chain.invoke({
        "chunk": chunk,
        "topic": topic,
        "GSKGlossary": GSKGlossary
    })

def evaluation_process(df_chunks, topic, source):
    """
    Evaluates every chunk in the DataFrame for relevance to the topic.

    :param df_chunks: DataFrame with a "ChunkText" column of text chunks.
    :param topic: The topic to judge relevance against.
    :param source: "intl" selects the internal prompt templates; anything else selects the external ones.
    :return: The DataFrame with Decision and Justification columns added, the
             consensus value, and the per-value counts.
    """
    decisions = []
    justifications = []

    # Avoid re-inserting columns if they already exist
    df_chunks = df_chunks.drop(columns=["Decision", "Justification"], errors="ignore")

    for _, chunk in df_chunks.iterrows():
        result = evaluation_llm(chunk["ChunkText"], topic, source)
        decisions.append("True" if result.decision else "False")  # Convert bool to string
        justifications.append(result.justification)

    # Add new columns to the DataFrame
    df_chunks.insert(0, "Decision", decisions)
    df_chunks.insert(1, "Justification", justifications)

    # Count all True/False values for consensus and get most frequent value
    consensus_count = df_chunks["Decision"].value_counts()
    consensus_value = consensus_count.idxmax()  # Most frequently occurring value

    return df_chunks, consensus_value, consensus_count
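# Example (hypothetical data): for a DataFrame df with a "ChunkText" column,
#   df_out, consensus, counts = evaluation_process(df, "product labelling", "intl")
# returns df_out with Decision/Justification columns; consensus is the string
# "True" when most chunks were judged relevant.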


def process_compare(insight_df, sopChunk_df, topic):
    """
    Compares every SOP chunk against every insight and flags pairs needing review.

    :param insight_df: DataFrame with an "insight" column.
    :param sopChunk_df: DataFrame with a "ChunkText" column of SOP chunks.
    :param topic: The topic the comparison relates to.
    :return: A DataFrame with one row per (SOP chunk, insight) pair.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("compare_system_message")
    UserMessage = read_file("compare_user_message")

    # Define the structured output model
    class Compare(BaseModel):
        review: bool = Field(description="Indicates whether a review is needed")
        justification: str = Field(description="Justifies why a review is needed")

    # Initialize the LLM
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    # Create the structured output and prompt chain
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Compare)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    compare_data = []

    # Compare every SOP chunk against every insight
    for _, sopChunk_row in sopChunk_df.iterrows():
        sop_chunk_text = sopChunk_row["ChunkText"]  # Extract the ChunkText column
        for _, insight_row in insight_df.iterrows():
            insight_text = insight_row["insight"]  # Extract the insight column

            # Invoke the LLM with the extracted data
            compare_response = chain.invoke({
                "sopChunk": sop_chunk_text,
                "insight": insight_text,
                "topic": topic,
                "GSKGlossary": GSKGlossary
            })

            # Append the response to compare_data
            compare_data.append({
                "ReviewNeeded": compare_response.review,
                "Justification": compare_response.justification,
                "SOP": sop_chunk_text,
                "Insight": insight_text
            })

    # Return the comparisons as a single DataFrame
    return pd.DataFrame(compare_data)

def risk_score_process(compare_df, topic):
    """
    Assigns a risk level to every comparison produced by process_compare.

    :param compare_df: DataFrame produced by process_compare.
    :param topic: The topic the risk assessment relates to.
    :return: A DataFrame with one risk assessment per comparison row.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("risk_scoring_system_message")
    UserMessage = read_file("risk_scoring_user_message")

    # Define the Enum for predefined options
    class RiskClassification(str, Enum):
        HIGH = "high"
        MEDIUM = "medium"
        LOW = "low"

    # Define the Pydantic model for the structured output
    class Risk(BaseModel):
        risk_level: RiskClassification = Field(
            description="The selected risk classification option."
        )
        justification: str = Field(
            description="Justify the reason for choosing this risk classification."
        )
        advice: str = Field(
            description="Suggestions for changes to the standard operating procedure that could mitigate the risk."
        )

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Risk)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])

    chain = prompt | structured_llm 

    risk_data = []

    # Score each comparison row produced by process_compare
    for _, row in compare_df.iterrows():
        # Invoke the LLM with the extracted data
        risk_response = chain.invoke({
            "comparison": row["Justification"],
            "insight": row["Insight"],
            "SOPchunk": row["SOP"],
            "topic": topic,
        })

        # Append the response to risk_data
        risk_data.append({
            "RiskLevel": risk_response.risk_level,
            "Justification": risk_response.justification,
            "advice": risk_response.advice,
            "comparison": row["Justification"],
            "insight": row["Insight"],
            "SOPchunk": row["SOP"],
        })

    # Return the risk assessments as a single DataFrame
    return pd.DataFrame(risk_data)
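

# ---------------------------------------------------------------------------
# End-to-end usage sketch. This is an assumption for illustration, not part of
# the app's Streamlit flow: the "ChunkText" column name and the prompt files
# under assets/ come from the functions above; the DataFrames and topic below
# are placeholders, and running this requires valid Azure OpenAI credentials.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical regulatory chunks and SOP chunks
    reg_chunks = pd.DataFrame({"ChunkText": ["New labelling rules apply from 2025."]})
    sop_chunks = pd.DataFrame({"ChunkText": ["Labels are reviewed annually."]})
    topic = "product labelling"

    # 1. Keep only the chunks the model judges relevant to the topic
    evaluated, consensus, counts = evaluation_process(reg_chunks, topic, "intl")
    relevant = evaluated[evaluated["Decision"] == "True"]

    # 2. Generate classified insights from the relevant chunks
    insights = process_chunks(relevant, topic, "intl")

    # 3. Compare each SOP chunk against each insight
    comparisons = process_compare(insights, sop_chunks, topic)

    # 4. Score the risk of each flagged comparison
    risks = risk_score_process(comparisons, topic)
    print(risks)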