import streamlit as st
import os
import pandas as pd
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain.output_parsers.enum import EnumOutputParser
from enum import Enum

# Optional LangSmith tracing (uncomment to enable):
# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGCHAIN_API_KEY"] = st.secrets['LANGCHAIN_API_KEY']
# os.environ["LANGCHAIN_PROJECT"] = "UC2e2e"

# LLM / LangChain configuration
OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
OPENAI_API_VERSION = "2024-08-01-preview"
OPENAI_MODEL = "gpt-4o-mini"


def read_file(file):
    """
    Reads the content of a markdown file from the 'assets' directory.

    :param file: The file name (without extension) to read.
    :return: The content of the file as a string, or None if an error occurs.
    """
    fp = f"assets/{file}.md"
    try:
        with open(fp, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None


def process_insight(chunk, topic, source):
    """
    Generates up to five MECE insights for a single text chunk and classifies each one.

    :param chunk: The text chunk to analyse.
    :param topic: The topic the insights should relate to.
    :param source: "intl" for internal documents, anything else for external.
    :return: A DataFrame with one row per insight (classification, insight, chunk).
    """
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        SystemMessage = read_file("intl_insight_system_message")
        UserMessage = read_file("intl_insight_user_message")
    else:
        SystemMessage = read_file("ext_insight_system_message")
        UserMessage = read_file("ext_insight_user_message")

    class Insights(BaseModel):
        completed: bool = Field(description="Indicates that you think the required number of insights has been completed.")
        insight: str = Field(description="The MECE insight, returned in string format.")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    human_message_template = HumanMessagePromptTemplate.from_template(UserMessage)
    structured_llm = llm.with_structured_output(Insights)
    prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])
    chain = prompt | structured_llm

    new_insights = []
    insights_data = []

    while True:
        # Invoke the LLM with the current chunk and the insights gathered so far.
        counter = 5 - len(new_insights)
        new_insight_response = chain.invoke({
            "chunk": chunk,
            "existing_insights": new_insights,
            "counter": counter,
            "GSKGlossary": GSKGlossary,
            "topic": topic,
        })
        classification = selectClass(new_insight_response.insight)

        # Append the new insight and its classification to the running lists.
        new_insights.append(new_insight_response.insight)
        insights_data.append({
            "classification": classification,
            "insight": new_insight_response.insight,
            "chunk": chunk,
        })

        # Stop once the model reports completion and at least 3 insights exist,
        # or once the hard cap of 5 insights is reached.
        if new_insight_response.completed and len(new_insights) >= 3:
            return pd.DataFrame(insights_data)
        if len(new_insights) == 5:
            return pd.DataFrame(insights_data)


def selectClass(insight):
    """
    Classifies a single insight as impact, consultation, or awareness.

    :param insight: The insight text to classify.
    :return: The chosen classification as a lowercase string.
    """
read_file("classification_system_message") classification_user_message = read_file("classification_user_message") class InsightClassification(Enum): IMPACT = "impact" CONSULTATION = "consultation" AWARENESS = "awareness" llm = AzureChatOpenAI( openai_api_version=OPENAI_API_VERSION, openai_api_key=OPENAI_API_KEY, azure_endpoint=OPENAI_API_BASE, openai_api_type=OPENAI_API_TYPE, deployment_name=OPENAI_MODEL, temperature=0, ) parser = EnumOutputParser(enum=InsightClassification) system_message_template = SystemMessagePromptTemplate.from_template(classification_system_message) # structured_llm = llm.with_structured_output(Insights) prompt = ChatPromptTemplate.from_messages([system_message_template, classification_user_message]).partial(options=parser.get_format_instructions()) chain = prompt | llm | parser result = chain.invoke({"insight": insight}) return result.value def process_chunks(chunk, topic,source): """ Processes chunks from a specific dataframe column, invokes the get_structured function for each chunk, and combines the resulting dataframes into one dataframe. :param df: The dataframe containing chunks. :param temp: Temperature parameter for the LLM. :param SystemMessage: System message template. :param UserMessage: User message template. :param completedMessage: Completion message description. :param insightMessage: Insight message description. :param chunk_column: The name of the column containing text chunks to process. :return: A combined dataframe of insights from all chunks. """ all_insights = [] for chunk in chunk["ChunkText"]: insights_df = process_insight(chunk, topic,source) all_insights.append(insights_df) return pd.concat(all_insights, ignore_index=True) def evaluation_llm(chunk, topic , source): GSKGlossary = read_file("GSKGlossary") if source == "intl": SystemMessage = read_file("intl_eval_system_message") UserMessage = read_file("intl_eval_user_message") else: SystemMessage = read_file("ext_eval_system_message") UserMessage = read_file("ext_eval_user_message") class Evaluate(BaseModel): decision: bool = Field(description="True: The content of the document relates to the topic.False: The content of the document does not relate to the topic.") justification: str = Field(description="Please justify your decision in a logical and structured way.") llm = AzureChatOpenAI( openai_api_version=OPENAI_API_VERSION, openai_api_key=OPENAI_API_KEY, azure_endpoint=OPENAI_API_BASE, openai_api_type=OPENAI_API_TYPE, deployment_name=OPENAI_MODEL, temperature=0, ) system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage) structured_llm = llm.with_structured_output(Evaluate) # Create a chat prompt template combining system and human messages prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage]) chain = prompt | structured_llm return chain.invoke({ "chunk": chunk, "topic": topic, "GSKGlossary": GSKGlossary }) def evaluation_process(df_chunks, topic,source): """ Iterates over chunks in the DataFrame and processes them using `get_structured`. :param df_chunks: DataFrame containing chunks. :param systemMessage: System message for evaluation. :param userMessage: User message template for evaluation. :param temp: Temperature setting for the model. :param decisionMessage: Description for decision field. :param justificationMessage: Description for justification field. :return: Updated DataFrame with decision and justification columns and consensus value. 
""" decisions = [] justifications = [] # Avoid re-inserting columns if they already exist if "Decision" in df_chunks.columns: df_chunks = df_chunks.drop(columns=["Decision", "Justification"]) for _, chunk in df_chunks.iterrows(): result = evaluation_llm(chunk['ChunkText'], topic,source) decisions.append("True" if result.decision else "False") # Convert bool to string justifications.append(result.justification) # Add new columns to the DataFrame df_chunks.insert(0, "Decision", decisions) df_chunks.insert(1, "Justification", justifications) # Count all True/False values for consensus and get most frequent value consensus_count = df_chunks["Decision"].value_counts() consensus_value = consensus_count.idxmax() # Most frequently occurring value return df_chunks, consensus_value, consensus_count def process_compare(insight_df, sopChunk_df, topic): GSKGlossary = read_file("GSKGlossary") SystemMessage = read_file("compare_system_message") UserMessage = read_file("compare_user_message") # Define the structured output model class Compare(BaseModel): review: bool = Field(description="This field is used to indicate whether a review is needed") justification: str = Field(description="This field is used to justify why a review is needed") # Initialize the LLM llm = AzureChatOpenAI( openai_api_version=OPENAI_API_VERSION, openai_api_key=OPENAI_API_KEY, azure_endpoint=OPENAI_API_BASE, openai_api_type=OPENAI_API_TYPE, deployment_name=OPENAI_MODEL, temperature=0, ) # Create the structured output and prompt chain system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage) structured_llm = llm.with_structured_output(Compare) prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage]) chain = prompt | structured_llm compare_data = [] # Iterate over sopChunk_df and insight_df to process "ChunkText" and "insight" for sopChunk_index, sopChunk_row in sopChunk_df.iterrows(): sop_chunk_text = sopChunk_row["ChunkText"] # Extract the ChunkText column for insight_index, insight_row in insight_df.iterrows(): insight_text = insight_row["insight"] # Extract the insight column # Invoke the LLM with the extracted data compare_response = chain.invoke({ "sopChunk": sop_chunk_text, "insight": insight_text, "topic": topic, "GSKGlossary": GSKGlossary }) # Append the response to insights_data compare_data.append({ "ReviewNeeded": compare_response.review, "Justification": compare_response.justification, "SOP": sop_chunk_text, "Insight": insight_text }) # Return the insights as a single DataFrame print(compare_data) return pd.DataFrame(compare_data) def risk_score_process(compare_df, topic): GSKGlossary = read_file("GSKGlossary") SystemMessage = read_file("risk_scoring_system_message") UserMessage = read_file("risk_scoring_user_message") # Define the Enum for predefined options class RiskClassification(str, Enum): HIGH = "high" MEDIUM = "medium" LOW = "low" # Define the Pydantic model for the structured output class Risk(BaseModel): risk_level: RiskClassification = Field( description="The selected classification option." ) justification: str = Field( description="Justify the reason for choosing this risk classification." ) advice: str = Field( description="Suggestions for changes that could be made to the standard operating procedure to mitigat the risk." 
        )

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    human_message_template = HumanMessagePromptTemplate.from_template(UserMessage)
    structured_llm = llm.with_structured_output(Risk)
    prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])
    chain = prompt | structured_llm

    risk_data = []

    # Score each comparison row produced by process_compare.
    for index, row in compare_df.iterrows():
        risk_response = chain.invoke({
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP'],
            "topic": topic,
        })

        risk_data.append({
            "RiskLevel": risk_response.risk_level,
            "Justification": risk_response.justification,
            "advice": risk_response.advice,
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP'],
        })

    # Return the risk scores as a single DataFrame.
    return pd.DataFrame(risk_data)
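

# ---------------------------------------------------------------------------
# Illustrative end-to-end wiring (a minimal sketch, not part of the app).
# It shows how the four stages chain together: topic evaluation, insight
# extraction, SOP comparison, and risk scoring. The sample DataFrames and
# topic below are hypothetical; actually running this requires the prompt
# files under assets/ and valid Azure OpenAI credentials in st.secrets.
if __name__ == "__main__":
    sample_chunks = pd.DataFrame({"ChunkText": ["Example regulatory text chunk."]})
    sample_sop_chunks = pd.DataFrame({"ChunkText": ["Example SOP text chunk."]})
    topic = "data integrity"  # hypothetical topic

    # 1. Decide whether the document relates to the topic at all.
    evaluated_df, consensus, counts = evaluation_process(sample_chunks, topic, "intl")

    if consensus == "True":
        # 2. Extract and classify MECE insights from each chunk.
        insights_df = process_chunks(sample_chunks, topic, "intl")

        # 3. Compare every insight against every SOP chunk.
        compare_df = process_compare(insights_df, sample_sop_chunks, topic)

        # 4. Score the risk of each comparison.
        risk_df = risk_score_process(compare_df, topic)
        print(risk_df)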