# IntelAnalyser / azure_openai.py
# ashischakraborty: Update azure_openai.py (commit 657d3ba, verified)
import streamlit as st
import os
import pandas as pd
# from langchain.chat_models import AzureChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from pydantic import BaseModel, Field, validator
from langchain.output_parsers.enum import EnumOutputParser
from langchain_core.prompts import PromptTemplate
from enum import Enum
#os.environ["LANGCHAIN_TRACING_V2"]="true"
#os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
#LANGCHAIN_API_KEY = st.secrets['LANGCHAIN_API_KEY']
#os.environ["LANGCHAIN_PROJECT"]="UC2e2e"
# LLM Langchain Definition
# Azure OpenAI connection settings. The API key comes from Streamlit secrets
# so it is never committed to the repository.
OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
OPENAI_API_TYPE = "azure"
# NOTE(review): endpoint, API version and deployment name are hard-coded —
# consider moving them to configuration/secrets as well.
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
OPENAI_API_VERSION = "2024-08-01-preview"
OPENAI_MODEL = "gpt-4o-mini"
# Function to read file contents
def read_file(file):
    """
    Read a markdown asset file and return its content as a string.

    :param file: Base name (without extension) of a file in the 'assets'
        directory; the '.md' suffix is appended automatically.
    :return: The file content as a string, or None if the file is missing
        or cannot be read.
    """
    fp = f"assets/{file}.md"
    try:
        # Explicit encoding so behaviour does not depend on the locale.
        # Use a distinct handle name: the original shadowed the `file` param.
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None
# Function to generate structured insights
def process_insight(chunk, topic, source):
    """
    Generate between 3 and 5 MECE insights for a text chunk and classify each.

    :param chunk: The text chunk to analyse.
    :param topic: Topic the insights should relate to.
    :param source: "intl" selects the internal prompt pair, anything else
        selects the external prompt pair.
    :return: DataFrame with classification, insight and chunk columns.
    """
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        system_msg = read_file("intl_insight_system_message")
        user_msg = read_file("intl_insight_user_message")
    else:
        system_msg = read_file("ext_insight_system_message")
        user_msg = read_file("ext_insight_user_message")

    # Structured output schema for a single LLM call.
    class Insights(BaseModel):
        completed: bool = Field(description="This field is used to indicate that you think the number of insights has been completed")
        insight: str = Field(description="This field is used to return the MECE insight in string format")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(system_msg), user_msg]
    )
    chain = prompt | llm.with_structured_output(Insights)

    collected = []   # insight strings gathered so far (fed back to the LLM)
    rows = []        # result rows for the returned DataFrame
    # Hard cap of 5 insights; may stop earlier once the model reports done.
    while len(collected) < 5:
        response = chain.invoke({
            "chunk": chunk,
            "existing_insights": collected,
            "counter": 5 - len(collected),  # remaining insight budget
            "GSKGlossary": GSKGlossary,
            "topic": topic,
        })
        collected.append(response.insight)
        rows.append({
            "classification": selectClass(response.insight),
            "insight": response.insight,
            "chunk": chunk,
        })
        # Stop early when the model says it is done and we have at least 3.
        if response.completed and len(collected) >= 3:
            break
    return pd.DataFrame(rows)
def selectClass(insight):
    """
    Classify an insight as one of: impact, consultation, awareness.

    :param insight: Insight text to classify.
    :return: The chosen classification as a lowercase string.
    """
    system_msg = read_file("classification_system_message")
    user_msg = read_file("classification_user_message")

    # Closed set of valid classifications; the parser enforces membership.
    class InsightClassification(Enum):
        IMPACT = "impact"
        CONSULTATION = "consultation"
        AWARENESS = "awareness"

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    parser = EnumOutputParser(enum=InsightClassification)
    # The {options} placeholder is pre-filled with the parser's format rules.
    prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(system_msg), user_msg]
    ).partial(options=parser.get_format_instructions())
    result = (prompt | llm | parser).invoke({"insight": insight})
    return result.value
def process_chunks(chunk, topic, source):
    """
    Run insight generation for every text chunk in a dataframe and combine
    the per-chunk results into one dataframe.

    :param chunk: DataFrame with a "ChunkText" column of text chunks.
    :param topic: Topic passed through to the insight prompts.
    :param source: "intl" selects internal prompts, otherwise external ones.
    :return: Combined DataFrame of insights from all chunks; empty DataFrame
        when the input has no chunks.
    """
    all_insights = []
    # Use a distinct loop variable: the original shadowed the dataframe arg.
    for chunk_text in chunk["ChunkText"]:
        all_insights.append(process_insight(chunk_text, topic, source))
    if not all_insights:
        # pd.concat raises ValueError on an empty list; return empty instead.
        return pd.DataFrame()
    return pd.concat(all_insights, ignore_index=True)
def evaluation_llm(chunk, topic, source):
    """
    Decide whether a single chunk's content relates to the topic.

    :param chunk: Text chunk to evaluate.
    :param topic: Topic to evaluate the chunk against.
    :param source: "intl" selects internal prompts, otherwise external ones.
    :return: Structured result with a boolean `decision` and a `justification`.
    """
    GSKGlossary = read_file("GSKGlossary")
    # Prompt files share a naming scheme; pick the prefix once.
    prefix = "intl" if source == "intl" else "ext"
    system_msg = read_file(f"{prefix}_eval_system_message")
    user_msg = read_file(f"{prefix}_eval_user_message")

    # Structured output schema for the evaluation verdict.
    class Evaluate(BaseModel):
        decision: bool = Field(description="True: The content of the document relates to the topic.False: The content of the document does not relate to the topic.")
        justification: str = Field(description="Please justify your decision in a logical and structured way.")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(system_msg), user_msg]
    )
    chain = prompt | llm.with_structured_output(Evaluate)
    return chain.invoke({
        "chunk": chunk,
        "topic": topic,
        "GSKGlossary": GSKGlossary
    })
def evaluation_process(df_chunks, topic, source):
    """
    Evaluate every chunk in the DataFrame against the topic and annotate it.

    :param df_chunks: DataFrame with a "ChunkText" column.
    :param topic: Topic the chunks are evaluated against.
    :param source: "intl" selects internal prompts, otherwise external ones.
    :return: Tuple of (annotated DataFrame with Decision/Justification
        columns, consensus value, consensus value counts).
    """
    decisions = []
    justifications = []
    # Avoid re-inserting columns if they already exist. errors="ignore" keeps
    # this safe when only one of the two columns is present (the original
    # raised KeyError in that case).
    if "Decision" in df_chunks.columns:
        df_chunks = df_chunks.drop(columns=["Decision", "Justification"], errors="ignore")
    for _, chunk in df_chunks.iterrows():
        result = evaluation_llm(chunk['ChunkText'], topic, source)
        decisions.append("True" if result.decision else "False")  # bool -> string
        justifications.append(result.justification)
    # Add the evaluation results as the first two columns.
    df_chunks.insert(0, "Decision", decisions)
    df_chunks.insert(1, "Justification", justifications)
    # Majority vote across chunk-level decisions.
    consensus_count = df_chunks["Decision"].value_counts()
    consensus_value = consensus_count.idxmax()  # most frequent value
    return df_chunks, consensus_value, consensus_count
def process_compare(insight_df, sopChunk_df, topic):
    """
    Compare every insight against every SOP chunk and flag pairs needing review.

    :param insight_df: DataFrame with an "insight" column.
    :param sopChunk_df: DataFrame with a "ChunkText" column of SOP chunks.
    :param topic: Topic providing context for the comparison.
    :return: DataFrame with ReviewNeeded, Justification, SOP, Insight columns.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("compare_system_message")
    UserMessage = read_file("compare_user_message")

    # Structured output: whether a review is needed and why.
    class Compare(BaseModel):
        review: bool = Field(description="This field is used to indicate whether a review is needed")
        justification: str = Field(description="This field is used to justify why a review is needed")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Compare)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    compare_data = []
    # Full cross product: each SOP chunk is compared with each insight.
    for _, sopChunk_row in sopChunk_df.iterrows():
        sop_chunk_text = sopChunk_row["ChunkText"]
        for _, insight_row in insight_df.iterrows():
            insight_text = insight_row["insight"]
            compare_response = chain.invoke({
                "sopChunk": sop_chunk_text,
                "insight": insight_text,
                "topic": topic,
                "GSKGlossary": GSKGlossary
            })
            compare_data.append({
                "ReviewNeeded": compare_response.review,
                "Justification": compare_response.justification,
                "SOP": sop_chunk_text,
                "Insight": insight_text
            })
    # Leftover debug print(compare_data) removed.
    return pd.DataFrame(compare_data)
def risk_score_process(compare_df, topic):
    """
    Assign a risk level to every comparison row produced by process_compare.

    :param compare_df: DataFrame with Justification, Insight and SOP columns.
    :param topic: Topic providing context for the risk assessment.
    :return: DataFrame with RiskLevel, Justification, advice and the input
        comparison/insight/SOP-chunk columns.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("risk_scoring_system_message")
    UserMessage = read_file("risk_scoring_user_message")

    # Closed set of risk levels the model may choose from.
    class RiskClassification(str, Enum):
        HIGH = "high"
        MEDIUM = "medium"
        LOW = "low"

    # Structured output schema for the risk assessment.
    class Risk(BaseModel):
        risk_level: RiskClassification = Field(
            description="The selected classification option."
        )
        justification: str = Field(
            description="Justify the reason for choosing this risk classification."
        )
        advice: str = Field(
            # Typo fixed: "mitigat" -> "mitigate" (instruction text the LLM sees).
            description="Suggestions for changes that could be made to the standard operating procedure to mitigate the risk."
        )

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Risk)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    risk_data = []
    for index, row in compare_df.iterrows():
        risk_response = chain.invoke({
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP'],
            "topic": topic
        })
        risk_data.append({
            "RiskLevel": risk_response.risk_level,
            "Justification": risk_response.justification,
            "advice": risk_response.advice,
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP']
        })
    return pd.DataFrame(risk_data)