# IntelAnalyser / azure_openai.py
# ashischakraborty: Update azure_openai.py (commit 657d3ba, verified)
import streamlit as st
import os
import pandas as pd
# from langchain.chat_models import AzureChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from pydantic import BaseModel, Field, validator
from langchain.output_parsers.enum import EnumOutputParser
from langchain_core.prompts import PromptTemplate
from enum import Enum
#os.environ["LANGCHAIN_TRACING_V2"]="true"
#os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
#LANGCHAIN_API_KEY = st.secrets['LANGCHAIN_API_KEY']
#os.environ["LANGCHAIN_PROJECT"]="UC2e2e"
# LLM Langchain Definition
# Azure OpenAI connection settings. The API key comes from Streamlit secrets
# so it is never committed to the repository.
OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
OPENAI_API_TYPE = "azure"
# NOTE(review): endpoint, API version and deployment name are hard-coded —
# consider moving them to configuration/secrets as well.
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
OPENAI_API_VERSION = "2024-08-01-preview"
OPENAI_MODEL = "gpt-4o-mini"
# Function to read file contents
def read_file(file):
    """
    Read a markdown asset file and return its content as a string.

    :param file: Base name (without extension) of a file in the 'assets'
        directory; the '.md' suffix is appended automatically.
    :return: The file content as a string, or None if the file is missing
        or cannot be read.
    """
    fp = f"assets/{file}.md"
    try:
        # Explicit encoding so behaviour does not depend on the locale.
        # Use a distinct handle name: the original shadowed the `file` param.
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None
# Function to generate structured insights
def process_insight(chunk, topic, source):
    """
    Generate between 3 and 5 MECE insights for a text chunk and classify each.

    :param chunk: The text chunk to analyse.
    :param topic: Topic the insights should relate to.
    :param source: "intl" selects the internal prompt pair, anything else
        selects the external prompt pair.
    :return: DataFrame with classification, insight and chunk columns.
    """
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        system_msg = read_file("intl_insight_system_message")
        user_msg = read_file("intl_insight_user_message")
    else:
        system_msg = read_file("ext_insight_system_message")
        user_msg = read_file("ext_insight_user_message")

    # Structured output schema for a single LLM call.
    class Insights(BaseModel):
        completed: bool = Field(description="This field is used to indicate that you think the number of insights has been completed")
        insight: str = Field(description="This field is used to return the MECE insight in string format")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(system_msg), user_msg]
    )
    chain = prompt | llm.with_structured_output(Insights)

    collected = []   # insight strings gathered so far (fed back to the LLM)
    rows = []        # result rows for the returned DataFrame
    # Hard cap of 5 insights; may stop earlier once the model reports done.
    while len(collected) < 5:
        response = chain.invoke({
            "chunk": chunk,
            "existing_insights": collected,
            "counter": 5 - len(collected),  # remaining insight budget
            "GSKGlossary": GSKGlossary,
            "topic": topic,
        })
        collected.append(response.insight)
        rows.append({
            "classification": selectClass(response.insight),
            "insight": response.insight,
            "chunk": chunk,
        })
        # Stop early when the model says it is done and we have at least 3.
        if response.completed and len(collected) >= 3:
            break
    return pd.DataFrame(rows)
def selectClass(insight):
    """
    Classify an insight as one of: impact, consultation, awareness.

    :param insight: Insight text to classify.
    :return: The chosen classification as a lowercase string.
    """
    system_msg = read_file("classification_system_message")
    user_msg = read_file("classification_user_message")

    # Closed set of valid classifications; the parser enforces membership.
    class InsightClassification(Enum):
        IMPACT = "impact"
        CONSULTATION = "consultation"
        AWARENESS = "awareness"

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    parser = EnumOutputParser(enum=InsightClassification)
    # The {options} placeholder is pre-filled with the parser's format rules.
    prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(system_msg), user_msg]
    ).partial(options=parser.get_format_instructions())
    result = (prompt | llm | parser).invoke({"insight": insight})
    return result.value
def process_chunks(chunk, topic, source):
    """
    Run insight generation for every text chunk in a dataframe and combine
    the per-chunk results into one dataframe.

    :param chunk: DataFrame with a "ChunkText" column of text chunks.
    :param topic: Topic passed through to the insight prompts.
    :param source: "intl" selects internal prompts, otherwise external ones.
    :return: Combined DataFrame of insights from all chunks; empty DataFrame
        when the input has no chunks.
    """
    all_insights = []
    # Use a distinct loop variable: the original shadowed the dataframe arg.
    for chunk_text in chunk["ChunkText"]:
        all_insights.append(process_insight(chunk_text, topic, source))
    if not all_insights:
        # pd.concat raises ValueError on an empty list; return empty instead.
        return pd.DataFrame()
    return pd.concat(all_insights, ignore_index=True)
def evaluation_llm(chunk, topic, source):
    """
    Decide whether a single chunk's content relates to the topic.

    :param chunk: Text chunk to evaluate.
    :param topic: Topic to evaluate the chunk against.
    :param source: "intl" selects internal prompts, otherwise external ones.
    :return: Structured result with a boolean `decision` and a `justification`.
    """
    GSKGlossary = read_file("GSKGlossary")
    # Prompt files share a naming scheme; pick the prefix once.
    prefix = "intl" if source == "intl" else "ext"
    system_msg = read_file(f"{prefix}_eval_system_message")
    user_msg = read_file(f"{prefix}_eval_user_message")

    # Structured output schema for the evaluation verdict.
    class Evaluate(BaseModel):
        decision: bool = Field(description="True: The content of the document relates to the topic.False: The content of the document does not relate to the topic.")
        justification: str = Field(description="Please justify your decision in a logical and structured way.")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    prompt = ChatPromptTemplate.from_messages(
        [SystemMessagePromptTemplate.from_template(system_msg), user_msg]
    )
    chain = prompt | llm.with_structured_output(Evaluate)
    return chain.invoke({
        "chunk": chunk,
        "topic": topic,
        "GSKGlossary": GSKGlossary
    })
def evaluation_process(df_chunks, topic, source):
    """
    Evaluate every chunk in the DataFrame against the topic and annotate it.

    :param df_chunks: DataFrame with a "ChunkText" column.
    :param topic: Topic the chunks are evaluated against.
    :param source: "intl" selects internal prompts, otherwise external ones.
    :return: Tuple of (annotated DataFrame with Decision/Justification
        columns, consensus value, consensus value counts).
    """
    decisions = []
    justifications = []
    # Avoid re-inserting columns if they already exist. errors="ignore" keeps
    # this safe when only one of the two columns is present (the original
    # raised KeyError in that case).
    if "Decision" in df_chunks.columns:
        df_chunks = df_chunks.drop(columns=["Decision", "Justification"], errors="ignore")
    for _, chunk in df_chunks.iterrows():
        result = evaluation_llm(chunk['ChunkText'], topic, source)
        decisions.append("True" if result.decision else "False")  # bool -> string
        justifications.append(result.justification)
    # Add the evaluation results as the first two columns.
    df_chunks.insert(0, "Decision", decisions)
    df_chunks.insert(1, "Justification", justifications)
    # Majority vote across chunk-level decisions.
    consensus_count = df_chunks["Decision"].value_counts()
    consensus_value = consensus_count.idxmax()  # most frequent value
    return df_chunks, consensus_value, consensus_count
def process_compare(insight_df, sopChunk_df, topic):
    """
    Compare every insight against every SOP chunk and flag pairs needing review.

    :param insight_df: DataFrame with an "insight" column.
    :param sopChunk_df: DataFrame with a "ChunkText" column of SOP chunks.
    :param topic: Topic providing context for the comparison.
    :return: DataFrame with ReviewNeeded, Justification, SOP, Insight columns.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("compare_system_message")
    UserMessage = read_file("compare_user_message")

    # Structured output: whether a review is needed and why.
    class Compare(BaseModel):
        review: bool = Field(description="This field is used to indicate whether a review is needed")
        justification: str = Field(description="This field is used to justify why a review is needed")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Compare)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    compare_data = []
    # Full cross product: each SOP chunk is compared with each insight.
    for _, sopChunk_row in sopChunk_df.iterrows():
        sop_chunk_text = sopChunk_row["ChunkText"]
        for _, insight_row in insight_df.iterrows():
            insight_text = insight_row["insight"]
            compare_response = chain.invoke({
                "sopChunk": sop_chunk_text,
                "insight": insight_text,
                "topic": topic,
                "GSKGlossary": GSKGlossary
            })
            compare_data.append({
                "ReviewNeeded": compare_response.review,
                "Justification": compare_response.justification,
                "SOP": sop_chunk_text,
                "Insight": insight_text
            })
    # Leftover debug print(compare_data) removed.
    return pd.DataFrame(compare_data)
def risk_score_process(compare_df, topic):
    """
    Assign a risk level to every comparison row produced by process_compare.

    :param compare_df: DataFrame with Justification, Insight and SOP columns.
    :param topic: Topic providing context for the risk assessment.
    :return: DataFrame with RiskLevel, Justification, advice and the input
        comparison/insight/SOP-chunk columns.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("risk_scoring_system_message")
    UserMessage = read_file("risk_scoring_user_message")

    # Closed set of risk levels the model may choose from.
    class RiskClassification(str, Enum):
        HIGH = "high"
        MEDIUM = "medium"
        LOW = "low"

    # Structured output schema for the risk assessment.
    class Risk(BaseModel):
        risk_level: RiskClassification = Field(
            description="The selected classification option."
        )
        justification: str = Field(
            description="Justify the reason for choosing this risk classification."
        )
        advice: str = Field(
            # Typo fixed: "mitigat" -> "mitigate" (instruction text the LLM sees).
            description="Suggestions for changes that could be made to the standard operating procedure to mitigate the risk."
        )

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Risk)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    risk_data = []
    for index, row in compare_df.iterrows():
        risk_response = chain.invoke({
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP'],
            "topic": topic
        })
        risk_data.append({
            "RiskLevel": risk_response.risk_level,
            "Justification": risk_response.justification,
            "advice": risk_response.advice,
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP']
        })
    return pd.DataFrame(risk_data)