# Streamlit application: GSK document insight extraction, evaluation,
# SOP comparison and risk-scoring pipeline (Azure OpenAI via LangChain).
import streamlit as st | |
import os | |
import pandas as pd | |
# from langchain.chat_models import AzureChatOpenAI | |
from langchain_openai import AzureChatOpenAI | |
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser | |
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate | |
from pydantic import BaseModel, Field, validator | |
from langchain.output_parsers.enum import EnumOutputParser | |
from langchain_core.prompts import PromptTemplate | |
from enum import Enum | |
#os.environ["LANGCHAIN_TRACING_V2"]="true" | |
#os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com" | |
#LANGCHAIN_API_KEY = st.secrets['LANGCHAIN_API_KEY'] | |
#os.environ["LANGCHAIN_PROJECT"]="UC2e2e" | |
# LLM Langchain Definition
# Azure OpenAI connection settings; the API key is injected via Streamlit secrets.
OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"  # Azure OpenAI endpoint
OPENAI_API_VERSION = "2024-08-01-preview"
OPENAI_MODEL = "gpt-4o-mini"  # Azure deployment name, not a raw model id
# Function to read file contents
def read_file(file):
    """
    Read a markdown prompt/template from the local ``assets`` directory.

    :param file: Base name of the file (without the ``.md`` extension).
    :return: The file's content as a string, or ``None`` if the file is
        missing or unreadable (the error is printed, not raised).
    """
    fp = f"assets/{file}.md"
    try:
        # Use a distinct handle name: the original shadowed the 'file' parameter.
        with open(fp, 'r', encoding='utf-8') as fh:
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None
# Function to generate structured insights
def process_insight(chunk, topic,source):
    """
    Extract up to five MECE insights from one text chunk via Azure OpenAI.

    :param chunk: Text chunk to mine for insights.
    :param topic: Topic the insights should relate to.
    :param source: "intl" selects the internal prompt templates; any other
        value selects the external ones.
    :return: DataFrame with columns classification / insight / chunk.
    """
    GSKGlossary = read_file("GSKGlossary")
    # Prompt templates differ for internal vs. external source documents.
    if source== "intl":
        SystemMessage = read_file("intl_insight_system_message")
        UserMessage = read_file("intl_insight_user_message")
    else:
        SystemMessage = read_file("ext_insight_system_message")
        UserMessage = read_file("ext_insight_user_message")

    # Structured-output schema the LLM must fill on every call.
    class Insights(BaseModel):
        completed: bool = Field(description="This field is used to indicate that you think the number of insights has been completed")
        insight: str = Field(description="This field is used to return the MECE insight in string format")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,  # deterministic output
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Insights)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    new_insights = []   # insight strings gathered so far, fed back into the prompt
    insights_data = []  # row dicts for the returned DataFrame
    while True:
        # Remaining insight budget (max 5 per chunk), exposed to the prompt.
        counter = 5 - len(new_insights)
        new_insight_response = chain.invoke({"chunk": chunk, "existing_insights": new_insights, "counter": counter, "GSKGlossary": GSKGlossary, "topic":topic})
        # Classify each insight as impact / consultation / awareness.
        classification = selectClass(new_insight_response.insight)
        new_insights.append(new_insight_response.insight)
        insights_data.append({
            # "completed": new_insight_response.completed,
            "classification": classification,
            "insight": new_insight_response.insight,
            "chunk": chunk
        })
        # Stop once the model reports completion AND we hold at least 3 insights.
        # NOTE(review): an earlier comment said "completed OR >= 3" but the code
        # uses AND — confirm which is intended.
        if new_insight_response.completed and len(new_insights) >= 3:
            return pd.DataFrame(insights_data)
        # Hard cap: never collect more than 5 insights per chunk.
        if len(new_insights) == 5:
            return pd.DataFrame(insights_data)
def selectClass(insight):
    """
    Classify one insight into a fixed label set via the LLM.

    :param insight: Insight text to classify.
    :return: The chosen label as a lowercase string
        ("impact", "consultation" or "awareness").
    """
    classification_system_message = read_file("classification_system_message")
    classification_user_message = read_file("classification_user_message")

    # Closed label set; EnumOutputParser forces the LLM to pick exactly one.
    class InsightClassification(Enum):
        IMPACT = "impact"
        CONSULTATION = "consultation"
        AWARENESS = "awareness"

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,  # deterministic output
    )

    parser = EnumOutputParser(enum=InsightClassification)
    system_message_template = SystemMessagePromptTemplate.from_template(classification_system_message)
    # The {options} placeholder in the prompt is pre-filled with the parser's
    # format instructions so the model knows the allowed labels.
    prompt = ChatPromptTemplate.from_messages([system_message_template, classification_user_message]).partial(options=parser.get_format_instructions())
    chain = prompt | llm | parser
    result = chain.invoke({"insight": insight})
    return result.value
def process_chunks(chunk, topic, source):
    """
    Run ``process_insight`` over every chunk in a dataframe and combine results.

    :param chunk: DataFrame with a "ChunkText" column of text chunks.
    :param topic: Topic passed through to ``process_insight``.
    :param source: Prompt-template selector ("intl" or external), passed through.
    :return: One combined DataFrame of insights from all chunks; an empty
        DataFrame when there are no chunks.
    """
    all_insights = [process_insight(text, topic, source) for text in chunk["ChunkText"]]
    # pd.concat raises ValueError on an empty list — return an empty frame instead.
    if not all_insights:
        return pd.DataFrame()
    return pd.concat(all_insights, ignore_index=True)
def evaluation_llm(chunk, topic , source):
    """
    Ask the LLM whether one chunk's content relates to the given topic.

    :param chunk: Text chunk to evaluate.
    :param topic: Topic to test relevance against.
    :param source: "intl" selects the internal prompt templates; any other
        value selects the external ones.
    :return: An ``Evaluate`` instance with ``decision`` (bool) and
        ``justification`` (str) fields.
    """
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        SystemMessage = read_file("intl_eval_system_message")
        UserMessage = read_file("intl_eval_user_message")
    else:
        SystemMessage = read_file("ext_eval_system_message")
        UserMessage = read_file("ext_eval_user_message")

    # Structured-output schema: a yes/no relevance decision plus justification.
    class Evaluate(BaseModel):
        decision: bool = Field(description="True: The content of the document relates to the topic.False: The content of the document does not relate to the topic.")
        justification: str = Field(description="Please justify your decision in a logical and structured way.")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,  # deterministic output
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Evaluate)
    # Create a chat prompt template combining system and human messages.
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm
    return chain.invoke({
        "chunk": chunk,
        "topic": topic,
        "GSKGlossary": GSKGlossary
    })
def evaluation_process(df_chunks, topic, source):
    """
    Evaluate every chunk against the topic and record per-chunk decisions.

    :param df_chunks: DataFrame with a "ChunkText" column.
    :param topic: Topic the chunks are evaluated against.
    :param source: Prompt-template selector passed to ``evaluation_llm``.
    :return: Tuple of (DataFrame with "Decision"/"Justification" columns
        prepended, most frequent decision value — or None when there were no
        rows — and the decision value counts).
    """
    decisions = []
    justifications = []
    # Drop stale result columns from a previous run; errors="ignore" also
    # handles the case where only one of the two columns exists.
    df_chunks = df_chunks.drop(columns=["Decision", "Justification"], errors="ignore")
    for _, chunk in df_chunks.iterrows():
        result = evaluation_llm(chunk['ChunkText'], topic, source)
        decisions.append("True" if result.decision else "False")  # bool -> display string
        justifications.append(result.justification)
    # Prepend the new result columns.
    df_chunks.insert(0, "Decision", decisions)
    df_chunks.insert(1, "Justification", justifications)
    # Consensus = most frequent decision; idxmax() raises on an empty frame,
    # so report None in that case.
    consensus_count = df_chunks["Decision"].value_counts()
    consensus_value = consensus_count.idxmax() if not consensus_count.empty else None
    return df_chunks, consensus_value, consensus_count
def process_compare(insight_df, sopChunk_df, topic):
    """
    Cross-compare every SOP chunk with every insight and flag needed reviews.

    :param insight_df: DataFrame with an "insight" column.
    :param sopChunk_df: DataFrame with a "ChunkText" column (SOP chunks).
    :param topic: Topic passed into the comparison prompt.
    :return: DataFrame with columns ReviewNeeded / Justification / SOP / Insight,
        one row per (SOP chunk, insight) pair.
    """
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("compare_system_message")
    UserMessage = read_file("compare_user_message")

    # Structured output: does this SOP/insight pair need a review, and why?
    class Compare(BaseModel):
        review: bool = Field(description="This field is used to indicate whether a review is needed")
        justification: str = Field(description="This field is used to justify why a review is needed")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,  # deterministic output
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Compare)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    compare_data = []
    # Cartesian product: each SOP chunk is compared against each insight,
    # so cost grows as len(sopChunk_df) * len(insight_df) LLM calls.
    for _, sopChunk_row in sopChunk_df.iterrows():
        sop_chunk_text = sopChunk_row["ChunkText"]
        for _, insight_row in insight_df.iterrows():
            insight_text = insight_row["insight"]
            compare_response = chain.invoke({
                "sopChunk": sop_chunk_text,
                "insight": insight_text,
                "topic": topic,
                "GSKGlossary": GSKGlossary
            })
            compare_data.append({
                "ReviewNeeded": compare_response.review,
                "Justification": compare_response.justification,
                "SOP": sop_chunk_text,
                "Insight": insight_text
            })
    # Leftover debug print removed; return the collected rows as one DataFrame.
    return pd.DataFrame(compare_data)
def risk_score_process(compare_df, topic):
    """
    Assign a risk level (high/medium/low) to every comparison result row.

    :param compare_df: DataFrame from ``process_compare`` with
        "Justification", "Insight" and "SOP" columns.
    :param topic: Topic passed into the risk-scoring prompt.
    :return: DataFrame with RiskLevel / Justification / advice columns plus
        the originating comparison / insight / SOP chunk.
    """
    # NOTE(review): GSKGlossary is read here but never passed to chain.invoke —
    # confirm whether the risk-scoring prompt needs it.
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("risk_scoring_system_message")
    UserMessage = read_file("risk_scoring_user_message")

    # Closed set of risk levels the model must choose from.
    class RiskClassification(str, Enum):
        HIGH = "high"
        MEDIUM = "medium"
        LOW = "low"

    # Structured output: chosen level, reasoning, and mitigation advice.
    class Risk(BaseModel):
        risk_level: RiskClassification = Field(
            description="The selected classification option."
        )
        justification: str = Field(
            description="Justify the reason for choosing this risk classification."
        )
        advice: str = Field(
            # typo fix: "mitigat" -> "mitigate" in the LLM-facing description
            description="Suggestions for changes that could be made to the standard operating procedure to mitigate the risk."
        )

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,  # deterministic output
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Risk)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    risk_data = []
    # Score each comparison row independently.
    for _, row in compare_df.iterrows():
        risk_response = chain.invoke({
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP'],
            "topic": topic
        })
        risk_data.append({
            "RiskLevel": risk_response.risk_level,
            "Justification": risk_response.justification,
            "advice": risk_response.advice,
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP']
        })
    return pd.DataFrame(risk_data)