ashischakraborty committed
Commit 3daab2e · verified · 1 Parent(s): 927b503

UC#3 first upload

Files changed (3)
  1. app.py +209 -0
  2. azure_openai.py +349 -0
  3. helper_functions.py +28 -0
app.py ADDED
@@ -0,0 +1,209 @@
+ import streamlit as st
+ import pandas as pd
+ import uuid
+ import tiktoken
+ from helper_functions import extract_text_from_pdf
+ from azure_openai import evaluation_process, process_compare, risk_score_process, process_chunks
+
+
+ def count_tokens_with_tiktoken(text):
+     """
+     Counts the number of tokens in a given text using tiktoken.
+     :param text: Input text.
+     :return: Tuple of (token count, encoded tokens).
+     """
+     tokenizer = tiktoken.get_encoding("cl100k_base")
+     tokens = tokenizer.encode(text)
+     return len(tokens), tokens
+
+
+ def split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap):
+     """
+     Splits text into chunks based on a specified chunk size in tokens and overlap using tiktoken.
+     Tracks start and end page numbers for each chunk.
+     :param text: Combined text of the document.
+     :param page_texts: List of tuples [(page_number, page_text), ...].
+     :param chunk_size: Maximum size of each chunk in tokens.
+     :param overlap: Number of overlapping tokens between consecutive chunks.
+     :return: List of dictionaries representing chunks with start and end pages.
+     """
+     _, tokens = count_tokens_with_tiktoken(text)
+     chunks = []
+
+     # Map each token position to the page it came from
+     page_token_map = []
+     tokenizer = tiktoken.get_encoding("cl100k_base")
+     for page_number, page_text in page_texts:
+         page_tokens = tokenizer.encode(page_text)
+         page_token_map.extend([page_number] * len(page_tokens))
+
+     # Walk the token stream with a stride of (chunk_size - overlap)
+     for start in range(0, len(tokens), chunk_size - overlap):
+         end = min(start + chunk_size, len(tokens))
+         chunk_tokens = tokens[start:end]
+         chunk_text = tokenizer.decode(chunk_tokens)
+
+         # Determine start and end pages
+         start_page = page_token_map[start] if start < len(page_token_map) else None
+         end_page = page_token_map[end - 1] if end - 1 < len(page_token_map) else page_texts[-1][0]
+
+         chunks.append({
+             "ChunkText": chunk_text,
+             "TokenCount": len(chunk_tokens),
+             "StartPage": start_page,
+             "EndPage": end_page,
+             "ChunkID": str(uuid.uuid4())
+         })
+
+     return chunks
+
+
+ def split_dataframe_with_combined_text_and_pages(df, chunk_size, overlap):
+     """
+     Splits the combined text of a DataFrame into chunks using tiktoken.
+     Each chunk will include start and end page numbers.
+     :param df: DataFrame with columns ['Title', 'Text', 'PageTexts'].
+     :param chunk_size: The maximum size of each chunk in tokens.
+     :param overlap: The number of overlapping tokens between consecutive chunks.
+     :return: DataFrame with columns ['ChunkText', 'TokenCount', 'StartPage', 'EndPage', 'ChunkID'].
+     """
+     chunks = []
+
+     for _, row in df.iterrows():
+         text = row['Text']
+         page_texts = row['PageTexts']
+         split_chunks = split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap)
+         chunks.extend(split_chunks)
+
+     return pd.DataFrame(chunks)
+
+
+ def main():
+     st.set_page_config(page_title="RegIntel Risk Analyser", page_icon=":vertical_traffic_light:")
+     st.title("External RegIntel Risk Analyser :vertical_traffic_light:")
+     topic = st.selectbox("Please choose a focus for the system", ("Labelling",
+                          "Institutional Review Board/Independent Ethics Committee",
+                          "Investigator", "Sponsor",
+                          "Clinical Trial Protocol and protocol amendments",
+                          "Investigator's Brochure", "Conduct of Clinical Trial",
+                          "Monitoring", "Auditing",
+                          "Data handling and record keeping",
+                          "clinical trial reports",
+                          "Responsibilities of the Sponsor and Investigator",
+                          "Sponsor Inspection Preparation"))
+     uploaded_extintl_file_insight = st.file_uploader("Upload an External Reg Intel", type="pdf")
+     uploaded_interintel_file_insight = st.file_uploader("Upload an Internal Reg Intel", type="pdf")
+     if uploaded_extintl_file_insight is not None and uploaded_interintel_file_insight is not None:
+         uploaded_file_SOP = st.file_uploader("Upload an SOP file", type="pdf")
+         if uploaded_file_SOP is not None:
+             # Extract the external intel document
+             with st.spinner("Processing External Reg Intel"):
+                 ext_intl_text_insight, ext_intl_page_texts_insight = extract_text_from_pdf(uploaded_extintl_file_insight)
+                 token_count_insight, _ = count_tokens_with_tiktoken(ext_intl_text_insight)
+                 st.sidebar.success("External Reg Intel file successfully processed")
+                 st.write("Token Count")
+                 st.write(f"The PDF contains **{token_count_insight}** tokens.")
+
+             # Extract the internal intel document
+             with st.spinner("Processing Internal Reg Intel"):
+                 int_intl_text_insight, int_intl_page_texts_insight = extract_text_from_pdf(uploaded_interintel_file_insight)
+                 token_count_insight, _ = count_tokens_with_tiktoken(int_intl_text_insight)
+                 st.sidebar.success("Internal Reg Intel file successfully processed")
+                 st.write("Token Count")
+                 st.write(f"The PDF contains **{token_count_insight}** tokens.")
+
+             # Extract the SOP document
+             with st.spinner("Processing the SOP Text..."):
+                 text_SOP, page_texts_SOP = extract_text_from_pdf(uploaded_file_SOP)
+                 token_count_SOP, _ = count_tokens_with_tiktoken(text_SOP)
+                 st.sidebar.success("SOP file successfully processed")
+                 st.write("Token Count")
+                 st.write(f"The PDF contains **{token_count_SOP}** tokens.")
+
+             # Split the external intel document into chunks
+             with st.spinner("Processing the External Reg Intel Document..."):
+                 df_ei_input_insight = pd.DataFrame([{"Title": uploaded_extintl_file_insight.name, "Text": ext_intl_text_insight, "PageTexts": ext_intl_page_texts_insight}])
+                 df_ei_insight_chunks = split_dataframe_with_combined_text_and_pages(df_ei_input_insight, 10000, 1000)
+                 st.write("Processed External Reg Intel")
+                 st.sidebar.success("Processed External Reg Intel")
+                 st.write(df_ei_insight_chunks)
+
+             # Split the internal intel document into chunks
+             with st.spinner("Processing the Internal Reg Intel Document..."):
+                 df_ii_input_insight = pd.DataFrame([{"Title": uploaded_interintel_file_insight.name, "Text": int_intl_text_insight, "PageTexts": int_intl_page_texts_insight}])
+                 df_ii_insight_chunks = split_dataframe_with_combined_text_and_pages(df_ii_input_insight, 10000, 1000)
+                 st.write("Processed Internal Reg Intel")
+                 st.sidebar.success("Processed Internal Reg Intel")
+                 st.write(df_ii_insight_chunks)
+
+             # Split the SOP into chunks
+             with st.spinner("Processing the SOP Document..."):
+                 df_input_SOP = pd.DataFrame([{"Title": uploaded_file_SOP.name, "Text": text_SOP, "PageTexts": page_texts_SOP}])
+                 df_sop_chunks = split_dataframe_with_combined_text_and_pages(df_input_SOP, 10000, 1000)
+                 st.write("Processed SOP")
+                 st.sidebar.success("Processed SOP")
+                 st.write(df_sop_chunks)
+
+             # Evaluate whether each document is relevant to the chosen topic
+             with st.spinner("Evaluating document"):
+                 df_ei_eval, ei_con, ei_score = evaluation_process(df_ei_insight_chunks, topic, "ext")
+                 ei_score["source"] = "external intel"
+                 df_ei_eval["source"] = "external intel"
+                 df_ii_eval, ii_con, ii_score = evaluation_process(df_ii_insight_chunks, topic, "intl")
+                 ii_score["source"] = "internal intel"
+                 df_ii_eval["source"] = "internal intel"
+                 score = pd.concat([ei_score, ii_score])
+                 st.write("External Intelligence Evaluation")
+                 st.sidebar.success(f"Evaluation Consensus: {ei_con}")
+                 st.write(f"Evaluation Consensus: {ei_con}")
+                 st.write("Evaluation Scores:")
+                 st.write(score)
+                 if ei_con == "False" and ii_con == "False":
+                     st.sidebar.error("Document Not Relevant To Topic")
+                     st.write("Document Not Relevant To Topic")
+                     st.write("Exiting RegIntel Analysis")
+                     return
+
+             # Generate insights from both intel documents
+             with st.spinner("Creating insights"):
+                 df_ei_insights = process_chunks(df_ei_insight_chunks, topic, "ext")
+                 df_ii_insights = process_chunks(df_ii_insight_chunks, topic, "intl")
+                 df_ei_insights["source"] = "external intel"
+                 df_ii_insights["source"] = "internal intel"
+                 df_insights = pd.concat([df_ei_insights, df_ii_insights])
+                 st.subheader("External Intelligence Insights")
+                 st.sidebar.success("External Intelligence Insights Created")
+                 st.write(df_insights)
+                 # Only insights classified as "impact" are carried forward to the SOP comparison
+                 filtered_insights_on_impact = df_insights[df_insights['classification'] == 'impact']
+
+                 if filtered_insights_on_impact.empty:
+                     st.write("No impact insights")
+                     st.sidebar.error("No impact insights")
+                     return
+
+             # Compare impact-classified insights to the SOP
+             with st.spinner("Comparing Impact Classified Insights To SOP"):
+                 df_compare = process_compare(filtered_insights_on_impact, df_sop_chunks, topic)
+                 st.subheader("Comparison of Insights to SOPs")
+                 st.sidebar.success("Comparison of External Intel to SOPs Complete")
+                 st.write(df_compare)
+                 filtered_comparisons_df = df_compare[df_compare['ReviewNeeded'] == True]
+                 if filtered_comparisons_df.empty:
+                     st.write("No reviews needed for this SOP")
+                     st.sidebar.error("No reviews needed for this SOP")
+                     return
+
+             # Risk-score the comparisons flagged for review
+             with st.spinner("Risk Assessing Insights To SOP"):
+                 df_risks = risk_score_process(filtered_comparisons_df, topic)
+                 st.subheader("Risk Score of Insights to SOPs")
+                 st.sidebar.success("Risk Score of Insights to SOPs Completed")
+                 st.write(df_risks)
+
+
+ if __name__ == "__main__":
+     main()
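A note on the chunking arithmetic above: the splitter advances through the token stream with a stride of chunk_size - overlap, so consecutive chunks share exactly `overlap` tokens. A minimal standalone sketch of the same loop, with toy sizes and no Streamlit required (app.py itself passes 10000 and 1000):

import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")
tokens = tokenizer.encode("the quick brown fox jumps over the lazy dog " * 20)

chunk_size, overlap = 40, 10  # toy values; the app uses 10000 and 1000
for start in range(0, len(tokens), chunk_size - overlap):
    end = min(start + chunk_size, len(tokens))
    # consecutive chunks share the last `overlap` tokens of the previous chunk
    print(start, end, len(tokens[start:end]))

With this stride of 30, chunk starts fall at token 0, 30, 60, and so on, and every chunk except the last is 40 tokens long.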
azure_openai.py ADDED
@@ -0,0 +1,349 @@
+ import streamlit as st
+ import os
+ import pandas as pd
+ from langchain_openai import AzureChatOpenAI
+ from langchain_core.prompts.chat import SystemMessagePromptTemplate, ChatPromptTemplate
+ from langchain.output_parsers.enum import EnumOutputParser
+ from pydantic import BaseModel, Field
+ from enum import Enum
+
+
+ # LangSmith tracing configuration; the API key must be exported to the environment
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
+ os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
+ os.environ["LANGCHAIN_API_KEY"] = st.secrets['LANGCHAIN_API_KEY']
+ os.environ["LANGCHAIN_PROJECT"] = "UC2e2e"
+
+ # LLM LangChain definition
+ OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
+ OPENAI_API_TYPE = "azure"
+ OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
+ OPENAI_API_VERSION = "2024-08-01-preview"
+ OPENAI_MODEL = "gpt-4o-mini"
+
+
+ # Function to read file contents
+ def read_file(file):
+     """
+     Reads the content of a text file and returns it as a string.
+     :param file: The file name to read from the 'assets' directory.
+     :return: The content of the file as a string or None if an error occurs.
+     """
+     fp = f"assets/{file}.md"
+     try:
+         with open(fp, 'r', encoding='utf-8') as f:
+             return f.read()
+     except FileNotFoundError:
+         print(f"The file at {fp} was not found.")
+     except IOError:
+         print(f"An error occurred while reading the file at {fp}.")
+     return None
+
+
+ # Function to generate structured insights
+ def process_insight(chunk, topic, source):
+     GSKGlossary = read_file("GSKGlossary")
+     if source == "intl":
+         SystemMessage = read_file("intl_insight_system_message")
+         UserMessage = read_file("intl_insight_user_message")
+     else:
+         SystemMessage = read_file("ext_insight_system_message")
+         UserMessage = read_file("ext_insight_user_message")
+
+     class Insights(BaseModel):
+         completed: bool = Field(description="This field is used to indicate that you think the number of insights has been completed")
+         insight: str = Field(description="This field is used to return the MECE insight in string format")
+
+     llm = AzureChatOpenAI(
+         openai_api_version=OPENAI_API_VERSION,
+         openai_api_key=OPENAI_API_KEY,
+         azure_endpoint=OPENAI_API_BASE,
+         openai_api_type=OPENAI_API_TYPE,
+         deployment_name=OPENAI_MODEL,
+         temperature=0,
+     )
+
+     system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
+     structured_llm = llm.with_structured_output(Insights)
+     prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
+     chain = prompt | structured_llm
+
+     new_insights = []
+     insights_data = []
+
+     while True:
+         # Invoke the LLM with the current chunk and the insights gathered so far
+         counter = 5 - len(new_insights)
+         new_insight_response = chain.invoke({"chunk": chunk, "existing_insights": new_insights, "counter": counter, "GSKGlossary": GSKGlossary, "topic": topic})
+         classification = selectClass(new_insight_response.insight)
+         # Append the new insight to the list
+         new_insights.append(new_insight_response.insight)
+         insights_data.append({
+             "classification": classification,
+             "insight": new_insight_response.insight,
+             "chunk": chunk
+         })
+
+         # Stop once the model reports completion and at least 3 insights exist
+         if new_insight_response.completed and len(new_insights) >= 3:
+             return pd.DataFrame(insights_data)
+
+         # Hard cap: never collect more than 5 insights per chunk
+         if len(new_insights) == 5:
+             return pd.DataFrame(insights_data)
+
+
+ def selectClass(insight):
+     classification_system_message = read_file("classification_system_message")
+     classification_user_message = read_file("classification_user_message")
+
+     class InsightClassification(Enum):
+         IMPACT = "impact"
+         CONSULTATION = "consultation"
+         AWARENESS = "awareness"
+
+     llm = AzureChatOpenAI(
+         openai_api_version=OPENAI_API_VERSION,
+         openai_api_key=OPENAI_API_KEY,
+         azure_endpoint=OPENAI_API_BASE,
+         openai_api_type=OPENAI_API_TYPE,
+         deployment_name=OPENAI_MODEL,
+         temperature=0,
+     )
+     parser = EnumOutputParser(enum=InsightClassification)
+     system_message_template = SystemMessagePromptTemplate.from_template(classification_system_message)
+
+     prompt = ChatPromptTemplate.from_messages([system_message_template, classification_user_message]).partial(options=parser.get_format_instructions())
+     chain = prompt | llm | parser
+
+     result = chain.invoke({"insight": insight})
+     return result.value
+
+
+ def process_chunks(df, topic, source):
+     """
+     Processes each chunk in the 'ChunkText' column, invokes process_insight for it,
+     and combines the resulting dataframes into one dataframe.
+     :param df: The dataframe containing chunks.
+     :param topic: The focus topic selected in the UI.
+     :param source: "intl" for internal intel, anything else for external.
+     :return: A combined dataframe of insights from all chunks.
+     """
+     all_insights = []
+
+     for chunk in df["ChunkText"]:
+         insights_df = process_insight(chunk, topic, source)
+         all_insights.append(insights_df)
+
+     return pd.concat(all_insights, ignore_index=True)
+
+
+ def evaluation_llm(chunk, topic, source):
+     GSKGlossary = read_file("GSKGlossary")
+     if source == "intl":
+         SystemMessage = read_file("intl_eval_system_message")
+         UserMessage = read_file("intl_eval_user_message")
+     else:
+         SystemMessage = read_file("ext_eval_system_message")
+         UserMessage = read_file("ext_eval_user_message")
+
+     class Evaluate(BaseModel):
+         decision: bool = Field(description="True: The content of the document relates to the topic. False: The content of the document does not relate to the topic.")
+         justification: str = Field(description="Please justify your decision in a logical and structured way.")
+
+     llm = AzureChatOpenAI(
+         openai_api_version=OPENAI_API_VERSION,
+         openai_api_key=OPENAI_API_KEY,
+         azure_endpoint=OPENAI_API_BASE,
+         openai_api_type=OPENAI_API_TYPE,
+         deployment_name=OPENAI_MODEL,
+         temperature=0,
+     )
+
+     system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
+     structured_llm = llm.with_structured_output(Evaluate)
+
+     # Create a chat prompt template combining system and human messages
+     prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
+     chain = prompt | structured_llm
+
+     return chain.invoke({
+         "chunk": chunk,
+         "topic": topic,
+         "GSKGlossary": GSKGlossary
+     })
+
+
+ def evaluation_process(df_chunks, topic, source):
+     """
+     Iterates over chunks in the DataFrame and evaluates each one with `evaluation_llm`.
+
+     :param df_chunks: DataFrame containing chunks.
+     :param topic: The focus topic selected in the UI.
+     :param source: "intl" for internal intel, anything else for external.
+     :return: Updated DataFrame with decision and justification columns, the consensus value, and the decision counts.
+     """
+     decisions = []
+     justifications = []
+
+     # Avoid re-inserting columns if they already exist
+     if "Decision" in df_chunks.columns:
+         df_chunks = df_chunks.drop(columns=["Decision", "Justification"])
+
+     for _, chunk in df_chunks.iterrows():
+         result = evaluation_llm(chunk['ChunkText'], topic, source)
+         decisions.append("True" if result.decision else "False")  # Convert bool to string
+         justifications.append(result.justification)
+
+     # Add new columns to the DataFrame
+     df_chunks.insert(0, "Decision", decisions)
+     df_chunks.insert(1, "Justification", justifications)
+
+     # Count True/False decisions; the most frequent value is the consensus
+     consensus_count = df_chunks["Decision"].value_counts()
+     consensus_value = consensus_count.idxmax()
+
+     return df_chunks, consensus_value, consensus_count
+
+
+ def process_compare(insight_df, sopChunk_df, topic):
+     GSKGlossary = read_file("GSKGlossary")
+     SystemMessage = read_file("compare_system_message")
+     UserMessage = read_file("compare_user_message")
+
+     # Define the structured output model
+     class Compare(BaseModel):
+         review: bool = Field(description="This field is used to indicate whether a review is needed")
+         justification: str = Field(description="This field is used to justify why a review is needed")
+
+     # Initialize the LLM
+     llm = AzureChatOpenAI(
+         openai_api_version=OPENAI_API_VERSION,
+         openai_api_key=OPENAI_API_KEY,
+         azure_endpoint=OPENAI_API_BASE,
+         openai_api_type=OPENAI_API_TYPE,
+         deployment_name=OPENAI_MODEL,
+         temperature=0,
+     )
+
+     # Create the structured output and prompt chain
+     system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
+     structured_llm = llm.with_structured_output(Compare)
+     prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
+     chain = prompt | structured_llm
+
+     compare_data = []
+
+     # Compare every SOP chunk against every impact insight
+     for _, sopChunk_row in sopChunk_df.iterrows():
+         sop_chunk_text = sopChunk_row["ChunkText"]  # Extract the ChunkText column
+         for _, insight_row in insight_df.iterrows():
+             insight_text = insight_row["insight"]  # Extract the insight column
+
+             # Invoke the LLM with the extracted data
+             compare_response = chain.invoke({
+                 "sopChunk": sop_chunk_text,
+                 "insight": insight_text,
+                 "topic": topic,
+                 "GSKGlossary": GSKGlossary
+             })
+
+             # Append the response to compare_data
+             compare_data.append({
+                 "ReviewNeeded": compare_response.review,
+                 "Justification": compare_response.justification,
+                 "SOP": sop_chunk_text,
+                 "Insight": insight_text
+             })
+
+     # Return the comparisons as a single DataFrame
+     return pd.DataFrame(compare_data)
+
+
+ def risk_score_process(compare_df, topic):
+     GSKGlossary = read_file("GSKGlossary")
+     SystemMessage = read_file("risk_scoring_system_message")
+     UserMessage = read_file("risk_scoring_user_message")
+
+     # Define the Enum for predefined options
+     class RiskClassification(str, Enum):
+         HIGH = "high"
+         MEDIUM = "medium"
+         LOW = "low"
+
+     # Define the Pydantic model for the structured output
+     class Risk(BaseModel):
+         risk_level: RiskClassification = Field(
+             description="The selected classification option."
+         )
+         justification: str = Field(
+             description="Justify the reason for choosing this risk classification."
+         )
+         advice: str = Field(
+             description="Suggestions for changes that could be made to the standard operating procedure to mitigate the risk."
+         )
+
+     llm = AzureChatOpenAI(
+         openai_api_version=OPENAI_API_VERSION,
+         openai_api_key=OPENAI_API_KEY,
+         azure_endpoint=OPENAI_API_BASE,
+         openai_api_type=OPENAI_API_TYPE,
+         deployment_name=OPENAI_MODEL,
+         temperature=0,
+     )
+
+     system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
+     structured_llm = llm.with_structured_output(Risk)
+     prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
+     chain = prompt | structured_llm
+
+     risk_data = []
+
+     # Risk-score every comparison that was flagged for review
+     for _, row in compare_df.iterrows():
+         # Invoke the LLM with the extracted data
+         risk_response = chain.invoke({
+             "comparison": row['Justification'],
+             "insight": row['Insight'],
+             "SOPchunk": row['SOP'],
+             "topic": topic
+         })
+
+         # Append the response to risk_data
+         risk_data.append({
+             "RiskLevel": risk_response.risk_level,
+             "Justification": risk_response.justification,
+             "advice": risk_response.advice,
+             "comparison": row['Justification'],
+             "insight": row['Insight'],
+             "SOPchunk": row['SOP']
+         })
+
+     # Return the risk scores as a single DataFrame
+     return pd.DataFrame(risk_data)
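Every LLM call in this module follows the same LangChain pattern: define a Pydantic schema, wrap the model with llm.with_structured_output(schema), and pipe a prompt into it. A minimal sketch of that pattern, with placeholder credentials and illustrative prompt text (the real system and user messages live in assets/*.md files that are not part of this commit):

from langchain_openai import AzureChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
from pydantic import BaseModel, Field

class Verdict(BaseModel):
    decision: bool = Field(description="Does the excerpt relate to the topic?")
    justification: str = Field(description="Reasoning behind the decision.")

llm = AzureChatOpenAI(
    openai_api_version="2024-08-01-preview",
    openai_api_key="<azure-openai-key>",                   # placeholder
    azure_endpoint="https://<resource>.openai.azure.com",  # placeholder
    deployment_name="gpt-4o-mini",
    temperature=0,
)

prompt = ChatPromptTemplate.from_messages([
    ("system", "You judge whether a document excerpt relates to {topic}."),
    ("human", "{chunk}"),
])
chain = prompt | llm.with_structured_output(Verdict)

result = chain.invoke({"topic": "Labelling", "chunk": "..."})
print(result.decision, result.justification)

Centralising this boilerplate in one factory function would avoid reconstructing the client inside every call; the per-function construction above simply mirrors the committed code.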
helper_functions.py ADDED
@@ -0,0 +1,28 @@
+ import PyPDF2
+ import tiktoken
+
+
+ def extract_text_from_pdf(file):
+     """
+     Extracts text from a PDF file and tracks text by page.
+
+     :param file: Uploaded PDF file object.
+     :return: Tuple (text, page_texts), where:
+              - text is the combined text of the entire PDF.
+              - page_texts is a list of tuples [(page_number, page_text), ...].
+     """
+     pdf_reader = PyPDF2.PdfReader(file)
+     text = ""
+     page_texts = []
+     for i, page in enumerate(pdf_reader.pages):
+         page_content = page.extract_text() or ""  # extract_text() may return None for empty pages
+         text += page_content
+         page_texts.append((i + 1, page_content))  # Track page numbers (1-indexed)
+     return text, page_texts
+
+
+ def count_tokens(string: str) -> int:
+     """Returns the number of tokens in a text string."""
+     encoding = tiktoken.get_encoding("o200k_base")  # o200k_base is the encoding used by gpt-4o-family models
+     return len(encoding.encode(string))
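Because extract_text_from_pdf accepts any file-like object PyPDF2 can open, it works outside Streamlit as well; a minimal sketch, assuming a local sample.pdf (placeholder path):

from helper_functions import extract_text_from_pdf, count_tokens

with open("sample.pdf", "rb") as f:  # sample.pdf is a placeholder
    text, page_texts = extract_text_from_pdf(f)

print(f"{len(page_texts)} pages, {count_tokens(text)} tokens (o200k_base)")
for page_number, page_text in page_texts:
    print(page_number, page_text[:80].replace("\n", " "))

Note that count_tokens uses the o200k_base encoding while app.py's count_tokens_with_tiktoken uses cl100k_base, so the two functions will report slightly different counts for the same text.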