# NOTE: stray "Spaces / Sleeping" status text removed — Hugging Face Spaces
# page-scrape artifact, not part of the source.
import streamlit as st
import PyPDF2
import pandas as pd
import uuid
import tiktoken
import re
from datetime import datetime
from helper_functions import extract_text_from_pdf
from azure_openai import evaluation_process, process_insight, process_compare, risk_score_process, process_chunks
def count_tokens_with_tiktoken(text):
    """
    Tokenise *text* with the cl100k_base encoding and report its length.

    :param text: Input text.
    :return: Tuple of (token count, list of token ids).
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    token_ids = encoding.encode(text)
    return len(token_ids), token_ids
def split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap):
    """
    Splits text into chunks based on a specified chunk size in tokens and
    overlap using tiktoken, tracking start and end page numbers per chunk.

    :param text: Combined text of the document.
    :param page_texts: List of tuples [(page_number, page_text), ...].
    :param chunk_size: Maximum size of each chunk in tokens (must exceed overlap).
    :param overlap: Number of overlapping tokens between consecutive chunks.
    :return: List of dicts with keys ChunkText, TokenCount, StartPage, EndPage, ChunkID.
    :raises ValueError: if chunk_size <= 0, overlap < 0, or overlap >= chunk_size.
        (Previously a zero step raised an obscure range() error and a negative
        step silently produced no chunks at all.)
    """
    if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must be non-negative "
            "and smaller than chunk_size"
        )
    _, tokens = count_tokens_with_tiktoken(text)
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # Map each token position to the page it came from.
    # NOTE(review): pages are re-encoded individually, so the concatenation of
    # per-page token streams may not align exactly with the tokens of the
    # combined text at page boundaries — page attribution is approximate.
    page_token_map = []
    for page_number, page_text in page_texts:
        page_token_map.extend([page_number] * len(tokenizer.encode(page_text)))
    # Fallback end page for chunks that run past the page map; None when there
    # are no pages at all (the original indexed page_texts[-1] unconditionally
    # and crashed on empty input).
    last_page = page_texts[-1][0] if page_texts else None
    chunks = []
    for start in range(0, len(tokens), chunk_size - overlap):
        end = min(start + chunk_size, len(tokens))
        chunk_tokens = tokens[start:end]
        chunks.append({
            "ChunkText": tokenizer.decode(chunk_tokens),
            "TokenCount": len(chunk_tokens),
            "StartPage": page_token_map[start] if start < len(page_token_map) else None,
            "EndPage": page_token_map[end - 1] if end - 1 < len(page_token_map) else last_page,
            "ChunkID": str(uuid.uuid4()),
        })
    return chunks
def split_dataframe_with_combined_text_and_pages(df, chunk_size, overlap):
    """
    Chunk every document row of a DataFrame and gather the results.

    :param df: DataFrame with columns ['Title', 'Text', 'PageTexts'].
    :param chunk_size: The maximum size of each chunk in tokens.
    :param overlap: The number of overlapping tokens between consecutive chunks.
    :return: DataFrame with columns ['ChunkText', 'TokenCount', 'StartPage', 'EndPage', 'ChunkID'].
    """
    all_chunks = []
    for _, row in df.iterrows():
        all_chunks.extend(
            split_text_into_chunks_with_tiktoken_and_pages(
                row['Text'], row['PageTexts'], chunk_size, overlap
            )
        )
    return pd.DataFrame(all_chunks)
# Token chunking parameters applied to every uploaded document
# (tokens per chunk / tokens shared between consecutive chunks).
CHUNK_SIZE = 10000
CHUNK_OVERLAP = 1000


def _extract_and_report(uploaded_file, spinner_msg, success_msg):
    """Extract a PDF's text, report its token count in the UI, and return
    (combined_text, page_texts)."""
    with st.spinner(spinner_msg):
        text, page_texts = extract_text_from_pdf(uploaded_file)
        token_count, _ = count_tokens_with_tiktoken(text)
        st.sidebar.success(success_msg)
        st.write("Token Count")
        st.write(f"The PDF contains **{token_count}** tokens.")
    return text, page_texts


def _chunk_and_report(title, text, page_texts, spinner_msg, done_msg):
    """Split one document into token chunks, display the chunk table, and
    return the chunk DataFrame."""
    with st.spinner(spinner_msg):
        df_input = pd.DataFrame([{"Title": title, "Text": text, "PageTexts": page_texts}])
        df_chunks = split_dataframe_with_combined_text_and_pages(df_input, CHUNK_SIZE, CHUNK_OVERLAP)
        st.write(done_msg)
        st.sidebar.success(done_msg)
        st.write(df_chunks)
    return df_chunks


def main():
    """
    Streamlit entry point for the RegIntel Risk Analyser.

    Pipeline (later stages render only once earlier inputs exist):
      1. Upload external/internal reg-intel PDFs and a draft submission PDF.
      2. Extract text, report token counts, chunk each document.
      3. Evaluate topic relevance, generate insights, compare impact-classified
         insights against the draft, and risk-score the comparisons.
    Each stage bails out early (return) when its gate condition fails.
    """
    st.set_page_config(page_title="RegIntel Risk Analyser", page_icon=":vertical_traffic_light:")
    st.title("External RegIntel Risk Analyser :vertical_traffic_light:")
    topic = st.selectbox(
        "Please choose a focus for the system",
        ("Labelling",
         "Institutional Review Board/Independent Ethics Committee",
         "Investigator", "Sponsor",
         "Clinical Trial Protocol and protocol amendments",
         "Investigator's Brochure", "Conduct of Clinical Trial",
         "Monitoring", "Auditing",
         "Data handling and record keeping",
         "clinical trial reports",
         "Responsibilities of the Sponsor and Investigator",
         "Sponsor Inspection Preparation"),
    )
    uploaded_extintl_file_insight = st.file_uploader("Upload a External Reg Intel", type="pdf")
    uploaded_interintel_file_insight = st.file_uploader("Upload a Internal Reg Intel", type="pdf")
    if uploaded_extintl_file_insight is None or uploaded_interintel_file_insight is None:
        return
    uploaded_file_SOP = st.file_uploader("Upload the draft submission file", type="pdf")
    if uploaded_file_SOP is None:
        return

    # --- Extraction (distinct token-count variables; the original reused one
    # name for both intel documents) -------------------------------------
    ext_intl_text_insight, ext_intl_page_texts_insight = _extract_and_report(
        uploaded_extintl_file_insight,
        "Processing External Reg Intel",
        "External Reg Intel file successfully processed")
    int_intl_text_insight, int_intl_page_texts_insight = _extract_and_report(
        uploaded_interintel_file_insight,
        "Processing Internal Reg Intel",
        "Internal Reg Intel file successfully processed")
    text_SOP, page_texts_SOP = _extract_and_report(
        uploaded_file_SOP,
        "Processing the draft submission file Text...",
        "draft submission file successfully processed")

    # --- Chunking (spinner labels fixed: the internal-intel spinner was a
    # copy-paste of the external one) ------------------------------------
    df_ei_insight_chunks = _chunk_and_report(
        uploaded_extintl_file_insight.name, ext_intl_text_insight, ext_intl_page_texts_insight,
        "Processing the External Reg Intel Document...", "Processed External Reg Intel")
    df_ii_insight_chunks = _chunk_and_report(
        uploaded_interintel_file_insight.name, int_intl_text_insight, int_intl_page_texts_insight,
        "Processing the Internal Reg Intel Document...", "Processed Internal Reg Intel")
    df_sop_chunks = _chunk_and_report(
        uploaded_file_SOP.name, text_SOP, page_texts_SOP,
        "Processing the draft submission file...", "Processed draft submission file")

    # --- Relevance evaluation --------------------------------------------
    with st.spinner("Evaluating document"):
        df_ei_eval, ei_con, ei_score = evaluation_process(df_ei_insight_chunks, topic, "ext")
        ei_score["source"] = "external intel"
        df_ei_eval["source"] = "external intel"
        df_ii_eval, ii_con, ii_score = evaluation_process(df_ii_insight_chunks, topic, "intl")
        ii_score["source"] = "internal intel"
        df_ii_eval["source"] = "internal intel"
        score = pd.concat([ei_score, ii_score])
        st.write("External & Internal Inteligence Evaluation")
        st.sidebar.success(f"Evaluation Concensus: {ei_con}")
        st.write(f"Evaluation Concensus: {ei_con}")
        st.write("Evaluation Scores:")
        st.write(score)
    # NOTE(review): consensus values appear to be the *string* "False", not a
    # bool — confirm against evaluation_process before changing this test.
    if ei_con == "False" and ii_con == "False":
        st.sidebar.error("Document Not Relevant To Topic")
        st.write("Document Not Relevant To Topic")
        st.write("Exiting RegIntel Analysis")
        return

    # --- Insight generation ----------------------------------------------
    with st.spinner("Creating insights"):
        df_ei_insights = process_chunks(df_ei_insight_chunks, topic, "ext")
        df_ii_insights = process_chunks(df_ii_insight_chunks, topic, "intl")
        df_ei_insights["source"] = "external intel"
        df_ii_insights["source"] = "internal intel"
        df_insights = pd.concat([df_ei_insights, df_ii_insights])
        st.subheader("External & Internal Inteligence Insights")
        st.sidebar.success("External & Internal Inteligence Insights Created")
        st.write(df_insights)
    filtered_insights_on_impact = df_insights[df_insights['classification'] == 'impact']
    if filtered_insights_on_impact.empty:
        st.write("No impact insights")
        st.sidebar.error("No impact insights")
        return

    # --- Compare impact insights against the draft submission -------------
    with st.spinner("Comparing Impact Classified Insights To draft submission file"):
        df_compare = process_compare(filtered_insights_on_impact, df_sop_chunks, topic)
        st.subheader("Comparison of Insights to draft submission file's")
        st.sidebar.success("Comparison of External & Internal Intel to draft submission file's Complete")
        st.write(df_compare)
    # Kept as `== True` (not column truthiness): behaviour differs if the
    # column holds non-bool values — TODO confirm ReviewNeeded's dtype.
    filtered_comparisons_df = df_compare[df_compare['ReviewNeeded'] == True]
    if filtered_comparisons_df.empty:
        st.write("No reviews needed for this draft submission file")
        st.sidebar.error("No reviews needed for this draft submission file")
        return

    # --- Risk scoring ------------------------------------------------------
    with st.spinner("Risk Assessing Insights To draft submission file"):
        df_risks = risk_score_process(filtered_comparisons_df, topic)
        st.subheader("Risk Score of Insights to draft submission file's")
        st.sidebar.success("Risk Score of Insights to draft submission file's Completed")
        st.write(df_risks)
# Script entry point (run via `streamlit run <this file>`); the ` | |`
# extraction artifacts that broke the syntax here have been removed.
if __name__ == "__main__":
    main()