import streamlit as st
import PyPDF2
import pandas as pd
import uuid
import tiktoken
import re
from datetime import datetime
from helper_functions import extract_text_from_pdf
from azure_openai import evaluation_process, process_insight, process_compare, risk_score_process, process_chunks
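
# Assumed behaviour of the azure_openai helpers, inferred only from how they are
# called in main() below (not verified against their implementation):
#   evaluation_process(chunks_df, topic, source) -> (eval_df, consensus_str, score_df)
#   process_chunks(chunks_df, topic, source)     -> insights DataFrame with a 'classification' column
#   process_compare(insights_df, sop_chunks_df, topic) -> DataFrame with a 'ReviewNeeded' column
#   risk_score_process(compare_df, topic)        -> DataFrame of risk scores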


def count_tokens_with_tiktoken(text):
    """
    Counts the number of tokens in a given text using tiktoken.
    :param text: Input text.
    :return: Tuple of (token count, list of token ids).
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    return len(tokens), tokens
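
# Illustrative usage (a sketch, not part of the app flow; exact ids depend on the encoding):
#   n, ids = count_tokens_with_tiktoken("hello world")
#   # n == len(ids); ids is the list of integer token ids for cl100k_base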

def split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap):
    """
    Splits text into chunks based on a specified chunk size in tokens and overlap using tiktoken.
    Tracks start and end page numbers for each chunk.
    :param text: Combined text of the document.
    :param page_texts: List of tuples [(page_number, page_text), ...].
    :param chunk_size: Maximum size of each chunk in tokens.
    :param overlap: Number of overlapping tokens between consecutive chunks.
    :return: List of dictionaries representing chunks with start and end pages.
    """
    _, tokens = count_tokens_with_tiktoken(text)
    chunks = []

    # Map token positions to page numbers
    page_token_map = []
    tokenizer = tiktoken.get_encoding("cl100k_base")
    for page_number, page_text in page_texts:
        page_tokens = tokenizer.encode(page_text)
        page_token_map.extend([page_number] * len(page_tokens))

    for start in range(0, len(tokens), chunk_size - overlap):
        end = min(start + chunk_size, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens)

        # Determine start and end pages; fall back to the last page when the
        # per-page token map is shorter than the combined-text token stream
        start_page = page_token_map[start] if start < len(page_token_map) else page_texts[-1][0]
        end_page = page_token_map[end - 1] if end - 1 < len(page_token_map) else page_texts[-1][0]

        chunks.append({
            "ChunkText": chunk_text,
            "TokenCount": len(chunk_tokens),
            "StartPage": start_page,
            "EndPage": end_page,
            "ChunkID": str(uuid.uuid4())
        })

    return chunks
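
# Illustrative sketch (comments only, not executed): with chunk_size=10 and
# overlap=2, successive windows advance by 8 tokens, so neighbouring chunks
# share 2 tokens of context. Each returned dict looks roughly like:
#   {"ChunkText": "...", "TokenCount": 10, "StartPage": 3, "EndPage": 4, "ChunkID": "<uuid4>"}
# Page numbers are approximate, because each page is re-encoded independently
# of the combined text when building the token-to-page map.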

def split_dataframe_with_combined_text_and_pages(df, chunk_size, overlap):
    """
    Splits the combined text of a DataFrame into chunks using tiktoken.
    Each chunk will include start and end page numbers.
    :param df: DataFrame with columns ['Title', 'Text', 'PageTexts'].
    :param chunk_size: The maximum size of each chunk in tokens.
    :param overlap: The number of overlapping tokens between consecutive chunks.
    :return: DataFrame with columns ['ChunkText', 'TokenCount', 'StartPage', 'EndPage', 'ChunkID'].
    """
    chunks = []

    for _, row in df.iterrows():
        text = row['Text']
        page_texts = row['PageTexts']
        split_chunks = split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap)
        chunks.extend(split_chunks)

    return pd.DataFrame(chunks)
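
# Illustrative sketch of the expected input shape, assuming extract_text_from_pdf
# returns (full_text, [(page_number, page_text), ...]) as it is used in main():
#   df = pd.DataFrame([{"Title": "doc.pdf", "Text": full_text, "PageTexts": page_texts}])
#   chunk_df = split_dataframe_with_combined_text_and_pages(df, chunk_size=10000, overlap=1000)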



def main():
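    """
    Streamlit entry point.

    Pipeline (as implemented below): upload external and internal Reg Intel PDFs
    plus a draft submission PDF, chunk each with tiktoken, evaluate topic
    relevance, generate insights, compare impact-classified insights against the
    draft submission chunks, and risk-score any comparisons flagged for review.
    """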
    st.set_page_config(page_title="RegIntel Risk Analyser", page_icon=":vertical_traffic_light:")
    st.title("External RegIntel Risk Analyser :vertical_traffic_light:")
    topic = st.selectbox("Please choose a focus for the system",("Labelling",
                                                                 "Institutional Review Board/Independent Ethics Committee", 
                                                                 "Investigator", "Sponsor", 
                                                                 "Clinical Trial Protocol and protocol amendments", 
                                                                 "Investigator's Brochure", "Conduct of Clinical Trial", 
                                                                 "Monitoring", "Auditing", 
                                                                 "Data handling and record keeping", 
                                                                 "clinical trial reports", 
                                                                 "Responsibilities of the Sponsor and Investigator", 
                                                                 "Sponsor Inspection Preparation"),)
    uploaded_extintl_file_insight = st.file_uploader("Upload an External Reg Intel file", type="pdf")
    uploaded_interintel_file_insight = st.file_uploader("Upload an Internal Reg Intel file", type="pdf")
    if uploaded_extintl_file_insight is not None and uploaded_interintel_file_insight is not None:
        uploaded_file_SOP = st.file_uploader("Upload the draft submission file", type="pdf")
        if uploaded_file_SOP is not None:
            # Extract text from the external intel document
            with st.spinner("Processing External Reg Intel"):
                ext_intl_text_insight, ext_intl_page_texts_insight = extract_text_from_pdf(uploaded_extintl_file_insight)
                token_count_insight, _ = count_tokens_with_tiktoken(ext_intl_text_insight)
            st.sidebar.success("External Reg Intel file successfully processed")
            st.write("Token Count")
            st.write(f"The PDF contains **{token_count_insight}** tokens.")

            with st.spinner("Processing Internal Reg Intel"):
                int_intl_text_insight, int_intl_page_texts_insight = extract_text_from_pdf(uploaded_interintel_file_insight)
                token_count_insight, _ = count_tokens_with_tiktoken(int_intl_text_insight)
            st.sidebar.success("Internal Reg Intel file successfully processed")
            st.write("Token Count")
            st.write(f"The PDF contains **{token_count_insight}** tokens.")


            # Extract draft submission document
            with st.spinner("Processing the draft submission file Text..."):
                text_SOP, page_texts_SOP = extract_text_from_pdf(uploaded_file_SOP)
                token_count_SOP, _ = count_tokens_with_tiktoken(text_SOP)
            st.sidebar.success("draft submission file successfully processed")
            st.write("Token Count")
            st.write(f"The PDF contains **{token_count_SOP}** tokens.")  

            # Split the external intel document into chunks
            with st.spinner("Processing the External Reg Intel..."):
                df_ei_input_insight = pd.DataFrame([{ "Title": uploaded_extintl_file_insight.name, "Text": ext_intl_text_insight, "PageTexts": ext_intl_page_texts_insight }])
                df_ei_insight_chunks = split_dataframe_with_combined_text_and_pages(df_ei_input_insight, 10000, 1000)
            st.write("Processed External Reg Intel")
            st.sidebar.success("Processed External Reg Intel")
            st.write(df_ei_insight_chunks)

            # Split the internal intel document into chunks
            with st.spinner("Processing the Internal Reg Intel..."):
                df_ii_input_insight = pd.DataFrame([{ "Title": uploaded_interintel_file_insight.name, "Text": int_intl_text_insight, "PageTexts": int_intl_page_texts_insight }])
                df_ii_insight_chunks = split_dataframe_with_combined_text_and_pages(df_ii_input_insight, 10000, 1000)
            st.write("Processed Internal Reg Intel")
            st.sidebar.success("Processed Internal Reg Intel")
            st.write(df_ii_insight_chunks)


            # Process draft submission file into chunks
            with st.spinner("Processing the draft submission file..."):
                df_input_SOP = pd.DataFrame([{ "Title": uploaded_file_SOP.name, "Text": text_SOP, "PageTexts": page_texts_SOP }])
                df_sop_chunks = split_dataframe_with_combined_text_and_pages(df_input_SOP, 10000, 1000)
            st.write("Processed draft submission file")
            st.sidebar.success("Processed draft submission file")
            st.write(df_sop_chunks)

            # Evaluate Document
            with st.spinner("Evaluating document"):
                df_ei_eval, ei_con, ei_score = evaluation_process(df_ei_insight_chunks, topic,"ext")
                ei_score["source"]="external intel"
                df_ei_eval["source"]="external intel"
                df_ii_eval, ii_con, ii_score = evaluation_process(df_ii_insight_chunks, topic,"intl")
                ii_score["source"]="internal intel"
                df_ii_eval["source"]="internal intel"
                score = pd.concat([ei_score, ii_score])
            st.write("External & Internal Inteligence Evaluation")
            st.sidebar.success(f"Evaluation Concensus: {ei_con}")
            st.write(f"Evaluation Concensus: {ei_con}")
            st.write("Evaluation Scores:")
            st.write(score)
            if ei_con == "False" and ii_con == "False": 
                st.sidebar.error("Document Not Relevant To Topic")
                st.write("Document Not Relevant To Topic") 
                st.write("Exiting RegIntel Analysis") 
                return

            # Generate Insights
            with st.spinner("Creating insights"):
                df_ei_insights = process_chunks(df_ei_insight_chunks, topic, "ext")
                df_ii_insights = process_chunks(df_ii_insight_chunks, topic, "intl")
                df_ei_insights["source"] = "external intel"
                df_ii_insights["source"] = "internal intel"
                df_insights = pd.concat([df_ei_insights, df_ii_insights])
            st.subheader("External & Internal Intelligence Insights")
            st.sidebar.success("External & Internal Intelligence Insights Created")
            
            st.write(df_insights)
            filtered_insights_on_impact = df_insights[df_insights['classification'] == 'impact']
            

            if filtered_insights_on_impact.empty:
                st.write("No impact insights")
                st.sidebar.error("No impact insights")
                return

            # Compare impact-classified insights to the draft submission file
            with st.spinner("Comparing Impact Classified Insights To the draft submission file"):
                df_compare = process_compare(filtered_insights_on_impact, df_sop_chunks, topic)
            st.subheader("Comparison of Insights to the draft submission file")
            st.sidebar.success("Comparison of External & Internal Intel to the draft submission file Complete")
            st.write(df_compare)
            filtered_comparisons_df = df_compare[df_compare['ReviewNeeded'] == True]
            if filtered_comparisons_df.empty:
                st.write("No reviews needed for this draft submission file")
                st.sidebar.error("No reviews needed for this draft submission file")
                return

            # Risk scoring
            with st.spinner("Risk Assessing Insights To draft submission file"):
                 df_risks = risk_score_process(filtered_comparisons_df, topic)
            st.subheader("Risk Score of Insights to draft submission file's")
            st.sidebar.success("Risk Score of Insights to draft submission file's Completed")
            st.write(df_risks)


if __name__ == "__main__":
    main()