# IntelAnalyser / app.py
import streamlit as st
import PyPDF2
import pandas as pd
import uuid
import tiktoken
import re
from datetime import datetime
from helper_functions import extract_text_from_pdf
from azure_openai import evaluation_process, process_insight, process_compare, risk_score_process, process_chunks
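# helper_functions and azure_openai are project modules (assumed to live alongside this
# file) that wrap the PDF text extraction and the Azure OpenAI calls used in main().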
def count_tokens_with_tiktoken(text):
"""
Counts the number of tokens in a given text using tiktoken.
:param text: Input text.
    :return: Tuple of (token count, list of token IDs).
"""
tokenizer = tiktoken.get_encoding("cl100k_base")
tokens = tokenizer.encode(text)
return len(tokens), tokens
def split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap):
"""
Splits text into chunks based on a specified chunk size in tokens and overlap using tiktoken.
Tracks start and end page numbers for each chunk.
:param text: Combined text of the document.
:param page_texts: List of tuples [(page_number, page_text), ...].
:param chunk_size: Maximum size of each chunk in tokens.
:param overlap: Number of overlapping tokens between consecutive chunks.
:return: List of dictionaries representing chunks with start and end pages.
"""
_, tokens = count_tokens_with_tiktoken(text)
chunks = []
    # Map each token position to a page number. Pages are encoded separately here, so the
    # mapping is approximate near page boundaries where the combined encoding may differ.
page_token_map = []
tokenizer = tiktoken.get_encoding("cl100k_base")
for page_number, page_text in page_texts:
page_tokens = tokenizer.encode(page_text)
page_token_map.extend([page_number] * len(page_tokens))
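    # Slide a fixed-size window over the token stream; assumes chunk_size > overlap so the stride is positive.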
for start in range(0, len(tokens), chunk_size - overlap):
end = min(start + chunk_size, len(tokens))
chunk_tokens = tokens[start:end]
chunk_text = tokenizer.decode(chunk_tokens)
# Determine start and end pages
start_page = page_token_map[start] if start < len(page_token_map) else None
end_page = page_token_map[end - 1] if end - 1 < len(page_token_map) else page_texts[-1][0]
chunks.append({
"ChunkText": chunk_text,
"TokenCount": len(chunk_tokens),
"StartPage": start_page,
"EndPage": end_page,
"ChunkID": str(uuid.uuid4())
})
return chunks
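# Illustrative sketch (not called by the app): how the chunker above might be used on a
# toy two-page document. The page texts and chunk sizes are placeholder values for demonstration.
def _demo_chunking():
    page_texts = [(1, "Page one text. " * 40), (2, "Page two text. " * 40)]
    combined = "".join(text for _, text in page_texts)
    chunks = split_text_into_chunks_with_tiktoken_and_pages(combined, page_texts, chunk_size=100, overlap=20)
    for chunk in chunks:
        print(chunk["ChunkID"], chunk["StartPage"], chunk["EndPage"], chunk["TokenCount"])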
def split_dataframe_with_combined_text_and_pages(df, chunk_size, overlap):
"""
Splits the combined text of a DataFrame into chunks using tiktoken.
Each chunk will include start and end page numbers.
:param df: DataFrame with columns ['Title', 'Text', 'PageTexts'].
:param chunk_size: The maximum size of each chunk in tokens.
:param overlap: The number of overlapping tokens between consecutive chunks.
:return: DataFrame with columns ['ChunkText', 'TokenCount', 'StartPage', 'EndPage', 'ChunkID'].
"""
chunks = []
for _, row in df.iterrows():
text = row['Text']
page_texts = row['PageTexts']
split_chunks = split_text_into_chunks_with_tiktoken_and_pages(text, page_texts, chunk_size, overlap)
chunks.extend(split_chunks)
return pd.DataFrame(chunks)
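# Illustrative sketch (not part of the app flow): the single-row DataFrame shape that
# split_dataframe_with_combined_text_and_pages expects, mirroring how main() builds it
# from an uploaded PDF. Title, text, and chunk sizes are placeholder values.
def _demo_dataframe_chunking():
    df = pd.DataFrame([{
        "Title": "example.pdf",
        "Text": "Placeholder document text. " * 40,
        "PageTexts": [(1, "Placeholder document text. " * 40)],
    }])
    return split_dataframe_with_combined_text_and_pages(df, chunk_size=100, overlap=20)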
def main():
st.set_page_config(page_title="RegIntel Risk Analyser", page_icon=":vertical_traffic_light:")
st.title("External RegIntel Risk Analyser :vertical_traffic_light:")
topic = st.selectbox("Please choose a focus for the system",("Labelling",
"Institutional Review Board/Independent Ethics Committee",
"Investigator", "Sponsor",
"Clinical Trial Protocol and protocol amendments",
"Investigator's Brochure", "Conduct of Clinical Trial",
"Monitoring", "Auditing",
"Data handling and record keeping",
"clinical trial reports",
"Responsibilities of the Sponsor and Investigator",
"Sponsor Inspection Preparation"),)
    uploaded_extintl_file_insight = st.file_uploader("Upload an External Reg Intel document", type="pdf")
    uploaded_interintel_file_insight = st.file_uploader("Upload an Internal Reg Intel document", type="pdf")
if uploaded_extintl_file_insight is not None and uploaded_interintel_file_insight is not None:
uploaded_file_SOP = st.file_uploader("Upload an SOP file", type="pdf")
if uploaded_file_SOP is not None:
# Extract insight document
with st.spinner("Processing External Reg Intel"):
ext_intl_text_insight, ext_intl_page_texts_insight = extract_text_from_pdf(uploaded_extintl_file_insight)
token_count_insight, _ = count_tokens_with_tiktoken(ext_intl_text_insight)
st.sidebar.success("External Reg Intel file successfully processed")
st.write("Token Count")
st.write(f"The PDF contains **{token_count_insight}** tokens.")
with st.spinner("Processing Internal Reg Intel"):
int_intl_text_insight, int_intl_page_texts_insight = extract_text_from_pdf(uploaded_interintel_file_insight)
token_count_insight, _ = count_tokens_with_tiktoken(int_intl_text_insight)
st.sidebar.success("External Reg Intel file successfully processed")
st.write("Token Count")
st.write(f"The PDF contains **{token_count_insight}** tokens.")
# Extract SOP document
with st.spinner("Processing the SOP Text..."):
text_SOP, page_texts_SOP = extract_text_from_pdf(uploaded_file_SOP)
token_count_SOP, _ = count_tokens_with_tiktoken(text_SOP)
st.sidebar.success("SOP file successfully processed")
st.write("Token Count")
st.write(f"The PDF contains **{token_count_SOP}** tokens.")
            # Split the external intel document into chunks
            with st.spinner("Chunking the External Reg Intel Document..."):
df_ei_input_insight = pd.DataFrame([{ "Title": uploaded_extintl_file_insight.name, "Text": ext_intl_text_insight, "PageTexts": ext_intl_page_texts_insight }])
df_ei_insight_chunks = split_dataframe_with_combined_text_and_pages(df_ei_input_insight, 10000, 1000)
st.write("Processed External Reg Intel")
st.sidebar.success("Processed External Reg Intel")
st.write(df_ei_insight_chunks)
            # Split the internal intel document into chunks
            with st.spinner("Chunking the Internal Reg Intel Document..."):
df_ii_input_insight = pd.DataFrame([{ "Title": uploaded_interintel_file_insight.name, "Text": int_intl_text_insight, "PageTexts": int_intl_page_texts_insight }])
df_ii_insight_chunks = split_dataframe_with_combined_text_and_pages(df_ii_input_insight, 10000, 1000)
st.write("Processed External Reg Intel")
st.sidebar.success("Processed External Reg Intel")
st.write(df_ii_insight_chunks)
# Process SOP into chunks
with st.spinner("Processing the SOP Document..."):
df_input_SOP = pd.DataFrame([{ "Title": uploaded_file_SOP.name, "Text": text_SOP, "PageTexts": page_texts_SOP }])
df_sop_chunks = split_dataframe_with_combined_text_and_pages(df_input_SOP, 10000, 1000)
st.write("Processed SOP")
st.sidebar.success("Processed SOP")
st.write(df_sop_chunks)
# Evaluate Document
with st.spinner("Evaluating document"):
df_ei_eval, ei_con, ei_score = evaluation_process(df_ei_insight_chunks, topic,"ext")
ei_score["source"]="external intel"
df_ei_eval["source"]="external intel"
df_ii_eval, ii_con, ii_score = evaluation_process(df_ii_insight_chunks, topic,"intl")
ii_score["source"]="internal intel"
df_ii_eval["source"]="internal intel"
score = pd.concat([ei_score, ii_score])
st.write("External Inteligence Evaluation")
st.sidebar.success(f"Evaluation Concensus: {ei_con}")
st.write(f"Evaluation Concensus: {ei_con}")
st.write("Evaluation Scores:")
st.write(score)
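            # evaluation_process appears to return each consensus as the string "True"/"False",
            # hence the string comparisons below.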
if ei_con == "False" and ii_con == "False":
st.sidebar.error("Document Not Relevant To Topic")
st.write("Document Not Relevant To Topic")
st.write("Exiting RegIntel Analysis")
return
# Generate Insights
with st.spinner("Creating insights"):
df_ei_insights = process_chunks(df_ei_insight_chunks, topic,"ext")
df_ii_insights = process_chunks(df_ii_insight_chunks, topic,"intl")
df_ei_insights["source"]="external intel"
df_ii_insights["source"]="internal intel"
df_insights = pd.concat([df_ei_insights, df_ii_insights])
st.subheader("External Inteligence Insights")
st.sidebar.success("External Inteligence Insights Created")
st.write(df_insights)
filtered_insights_on_impact = df_insights[df_insights['classification'] == 'impact']
if filtered_insights_on_impact.empty:
st.write("No impact insights")
st.sidebar.error("No impact insights")
return
            # Compare impact-classified insights against the SOP chunks
with st.spinner("Comparing Impact Classified Insights To SOP"):
df_compare = process_compare(filtered_insights_on_impact, df_sop_chunks, topic)
st.subheader("Comparison of Insights to SOP's")
st.sidebar.success("Comparison of External Intel to SOP's Complete")
st.write(df_compare)
filtered_comparisons_df = df_compare[df_compare['ReviewNeeded'] == True]
if filtered_comparisons_df.empty:
st.write("No reviews needed for this SOP")
st.sidebar.error("No reviews needed for this SOP")
return
# Risk scoring
with st.spinner("Risk Assessing Insights To SOP"):
df_risks = risk_score_process(filtered_comparisons_df, topic)
st.subheader("Risk Score of Insights to SOP's")
st.sidebar.success("Risk Score of Insights to SOP's Completed")
st.write(df_risks)
if __name__ == "__main__":
main()