import streamlit as st
import transformers
import altair as alt
import pandas as pd
from difflib import SequenceMatcher

# ------------------------------
# Simple Authentication Setup
# ------------------------------
# Define a simple password for demonstration purposes.
PASSWORD = "password123"

# Initialize authentication state
if 'authenticated' not in st.session_state:
    st.session_state['authenticated'] = False

# Simple password input in the sidebar for authentication
if not st.session_state['authenticated']:
    st.sidebar.title("Login")
    password_input = st.sidebar.text_input("Enter password:", type="password")
    if st.sidebar.button("Login"):
        if password_input == PASSWORD:
            st.session_state['authenticated'] = True
            st.sidebar.success("Authenticated!")
            st.rerun()  # Re-run immediately so the main app renders after login
        else:
            st.sidebar.error("Incorrect password. Please try again.")
    st.stop()  # Stop app execution until authenticated

st.sidebar.write("Welcome!")

# ------------------------------
# Load Models
# ------------------------------
# Cache the pipelines so the large models load once, not on every Streamlit rerun.
@st.cache_resource
def load_qwen():
    # Qwen2.5 is a decoder-only model, so it runs under the "text-generation" task.
    return transformers.pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-14B",
        device_map="auto"
    )

@st.cache_resource
def load_phi():
    return transformers.pipeline(
        "text-generation",
        model="microsoft/phi-4",
        model_kwargs={"torch_dtype": "auto"},
        device_map="auto"
    )

qwen_pipeline = load_qwen()
phi_pipeline = load_phi()

# ------------------------------
# Utility Functions
# ------------------------------
def summarize_document(document_text):
    prompt = f"Summarize the following document and highlight key insights:\n\n{document_text}"
    # return_full_text=False keeps the prompt (and the pasted document) out of the returned summary.
    summary = qwen_pipeline(prompt, max_new_tokens=1024, return_full_text=False)[0]['generated_text']
    return summary

def answer_question(summary, question):
    prompt = f"Based on the following summary:\n\n{summary}\n\nAnswer the question: {question}"
    answer = phi_pipeline(prompt, max_new_tokens=256, return_full_text=False)[0]['generated_text']
    return answer

def find_similar_chunks(original, output):
    """Split `output` into segments, flagging those that also appear verbatim in `original`."""
    matcher = SequenceMatcher(None, original, output)
    segments = []
    left = 0
    for _, j, n in matcher.get_matching_blocks():
        # Non-matching text between the previous match and this one.
        if left < j:
            segments.append({'text': output[left:j], 'match': False})
        # Matching text shared with the original (the final block always has n == 0).
        if n > 0:
            segments.append({'text': output[j:j + n], 'match': True})
        left = j + n
    return segments
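
# ------------------------------------------------------------------
# Illustrative sketch (not part of the original app): one way to use
# find_similar_chunks is to highlight, in bold, the parts of a model
# output that were copied verbatim from the source document, e.g. via
# st.markdown(highlight_matches(document_text, summary)).
# ------------------------------------------------------------------
def highlight_matches(original, output):
    segments = find_similar_chunks(original, output)
    # Bold the matching segments so st.markdown can render the overlap.
    return "".join(
        f"**{seg['text']}**" if seg['match'] else seg['text']
        for seg in segments
    )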

# ------------------------------
# Streamlit App Layout
# ------------------------------
st.title("SmartDoc Analyzer")
st.markdown("Analyze Financial & Health Documents with AI")

# Tabs for different functionalities
tabs = st.tabs(["Document Summarization", "Interactive Q&A", "Visualization & Data Extraction"])

# -------- Document Summarization Tab --------
with tabs[0]:
    st.header("Document Summarization")
    document_text = st.text_area("Paste Document Text:", height=300)
    if st.button("Summarize Document"):
        if document_text:
            summary = summarize_document(document_text)
            st.subheader("Summary")
            st.write(summary)
            # Save summary in session for use in Q&A tab
            st.session_state['last_summary'] = summary
        else:
            st.warning("Please paste document text to summarize.")

# -------- Interactive Q&A Tab --------
with tabs[1]:
    st.header("Interactive Q&A")
    default_summary = st.session_state.get('last_summary', '')
    summary_context = st.text_area("Summary Context:", value=default_summary, height=150)
    question = st.text_input("Enter your question about the document:")
    if st.button("Get Answer"):
        if summary_context and question:
            answer = answer_question(summary_context, question)
            st.subheader("Answer")
            st.write(answer)
        else:
            st.warning("Please provide both a summary context and a question.")

# -------- Visualization & Data Extraction Tab --------
with tabs[2]:
    st.header("Visualization & Data Extraction")
    st.subheader("Visualization Placeholder")
    st.markdown("An interactive chart can be displayed here using Altair or Plotly.")
    # Example static Altair chart (replace with dynamic data extraction logic)
    data = pd.DataFrame({
        'Year': [2019, 2020, 2021, 2022],
        'Revenue': [150, 200, 250, 300]
    })
    chart = alt.Chart(data).mark_line(point=True).encode(
        x='Year:O',
        y='Revenue:Q',
        tooltip=['Year', 'Revenue']
    ).interactive()
    st.altair_chart(chart, use_container_width=True)

    st.subheader("Data Extraction Placeholder")
    st.markdown("Implement NLP techniques or model prompts to extract structured data here.")
    uploaded_file = st.file_uploader("Upload a document file for extraction", type=["pdf", "docx", "txt"])
    if uploaded_file is not None:
        st.info("File uploaded successfully. Data extraction logic would process this file.")
        # Add logic to extract tables, key figures, etc. from the uploaded file.
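        # --------------------------------------------------------------
        # Illustrative sketch (an assumption, not the original extraction
        # logic): for plain-text uploads, pull out currency-like figures
        # with a regular expression and show them in a table. PDF/DOCX
        # parsing would need extra libraries (e.g. pypdf, python-docx).
        # --------------------------------------------------------------
        if uploaded_file.name.lower().endswith(".txt"):
            import re
            raw_text = uploaded_file.read().decode("utf-8", errors="ignore")
            figures = re.findall(r"\$\s?[\d,]+(?:\.\d+)?", raw_text)
            if figures:
                st.dataframe(pd.DataFrame({"Extracted figure": figures}))
            else:
                st.write("No currency figures found in the uploaded text.")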

# ------------------------------
# Safety & Compliance Layer (Placeholder)
# ------------------------------
st.sidebar.markdown("### Safety & Compliance")
st.sidebar.info(
    "This tool provides AI-driven insights. "
    "Please note that summaries and answers are for informational purposes only and should not be "
    "considered professional financial or medical advice."
)

# ------------------------------
# End of Application
# ------------------------------