Create app.py
app.py
ADDED
@@ -0,0 +1,172 @@
import streamlit as st
import transformers
import altair as alt
import pandas as pd
import streamlit_authenticator as stauth
from difflib import SequenceMatcher

# ------------------------------
# User Authentication Setup
# ------------------------------
# Sample configuration for authentication
config = {
    'credentials': {
        'usernames': {
            'demo_user': {
                'name': 'Demo User',
                'password': stauth.Hasher(['password123']).generate()[0]  # hashed password
            }
        }
    },
    'cookie': {
        'expiry_days': 30,
        'key': 'some_signature_key',
        'name': 'some_cookie_name'
    },
    'preauthorized': {
        'emails': []
    }
}

authenticator = stauth.Authenticate(
    config['credentials'],
    config['cookie']['name'],
    config['cookie']['key'],
    config['cookie']['expiry_days']
)

name, authentication_status, username = authenticator.login('Login', 'main')
# authentication_status is False on bad credentials and None before the
# login form has been submitted; handle the two cases separately.
if authentication_status is False:
    st.error('Authentication failed. Please check your username and password.')
    st.stop()
elif authentication_status is None:
    st.warning('Please enter your username and password.')
    st.stop()

st.sidebar.write(f"Welcome *{name}*")
authenticator.logout('Logout', 'sidebar')
# ------------------------------
# Load Models
# ------------------------------
@st.cache_resource
def load_qwen():
    # Qwen2.5 is a decoder-only model, so it is served through the
    # "text-generation" task rather than "text2text-generation".
    return transformers.pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-14B",
        model_kwargs={"torch_dtype": "auto"},
        device_map="auto"
    )

@st.cache_resource
def load_phi():
    return transformers.pipeline(
        "text-generation",
        model="microsoft/phi-4",
        model_kwargs={"torch_dtype": "auto"},
        device_map="auto"
    )

qwen_pipeline = load_qwen()
phi_pipeline = load_phi()
# ------------------------------
# Utility Functions
# ------------------------------
def summarize_document(document_text):
    prompt = f"Summarize the following document and highlight key insights:\n\n{document_text}"
    # return_full_text=False keeps the prompt itself out of the returned summary.
    summary = qwen_pipeline(prompt, max_new_tokens=1024, return_full_text=False)[0]['generated_text']
    return summary

def answer_question(summary, question):
    prompt = f"Based on the following summary:\n\n{summary}\n\nAnswer the question: {question}"
    answer = phi_pipeline(prompt, max_new_tokens=256, return_full_text=False)[0]['generated_text']
    return answer
def find_similar_chunks(original, output):
    # Split `output` into segments, marking spans that also appear verbatim
    # in `original` so they can be highlighted in the UI.
    matcher = SequenceMatcher(None, original, output)
    segments = []
    left = 0
    for _, j, n in matcher.get_matching_blocks():
        if n == 0:
            continue  # skip the zero-length sentinel block at the end
        if left < j:
            segments.append({'text': output[left:j], 'match': False})
        segments.append({'text': output[j:j + n], 'match': True})
        left = j + n
    if left < len(output):
        segments.append({'text': output[left:], 'match': False})
    return segments
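# NOTE: find_similar_chunks is defined above but never called in this app.
# Below is a minimal, hypothetical sketch (the name render_overlap and the
# <mark>-based highlighting are assumptions, not part of the original file)
# showing how its segments could be rendered in Streamlit to highlight the
# parts of a model output copied verbatim from the source document.
def render_overlap(original, output):
    from html import escape  # stdlib; imported locally to keep the sketch self-contained
    html_parts = []
    for seg in find_similar_chunks(original, output):
        text = escape(seg['text'])
        # Matched spans are wrapped in <mark> so Streamlit renders them highlighted.
        html_parts.append(f"<mark>{text}</mark>" if seg['match'] else text)
    st.markdown(''.join(html_parts), unsafe_allow_html=True)
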
# ------------------------------
# Streamlit App Layout
# ------------------------------
st.title("SmartDoc Analyzer")
st.markdown("Analyze Financial & Health Documents with AI")

# Tabs for different functionalities
tabs = st.tabs(["Document Summarization", "Interactive Q&A", "Visualization & Data Extraction"])
# -------- Document Summarization Tab --------
with tabs[0]:
    st.header("Document Summarization")
    document_text = st.text_area("Paste Document Text:", height=300)
    if st.button("Summarize Document"):
        if document_text:
            summary = summarize_document(document_text)
            st.subheader("Summary")
            st.write(summary)
            # Save summary in session for use in Q&A tab
            st.session_state['last_summary'] = summary
        else:
            st.warning("Please paste document text to summarize.")
# -------- Interactive Q&A Tab --------
with tabs[1]:
    st.header("Interactive Q&A")
    default_summary = st.session_state.get('last_summary', '')
    summary_context = st.text_area("Summary Context:", value=default_summary, height=150)
    question = st.text_input("Enter your question about the document:")
    if st.button("Get Answer"):
        if summary_context and question:
            answer = answer_question(summary_context, question)
            st.subheader("Answer")
            st.write(answer)
            # For session saving, one could store Q&A pairs in st.session_state
            # or a database (see the save_qa_pair sketch after this tab).
        else:
            st.warning("Please provide both a summary context and a question.")
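# A minimal, hypothetical sketch of the session saving mentioned above: the
# helper name save_qa_pair and the 'qa_history' key are assumptions, not part
# of the original app. It would be called right after st.write(answer) inside
# the "Get Answer" branch.
def save_qa_pair(question, answer):
    # Accumulate Q&A pairs for the current browser session only;
    # st.session_state is discarded when the session ends.
    if 'qa_history' not in st.session_state:
        st.session_state['qa_history'] = []
    st.session_state['qa_history'].append({'question': question, 'answer': answer})
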
# -------- Visualization & Data Extraction Tab --------
with tabs[2]:
    st.header("Visualization & Data Extraction")

    st.subheader("Visualization Placeholder")
    st.markdown("An interactive chart can be displayed here using Altair or Plotly.")

    # Example static Altair chart (replace with dynamic data extraction logic)
    data = pd.DataFrame({
        'Year': [2019, 2020, 2021, 2022],
        'Revenue': [150, 200, 250, 300]
    })
    chart = alt.Chart(data).mark_line(point=True).encode(
        x='Year:O',
        y='Revenue:Q',
        tooltip=['Year', 'Revenue']
    ).interactive()
    st.altair_chart(chart, use_container_width=True)

    st.subheader("Data Extraction Placeholder")
    st.markdown("Implement NLP techniques or model prompts to extract structured data here.")
    # File uploader example for future data extraction features
    uploaded_file = st.file_uploader("Upload a document file for extraction", type=["pdf", "docx", "txt"])
    if uploaded_file is not None:
        st.info("File uploaded successfully. Data extraction logic would process this file.")
        # Add logic to extract tables, key figures, etc. from the uploaded
        # file (a plain-text sketch follows after this tab).
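# A minimal, hypothetical sketch of the extraction step referenced above,
# limited to .txt uploads so it needs no extra packages. The helper name
# extract_key_figures and the currency regex are assumptions, not part of the
# original app; it would be called inside the uploaded_file branch above,
# and PDF/DOCX parsing would need additional libraries.
import re  # would normally live with the other imports at the top of the file

def extract_key_figures(uploaded_file):
    # Streamlit's UploadedFile is file-like: read the raw bytes and decode.
    text = uploaded_file.read().decode('utf-8', errors='ignore')
    # Pull out simple monetary amounts such as "$1,200.50" as candidate key figures.
    figures = re.findall(r'\$\s?\d[\d,]*(?:\.\d+)?', text)
    return pd.DataFrame({'figure': figures})
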
# ------------------------------
# Safety & Compliance Layer (Placeholder)
# ------------------------------
st.sidebar.markdown("### Safety & Compliance")
st.sidebar.info(
    "This tool provides AI-driven insights. "
    "Please note that summaries and answers are for informational purposes only and should not be "
    "considered professional financial or medical advice."
)
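# A minimal, hypothetical sketch of what this placeholder layer could do
# beyond the static sidebar notice: append the disclaimer to any model output
# that looks like direct financial or medical advice. The helper name
# add_compliance_notice and the keyword list are assumptions, not part of the
# original app; it would wrap the return values of summarize_document and
# answer_question.
ADVICE_KEYWORDS = ('you should invest', 'buy this stock', 'stop taking', 'dosage')

def add_compliance_notice(model_output):
    lowered = model_output.lower()
    if any(keyword in lowered for keyword in ADVICE_KEYWORDS):
        return model_output + "\n\n*This is not professional financial or medical advice.*"
    return model_output
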
# ------------------------------
# End of Application
# ------------------------------