amiguel committed
Commit 5e1781c · verified · 1 Parent(s): 40b16b6

Update app.py

Files changed (1)
  1. app.py +51 -51
app.py CHANGED
@@ -13,23 +13,23 @@ from langchain.schema import Document
 from langchain.docstore.document import Document as LangchainDocument
 
 # --- Avatars ---
-USER_AVATAR = "👤"
-BOT_AVATAR = "🤖"
+USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
+BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
 
-# --- HF Token ---
+# --- Hugging Face Token ---
 HF_TOKEN = st.secrets["HF_TOKEN"]
 
-# --- Page Config ---
-st.set_page_config(page_title="Hybrid RAG with Streaming", page_icon="📄", layout="centered")
-st.title("📄 Hybrid Search + Streaming Chat")
+# --- Page Setup ---
+st.set_page_config(page_title="Hybrid RAG Chat", page_icon="🤖", layout="centered")
+st.title("🤖 DigiTwin - Hybrid Search + Streaming")
 
 # --- Sidebar Upload ---
 with st.sidebar:
     st.header("📤 Upload Documents")
-    uploaded_files = st.file_uploader("Upload PDFs or .txt files", type=["pdf", "txt"], accept_multiple_files=True)
+    uploaded_files = st.file_uploader("PDFs or .txt files only", type=["pdf", "txt"], accept_multiple_files=True)
     clear_chat = st.button("🧹 Clear Conversation")
 
-# --- Session State ---
+# --- Chat Memory ---
 if "messages" not in st.session_state or clear_chat:
     st.session_state.messages = []
 
@@ -43,87 +43,87 @@ def load_model():
 
 tokenizer, model = load_model()
 
-# --- Load & Chunk Documents ---
+# --- Document Processing ---
 def process_documents(files):
     documents = []
     for file in files:
         suffix = ".pdf" if file.name.endswith(".pdf") else ".txt"
-        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
-            tmp_file.write(file.read())
-            tmp_file_path = tmp_file.name
-        loader = PyPDFLoader(tmp_file_path) if suffix == ".pdf" else TextLoader(tmp_file_path)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp.write(file.read())
+            path = tmp.name
+        loader = PyPDFLoader(path) if suffix == ".pdf" else TextLoader(path)
         documents.extend(loader.load())
     return documents
 
-def chunk_documents(documents):
+def chunk_documents(docs):
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    return splitter.split_documents(documents)
+    return splitter.split_documents(docs)
 
 def build_hybrid_retriever(chunks):
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    faiss_store = FAISS.from_documents(chunks, embeddings)
-    faiss_retriever = faiss_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
-
-    bm25_retriever = BM25Retriever.from_documents([LangchainDocument(page_content=d.page_content) for d in chunks])
-    bm25_retriever.k = 5
-
-    hybrid = EnsembleRetriever(retrievers=[faiss_retriever, bm25_retriever], weights=[0.5, 0.5])
-    return hybrid
+    faiss = FAISS.from_documents(chunks, embeddings)
+    faiss_ret = faiss.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+    bm25 = BM25Retriever.from_documents([LangchainDocument(page_content=c.page_content) for c in chunks])
+    bm25.k = 5
+    return EnsembleRetriever(retrievers=[faiss_ret, bm25], weights=[0.5, 0.5])
 
-# --- Prompt Construction ---
-def build_prompt(history, context=""):
-    prompt = (
-        "You are DigiTwin, an expert in reliability, inspection, and maintenance of piping, structures, vessels, and topside assets.\n"
-        f"Use the following context to help answer questions:\n\n{context}\n\n"
-    )
-    for turn in history:
-        role = "User" if turn["role"] == "user" else "Assistant"
-        prompt += f"{role}: {turn['content']}\n"
-    prompt += "Assistant:"
-    return prompt
-
-# --- Generator for Streaming ---
-def generate_streaming_response(prompt):
+# --- Prompt Builder ---
+def build_prompt(history, context=""):
+    dialog = ""
+    for msg in history:
+        role = "User" if msg["role"] == "user" else "Assistant"
+        dialog += f"{role}: {msg['content']}\n"
+    return f"""You are DigiTwin, a highly professional and experienced assistant in inspection, integrity, and maintenance of topside equipment, piping systems, pressure vessels, structures, and safety systems.
+
+Use the following context to provide expert-level answers.
+
+Context:
+{context}
+
+{dialog}
+Assistant:"""
+
+# --- Response Generator ---
+def generate_response(prompt):
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    thread = Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 300})
-    thread.start()
+    Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 300}).start()
     output = ""
     for token in streamer:
        output += token
        yield output
 
-# --- Run Document Processing and Retrieval ---
+# --- Retrieval Logic ---
 retriever = None
 if uploaded_files:
-    with st.spinner("📚 Processing documents..."):
+    with st.spinner("🔍 Indexing documents..."):
        docs = process_documents(uploaded_files)
        chunks = chunk_documents(docs)
        retriever = build_hybrid_retriever(chunks)
-    st.success("✅ Document processing complete.")
+    st.success("✅ Documents ready for hybrid search.")
 
-# --- Display Past Messages ---
+# --- Display Conversation ---
 for msg in st.session_state.messages:
    with st.chat_message(msg["role"], avatar=USER_AVATAR if msg["role"] == "user" else BOT_AVATAR):
        st.markdown(msg["content"])
 
-# --- Main Chat Input ---
-if prompt := st.chat_input("Ask a question..."):
-    st.chat_message("user", avatar=USER_AVATAR).markdown(prompt)
-    st.session_state.messages.append({"role": "user", "content": prompt})
+# --- Chat Input ---
+if query := st.chat_input("Ask DigiTwin anything..."):
+    st.chat_message("user", avatar=USER_AVATAR).markdown(query)
+    st.session_state.messages.append({"role": "user", "content": query})
 
     context = ""
     if retriever:
-        docs = retriever.get_relevant_documents(prompt)
+        docs = retriever.get_relevant_documents(query)
        context = "\n\n".join([doc.page_content for doc in docs])
 
-    full_prompt = build_prompt(st.session_state.messages, context=context)
+    full_prompt = build_prompt(st.session_state.messages, context)
 
    with st.chat_message("assistant", avatar=BOT_AVATAR):
-        response_container = st.empty()
+        container = st.empty()
        answer = ""
-        for chunk in generate_streaming_response(full_prompt):
+        for chunk in generate_response(full_prompt):
            answer = chunk
-            response_container.markdown(answer + "▌", unsafe_allow_html=True)
+            container.markdown(answer + "▌", unsafe_allow_html=True)
-        response_container.markdown(answer)
+        container.markdown(answer)
    st.session_state.messages.append({"role": "assistant", "content": answer})
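
A minimal standalone sketch of the FAISS + BM25 hybrid retrieval that `build_hybrid_retriever` wires up, assuming the same legacy `langchain` import paths the app uses (newer releases move these to `langchain_community` and replace `get_relevant_documents` with `invoke`); the sample documents and query are hypothetical.

```python
# Hybrid retrieval sketch: dense (FAISS) + lexical (BM25) fused by EnsembleRetriever.
# Requires langchain, faiss-cpu, rank_bm25, sentence-transformers; sample data is made up.
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever

docs = [Document(page_content=t) for t in [
    "Corrosion loops on topside piping are inspected annually.",
    "Pressure vessels follow a five-year internal inspection interval.",
    "Structural members are surveyed for coating breakdown.",
]]

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_ret = FAISS.from_documents(docs, embeddings).as_retriever(search_kwargs={"k": 2})

bm25 = BM25Retriever.from_documents(docs)  # keyword scoring, no embeddings needed
bm25.k = 2

# Equal weights mirror the app; the ensemble fuses the two rankings via reciprocal rank fusion.
hybrid = EnsembleRetriever(retrievers=[faiss_ret, bm25], weights=[0.5, 0.5])
for d in hybrid.get_relevant_documents("piping inspection interval"):
    print(d.page_content)
```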
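The streaming path (`TextIteratorStreamer` fed by a background `model.generate` thread, as compacted in `generate_response`) also works outside Streamlit; a minimal sketch, with `gpt2` as a hypothetical stand-in for the app's model, which the diff never names.

```python
# Streaming sketch mirroring generate_response: generate() blocks, so it runs in a
# thread while the main thread consumes tokens from the streamer as they arrive.
# "gpt2" is a stand-in model choice, not the one the app loads.
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("Topside piping inspection should", return_tensors="pt")

Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 30}).start()

text = ""
for token in streamer:
    text += token  # cumulative output, matching what the app's generator yields
    print(token, end="", flush=True)
```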