amiguel committed
Commit 95dae9c · verified · 1 Parent(s): 47a0ae1

Update app.py

Files changed (1)
  1. app.py +73 -97
app.py CHANGED
@@ -1,3 +1,4 @@
+
 import streamlit as st
 import torch
 import os
@@ -8,7 +9,10 @@ from langchain_community.document_loaders import PyPDFLoader, TextLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
+from langchain.retrievers import BM25Retriever
+from langchain.retrievers import EnsembleRetriever
 from langchain.schema import Document
+from langchain.docstore.document import Document as LangchainDocument
 
 # --- HF Token ---
 HF_TOKEN = st.secrets["HF_TOKEN"]
@@ -21,111 +25,83 @@ st.title("📂 DigiTs the Twin")
 with st.sidebar:
     st.header("📄 Upload Knowledge Files")
     uploaded_files = st.file_uploader("Upload PDFs or .txt files", accept_multiple_files=True, type=["pdf", "txt"])
-    if uploaded_files:
-        st.success(f"{len(uploaded_files)} file(s) uploaded")
+    hybrid_toggle = st.checkbox("🔀 Enable Hybrid Search", value=True)
 
 # --- Model Loading ---
 @st.cache_resource
 def load_model():
-    tokenizer = AutoTokenizer.from_pretrained("amiguel/GM_Qwen1.8B_Finetune", trust_remote_code=True, token=HF_TOKEN)
-    model = AutoModelForCausalLM.from_pretrained(
-        "amiguel/GM_Qwen1.8B_Finetune",
-        device_map="auto",
-        torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32,
-        trust_remote_code=True,
-        token=HF_TOKEN
-    )
-    return model, tokenizer
-
-model, tokenizer = load_model()
-
-# --- Prompt Helper ---
-SYSTEM_PROMPT = (
-    "You are DigiTwin, a digital expert and senior topside engineer specializing in inspection and maintenance "
-    "of offshore piping systems, structural elements, mechanical equipment, floating production units, pressure vessels "
-    "(with emphasis on Visual Internal Inspection - VII), and pressure safety devices (PSDs). Rely on uploaded documents "
-    "and context to provide practical, standards-driven, and technically accurate responses. Your guidance reflects deep "
-    "field experience, industry regulations, and proven methodologies in asset integrity and reliability engineering."
-)
-
-def build_prompt(messages, context=""):
-    prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}\n\nContext:\n{context}<|im_end|>\n"
-    for msg in messages:
-        role = msg["role"]
-        prompt += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
-    prompt += "<|im_start|>assistant\n"
-    return prompt
-
-# --- RAG Embedding and Search ---
-@st.cache_resource
-def embed_uploaded_files(files):
-    raw_docs = []
-    for f in files:
-        file_path = f"/tmp/{f.name}"
-        with open(file_path, "wb") as out_file:
-            out_file.write(f.read())
-
-        loader = PyPDFLoader(file_path) if f.name.endswith(".pdf") else TextLoader(file_path)
-        raw_docs.extend(loader.load())
-
-    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
-    chunks = splitter.split_documents(raw_docs)
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    db = FAISS.from_documents(chunks, embedding=embeddings)
-    return db
-
-retriever = embed_uploaded_files(uploaded_files) if uploaded_files else None
-
-# --- Streaming Response ---
-def generate_response(prompt_text):
+    model_id = "tiiuae/falcon-7b-instruct"
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", token=HF_TOKEN)
+    return tokenizer, model
+
+tokenizer, model = load_model()
+
+# --- Document Processing ---
+def process_documents(files):
+    documents = []
+    for file in files:
+        if file.name.endswith(".pdf"):
+            loader = PyPDFLoader(file)
+        else:
+            loader = TextLoader(file)
+        docs = loader.load()
+        documents.extend(docs)
+    return documents
+
+def chunk_documents(documents):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    return splitter.split_documents(documents)
+
+# --- Embedding and Retrieval ---
+def build_retrievers(chunks):
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    faiss_vectorstore = FAISS.from_documents(chunks, embeddings)
+    faiss_retriever = faiss_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+
+    bm25_retriever = BM25Retriever.from_documents([LangchainDocument(page_content=d.page_content) for d in chunks])
+    bm25_retriever.k = 5
+
+    ensemble = EnsembleRetriever(retrievers=[faiss_retriever, bm25_retriever], weights=[0.5, 0.5])
+    return faiss_retriever, ensemble
+
+# --- Inference ---
+def generate_answer(query, retriever):
+    docs = retriever.get_relevant_documents(query)
+    context = "\n".join([doc.page_content for doc in docs])
+
+    system_prompt = (
+        "You are DigiTwin, an expert advisor in asset integrity, reliability, inspection, and maintenance "
+        "of topside piping, structural, mechanical systems, floating units, pressure vessels (VII), and pressure safety devices (PSD's). "
+        "Use the context below to answer professionally.\n\nContext:\n" + context + "\n\nQuery: " + query + "\nAnswer:"
+    )
+
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
-    thread = Thread(target=model.generate, kwargs={
-        "input_ids": inputs["input_ids"],
-        "attention_mask": inputs["attention_mask"],
-        "max_new_tokens": 1024,
-        "temperature": 0.7,
-        "top_p": 0.9,
-        "repetition_penalty": 1.1,
-        "do_sample": True,
-        "streamer": streamer
-    })
+    inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)
+    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=300)
+
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-    return streamer
-
-# --- Avatars & Messages ---
-USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
-BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
-
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-
-for msg in st.session_state.messages:
-    avatar = USER_AVATAR if msg["role"] == "user" else BOT_AVATAR
-    with st.chat_message(msg["role"], avatar=avatar):
-        st.markdown(msg["content"])
-
-# --- Chat UI ---
-if prompt := st.chat_input("Ask something based on uploaded documents..."):
-    st.chat_message("user", avatar=USER_AVATAR).markdown(prompt)
-    st.session_state.messages.append({"role": "user", "content": prompt})
-
-    context = ""
-    if retriever:
-        docs = retriever.similarity_search(prompt, k=3)
-        context = "\n\n".join([d.page_content for d in docs])
-
-    full_prompt = build_prompt(st.session_state.messages, context=context)
-
-    with st.chat_message("assistant", avatar=BOT_AVATAR):
-        start_time = time.time()
-        streamer = generate_response(full_prompt)
-        container = st.empty()
-        answer = ""
-        for chunk in streamer:
-            answer += chunk
-            container.markdown(answer + "▌", unsafe_allow_html=True)
-        container.markdown(answer)
-        st.session_state.messages.append({"role": "assistant", "content": answer})
+
+    answer = ""
+    for token in streamer:
+        answer += token
+        yield answer
+
+# --- Main App ---
+if uploaded_files:
+    with st.spinner("Processing documents..."):
+        docs = process_documents(uploaded_files)
+        chunks = chunk_documents(docs)
+        faiss_retriever, hybrid_retriever = build_retrievers(chunks)
+    st.success("Documents processed successfully.")
+
+    query = st.text_input("🔍 Ask a question based on the uploaded documents")
+    if query:
+        st.subheader("📀 Answer")
+        retriever = hybrid_retriever if hybrid_toggle else faiss_retriever
+        response_placeholder = st.empty()
+        full_response = ""
+        for partial_response in generate_answer(query, retriever):
+            full_response = partial_response
+            response_placeholder.markdown(full_response)
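
Note on the new process_documents: st.file_uploader returns in-memory UploadedFile objects, while PyPDFLoader and TextLoader expect a filesystem path, so PyPDFLoader(file) will fail at load time. The previous revision sidestepped this by writing each upload to /tmp before loading. A minimal sketch of that fix, reusing the loaders already imported in app.py:

def process_documents(files):
    documents = []
    for file in files:
        # Persist the upload first: the loaders take a path, not a buffer.
        file_path = f"/tmp/{file.name}"
        with open(file_path, "wb") as out_file:
            out_file.write(file.read())
        loader = PyPDFLoader(file_path) if file.name.endswith(".pdf") else TextLoader(file_path)
        documents.extend(loader.load())
    return documents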
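
For intuition on the hybrid toggle: EnsembleRetriever reranks the combined BM25 and FAISS results with weighted Reciprocal Rank Fusion, using the weights passed in build_retrievers ([0.5, 0.5] here). Below is a standalone sketch of that fusion idea, assuming string document ids and the conventional smoothing constant c=60; the library's own implementation differs in its details:

def weighted_rrf(ranked_lists, weights, c=60):
    # score(doc) = sum over retrievers of weight / (c + rank of doc in that list)
    scores = {}
    for ranking, weight in zip(ranked_lists, weights):
        for rank, doc in enumerate(ranking, start=1):
            scores[doc] = scores.get(doc, 0.0) + weight / (c + rank)
    # Highest fused score first
    return sorted(scores, key=scores.get, reverse=True)

# Example: fuse top-3 ids from BM25 and FAISS with equal weights, as in the app
print(weighted_rrf([["d1", "d2", "d3"], ["d2", "d4", "d1"]], [0.5, 0.5]))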