import json

import streamlit as st

from utils.vector_base import KnowledgeBase
from utils.embedding import Embeddings
from utils.llm import LLM
from config import config
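
# The imported `config` module is expected to expose the three paths used
# below. A minimal sketch, assuming plain string attributes (the attribute
# names come from this file; the example values are placeholders):
#
#     class config:
#         PATH_METADATA = "data/metadata.json"
#         PATH_FAISS = "data/index.faiss"
#         PATH_PREPROCESSING_TEXT = "data/preprocessed_texts.json"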

def get_embedding_model():
    """Load the embedding model used to encode queries."""
    return Embeddings()


def get_llm(api_key):
    """Create the LLM client with the user-supplied API key."""
    return LLM(api_key)
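
# Design note: Streamlit re-runs the whole script on every interaction, so
# these loaders rebuild the model and client each time; wrapping them with
# the built-in @st.cache_resource decorator would keep one instance alive
# across reruns.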

def get_metadata(path):
    """Read document titles and texts from a JSON metadata file."""
    titles, texts = [], []
    with open(path, 'r', encoding='utf-8') as file:
        metadata = json.load(file)
    for data in metadata:
        titles.append(data['title'])
        texts.append(data['text'])
    return texts, titles
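
# Expected shape of the metadata file, inferred from the keys accessed above
# (the titles/texts shown here are hypothetical):
#
#     [
#         {"title": "Paper A", "text": "Full text of paper A ..."},
#         {"title": "Paper B", "text": "Full text of paper B ..."}
#     ]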

def combine_docs(indexes, texts):
    """Concatenate retrieved documents, numbering each as [1], [2], ...

    Note: `indexes` may contain duplicates when vector and BM25 search
    return the same document; each occurrence gets its own number.
    """
    result = ""
    for i, index in enumerate(indexes):
        result += " [" + str(i + 1) + "] " + texts[index]
    return result
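
# Usage sketch with toy data:
#
#     >>> combine_docs([2, 0], ["a", "b", "c"])
#     ' [1] c [2] a'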

def create_prompt(query, docs):
    """Build the RAG system prompt from the user query and retrieved documents."""
    system_prompt = f"""You are a language model integrated into a search and generation system based on relevant documents (RAG system).
Your task is to answer the user's queries based solely on the provided documents.
If the information required to answer the user's question is available in the documents, use it, and refer to the source document by its number in square brackets. For example:
"This term means such-and-such [1]."
Ensure that each citation clearly refers to the relevant document and is placed directly after the information taken from it.
If the information is not present in the documents, explain politely that it is not available; do not speculate or make up information.
Do not alter the content or meaning of the sources. Convey the information accurately and structure your response clearly, even if the formatting options are limited.

User query: {query}

Documents:
{docs}
"""
    return system_prompt

def main(query, search_types, llm_api_key):
    """Run retrieval with the selected search types and generate an answer."""
    model, llm = get_embedding_model(), get_llm(llm_api_key)
    texts, titles = get_metadata(config.PATH_METADATA)
    embedding = model.get_query_embedding(query)
    knowledge_base = KnowledgeBase(config.PATH_FAISS, config.PATH_PREPROCESSING_TEXT)

    vector_search = []
    bm25_search = []
    if "Vector" in search_types:
        # FAISS returns a batch of index arrays; take the hits for the single query.
        vector_search = knowledge_base.search_by_embedding(embedding, 5)[0].tolist()
    if "BM25" in search_types:
        bm25_search = knowledge_base.search_by_BM25(query, 5)

    docs = combine_docs(vector_search + bm25_search, texts)
    prompt = create_prompt(query, docs)
    response = llm.generate_response(prompt)
    return response, docs
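
# Launch the app locally with:
#
#     streamlit run app.py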

# Streamlit Interface
if __name__ == '__main__':
    st.title("PaperRAG")
    st.subheader("RAG system for scientific papers with selectable search types")

    query = st.text_input("Enter your query")
    search_types = st.multiselect(
        "Select search types",
        options=["Vector", "BM25"],
        default=["Vector", "BM25"],
    )
    llm_api_key = st.text_input("Cohere API Key", type="password")

    if st.button("Get Response"):
        if query and llm_api_key:
            response, docs = main(query, search_types, llm_api_key)
            st.subheader("LLM Response:")
            st.text_area("Response", value=response, height=300)
            st.subheader("Citations:")
            st.text_area("Documents", value=docs, height=300)
        else:
            st.error("Please enter both a query and an API key.")