import json

import streamlit as st

from utils.vector_base import KnowledgeBase
from utils.embedding import Embeddings
from utils.llm import LLM
from config import config
def get_embedding_model():
    return Embeddings()


def get_llm(api_key):
    return LLM(api_key)
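
# Note: these factories are re-created on every Streamlit rerun. A minimal
# caching sketch, assuming Streamlit >= 1.18 (which provides st.cache_resource):
#
#     @st.cache_resource
#     def get_embedding_model():
#         return Embeddings()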

def get_metadata(path):
    """Load document titles and texts from the metadata JSON file."""
    titles, texts = [], []
    with open(path, 'r', encoding='utf-8') as file:
        metadata = json.load(file)
    for data in metadata:
        titles.append(data['title'])
        texts.append(data['text'])
    return texts, titles
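
# The metadata file is expected to be a JSON array of objects with at least
# "title" and "text" keys (inferred from the fields read above), e.g.:
#
#     [
#         {"title": "Paper 1", "text": "First text chunk ..."},
#         {"title": "Paper 2", "text": "Second text chunk ..."}
#     ]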

def combine_docs(indexes, texts):
    """Concatenate the retrieved documents, numbering each one for citation."""
    result = ""
    for i, index in enumerate(indexes):
        result += f" [{i + 1}] {texts[index]}"
    return result
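
# Example: combine_docs([4, 2], texts) yields " [1] <texts[4]> [2] <texts[2]>",
# so citation numbers follow the order of the retrieved indexes, not the
# original document order.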

def create_prompt(query, docs):
    """Build the system prompt that restricts the LLM to the retrieved documents."""
    system_prompt = f"""You are a language model integrated into a retrieval-augmented generation (RAG) system that answers queries from relevant documents.
Your task is to answer the user's queries based solely on the provided documents.
If the information required to answer the user's question is available in the documents, use it, and cite the document it came from by giving its number in square brackets. For example:
"This term means such-and-such [1]."
Ensure that each citation clearly refers to the relevant document and is placed directly after the information from that source.
If the information is not present in the documents, politely explain that it is not available; do not speculate or make up information.
Do not alter the content or meaning of the sources. Convey the information accurately and structure your response clearly, even if the formatting options are limited.

User query: {query}

Documents:
{docs}
"""
    return system_prompt

def main(query, search_types, llm_api_key):
    """Retrieve documents with the selected search types and generate a cited answer."""
    model, llm = get_embedding_model(), get_llm(llm_api_key)
    texts, titles = get_metadata(config.PATH_METADATA)
    embedding = model.get_query_embedding(query)
    knowledge_base = KnowledgeBase(config.PATH_FAISS, config.PATH_PREPROCESSING_TEXT)

    # Run only the search types the user selected, taking the top 5 hits each.
    vector_search = []
    bm25_search = []
    if "Vector" in search_types:
        vector_search = knowledge_base.search_by_embedding(embedding, 5)[0].tolist()
    if "BM25" in search_types:
        bm25_search = knowledge_base.search_by_BM25(query, 5)

    docs = combine_docs(vector_search + bm25_search, texts)
    prompt = create_prompt(query, docs)
    response = llm.generate_response(prompt)
    return response, docs
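
# Note: vector and BM25 hits are concatenated as-is, so a document returned by
# both searches is numbered twice in the context. A minimal order-preserving
# dedup sketch, if that becomes an issue:
#
#     merged = list(dict.fromkeys(vector_search + bm25_search))
#     docs = combine_docs(merged, texts)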

# Streamlit interface
if __name__ == '__main__':
    st.title("PaperRAG")
    st.subheader("RAG system for scientific papers with selectable search types")

    query = st.text_input("Enter your query")
    search_types = st.multiselect(
        "Select search types",
        options=["Vector", "BM25"],
        default=["Vector", "BM25"]
    )
    llm_api_key = st.text_input("Cohere API Key", type="password")

    if st.button("Get Response"):
        if query and llm_api_key:
            response, docs = main(query, search_types, llm_api_key)
            st.subheader("LLM Response:")
            st.text_area("Response", value=response, height=300)
            st.subheader("Citations:")
            st.text_area("Documents", value=docs, height=300)
        else:
            st.error("Please enter both a query and an API key.")
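
# To try the app locally (assuming this file is saved as app.py):
#     streamlit run app.py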