import pathlib

import streamlit as st
from huggingface_hub import hf_hub_download
from langchain_community.llms import LlamaCpp
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


@st.cache_resource()
def load_llm(repo_id, filename):
    # Create a directory for models if it doesn't exist
    models_folder = pathlib.Path("models")
    models_folder.mkdir(exist_ok=True)

    # Download the GGUF weights from the Hugging Face Hub into the models folder
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir=models_folder,
    )

    # Load the downloaded model with llama.cpp via LangChain's LlamaCpp wrapper
    # (LlamaCpp only needs the local model_path, not the Hub repo_id/filename)
    llm = LlamaCpp(
        model_path=model_path,
        verbose=False,
        use_mmap=True,
        use_mlock=True,
        n_threads=4,
        n_threads_batch=4,
        n_ctx=8000,
    )
    print(f"{repo_id} loaded successfully. ✅")
    return llm


# Build a RAG chain over the retriever and answer the question
def response_generator(llm, messages, question, retriever):
    system_prompt = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        "{context}"
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("user", "{input}"),
        ]
    )

    # Stuff the retrieved documents into the prompt and run the LLM over them
    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)

    # The result is a dict with "input", "context", and "answer" keys
    results = rag_chain.invoke({"input": question})
    return results
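

# --- Usage sketch (not part of the original listing) ---
# A minimal illustration of how load_llm() and response_generator() could be
# wired into a Streamlit chat UI. The GGUF repo/filename placeholders and the
# tiny FAISS retriever built from sample texts are assumptions for this sketch;
# a real app would point at an actual GGUF model and build its retriever from
# its own documents.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

MODEL_REPO_ID = "your-org/your-model-GGUF"  # placeholder Hub repo with GGUF weights
MODEL_FILENAME = "your-model.Q4_K_M.gguf"   # placeholder GGUF filename


@st.cache_resource()
def load_retriever():
    # In-memory FAISS index over a couple of sample texts, purely for demonstration
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    texts = [
        "LlamaCpp runs GGUF-quantized models locally on the CPU.",
        "create_retrieval_chain combines a retriever with a documents chain.",
    ]
    return FAISS.from_texts(texts, embeddings).as_retriever()


llm = load_llm(MODEL_REPO_ID, MODEL_FILENAME)
retriever = load_retriever()

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if question := st.chat_input("Ask a question about your documents"):
    st.session_state.messages.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.markdown(question)

    # Run the RAG chain and show the generated answer
    results = response_generator(llm, st.session_state.messages, question, retriever)
    answer = results["answer"]

    with st.chat_message("assistant"):
        st.markdown(answer)
    st.session_state.messages.append({"role": "assistant", "content": answer})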