import os
import math

from llama_cpp import Llama
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
    StorageContext,
    load_index_from_storage,
)
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.node_parser import TokenTextSplitter
from transformers import AutoTokenizer

# Tokenizer used only for its chat template. NOTE: the template comes from
# Qwen2.5, while the generation model below is Llama 3.2, so the prompt format
# does not match the template the model was trained with.
_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

def messages_to_prompt(messages):
    # Convert llama_index ChatMessage objects into the tokenizer's chat format.
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    return _tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

def completion_to_prompt(completion):
    # Wrap a bare completion string as a single-turn user message.
    messages = [{"role": "user", "content": completion}]
    return _tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

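# Quick sanity sketch (assumption: the sample question is illustrative).
# apply_chat_template keeps message text verbatim inside the template, so the
# original question must appear somewhere in the formatted prompt.
_example_prompt = completion_to_prompt("What is a bond yield?")
assert "What is a bond yield?" in _example_prompt
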
# Chat LLM used by llama_index for query fusion and answer synthesis.
llm = LlamaCPP(
    model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    temperature=0.1,
    max_new_tokens=128,
    context_window=16384,
    model_kwargs={"n_gpu_layers": -1, "logits_all": False},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)

# Second handle on the same GGUF model, with logits_all=True so per-token
# logits are available for scoring whole sequences.
llm2 = Llama(
    model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=8000,
    logits_all=True,
)

embedding_model = HuggingFaceEmbedding(model_name="models/all-MiniLM-L6-v2")

Settings.llm = llm
Settings.embed_model = embedding_model

def check_if_exists():
    # True when both artifacts persisted by precompute_index() are present.
    index = os.path.exists("models/precomputed_index")
    bm25 = os.path.exists("models/bm25_retriever")
    return index and bm25

def precompute_index(data_folder="data"):
    # Chunk the corpus, then persist both a vector index and a BM25 retriever.
    documents = SimpleDirectoryReader(data_folder).load_data()
    splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=50)
    nodes = splitter.get_nodes_from_documents(documents)

    index = VectorStoreIndex(nodes, verbose=True)
    index.storage_context.persist(persist_dir="models/precomputed_index")

    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=5,
        verbose=True,
    )
    bm25_retriever.persist("models/bm25_retriever")

def is_harmful(query):
    # Crude safety filter: substring match against a keyword blocklist.
    harmful_keywords = ["bomb", "kill", "weapon", "suicide", "terror", "attack"]
    return any(keyword in query.lower() for keyword in harmful_keywords)

def is_not_relevant(query, index, threshold=0.7):
    # Treat the query as off-topic when the best-matching node's similarity
    # score does not exceed the threshold.
    retriever = index.as_retriever(similarity_top_k=1)
    nodes = retriever.retrieve(query)
    if not nodes:
        # Nothing retrieved at all: let the query proceed rather than reject it.
        return False
    return nodes[0].score <= threshold

def get_sequence_probability(llm, input_sequence):
    # Score a full string under the model: tokenize, run a forward pass, and
    # accumulate per-token conditional log-probabilities.
    llm.reset()  # clear any previously evaluated tokens from the context
    input_tokens = llm.tokenize(input_sequence.encode("utf-8"))
    llm.eval(input_tokens)
    probs = llm.logits_to_logprobs(llm.eval_logits)
    total_log_prob = 0.0
    # Logits row i predicts token i + 1, so the original probs[i, token]
    # indexing was off by one; score each token against the preceding row
    # (the first token has no preceding context and is skipped).
    for i in range(1, len(input_tokens)):
        total_log_prob += probs[i - 1, input_tokens[i]]
    return math.exp(total_log_prob)
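
# Note on scale: the result is a product of per-token conditional
# probabilities, so it shrinks roughly exponentially with sequence length.
# When comparing answers of different lengths, compare total_log_prob (or a
# length-normalized average log-probability) rather than the raw product.
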
def answer_question(query):
    if is_harmful(query):
        return "This query has been flagged as unsafe."

    print("loading bm25 retriever")
    bm25_retriever = BM25Retriever.from_persist_dir("models/bm25_retriever")
    print("loading saved vector index")
    storage_context = StorageContext.from_defaults(persist_dir="models/precomputed_index")
    index = load_index_from_storage(storage_context)

    if is_not_relevant(query, index, 0.2):
        return "This query doesn't appear relevant to finance."

    # Hybrid retrieval: fuse dense (vector) and sparse (BM25) candidates.
    retriever = QueryFusionRetriever(
        [
            index.as_retriever(similarity_top_k=5, verbose=True),
            bm25_retriever,
        ],
        llm=llm,
        num_queries=1,  # disable LLM query rewriting; fuse the raw query only
        similarity_top_k=5,
        verbose=True,
    )
    # Cross-encoder reranker reorders the fused candidates before synthesis.
    reranker = SentenceTransformerRerank(
        model="cross-encoder/ms-marco-MiniLM-L-2-v2",
        top_n=5,
    )
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        node_postprocessors=[reranker],
    )
    response = query_engine.query(f"Answer in less than 100 words:\nQuery: {query}")
    response_text = str(response)

    # Confidence signal: probability of the generated answer under the model.
    response_prob = get_sequence_probability(llm2, response_text)
    print(f"Output probability: {response_prob}")
    return response_text
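
# Usage sketch (assumptions: a local "data" folder with finance documents and
# the model files referenced above exist; the sample query is illustrative).
if __name__ == "__main__":
    if not check_if_exists():
        precompute_index()
    print(answer_question("What factors can drive changes in a company's gross margin?"))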