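# Hybrid RAG question answering over a local document folder:
# fused vector + BM25 retrieval -> cross-encoder reranking -> llama.cpp
# generation, with simple keyword and relevance guardrails.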
import os
import math

from llama_cpp import Llama
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.retrievers.bm25 import BM25Retriever
from transformers import AutoTokenizer
# Chat template used to format prompts. Note: this is the Qwen2.5 template,
# while the GGUF weights below are Llama-3.2; swap in the matching Llama-3.2
# tokenizer if you want the template and model to agree.
_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

def messages_to_prompt(messages):
    """Render a list of chat messages with the tokenizer's chat template."""
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    return _tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )


def completion_to_prompt(completion):
    """Wrap a bare completion string as a single-turn user message."""
    messages = [{"role": "user", "content": completion}]
    return _tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

# Generation model, wrapped for LlamaIndex.
llm = LlamaCPP(
    model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    temperature=0.1,
    max_new_tokens=128,
    context_window=16384,
    model_kwargs={"n_gpu_layers": -1, "logits_all": False},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)

llm2 = Llama(
    model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=8000,
    logits_all=True,
)
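# Note: the same GGUF weights are loaded twice on purpose. `llm` (with
# logits_all disabled) handles generation through LlamaIndex, while `llm2`
# keeps logits for every position so get_sequence_probability() below can
# score the generated answer.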

embedding_model = HuggingFaceEmbedding(model_name="models/all-MiniLM-L6-v2")

Settings.llm = llm
Settings.embed_model = embedding_model

def check_if_exists():
    """Return True when both the vector index and BM25 retriever are persisted."""
    index = os.path.exists("models/precomputed_index")
    bm25 = os.path.exists("models/bm25_retriever")
    return index and bm25

def precompute_index(data_folder="data"):
    """Chunk the documents once and persist both retrieval backends."""
    documents = SimpleDirectoryReader(data_folder).load_data()
    splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=50)
    nodes = splitter.get_nodes_from_documents(documents)

    # Dense index over the token chunks.
    index = VectorStoreIndex(nodes, verbose=True)
    index.storage_context.persist(persist_dir="models/precomputed_index")

    # Sparse (keyword) retriever over the same chunks.
    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=5,
        verbose=True,
    )
    bm25_retriever.persist("models/bm25_retriever")

def is_harmful(query):
    """Very coarse keyword-based safety filter."""
    harmful_keywords = ["bomb", "kill", "weapon", "suicide", "terror", "attack"]
    return any(keyword in query.lower() for keyword in harmful_keywords)

def is_not_relevant(query, index, threshold=0.7):
    """Flag the query as off-topic when the best match scores at or below threshold."""
    retriever = index.as_retriever(similarity_top_k=1)
    nodes = retriever.retrieve(query)
    if not nodes:
        # Nothing retrieved at all: treat the query as not relevant.
        return True
    return nodes[0].score <= threshold

def get_sequence_probability(llm, input_sequence):
    """Score a string under the model: product of per-token conditional probabilities."""
    llm.reset()  # clear any state left over from a previous call
    input_tokens = llm.tokenize(input_sequence.encode("utf-8"))
    llm.eval(input_tokens)
    # Row i of eval_logits is the distribution *after* token i, i.e. the
    # prediction for token i + 1, so token i + 1 is scored by row i.
    logprobs = llm.logits_to_logprobs(llm.eval_logits)
    total_log_prob = 0.0
    for i, token in enumerate(input_tokens[1:]):
        total_log_prob += logprobs[i, token]
    return math.exp(total_log_prob)
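# Caveat: exp(total_log_prob) underflows toward zero for long sequences, so
# the raw log-probability (or a perplexity) is usually the more stable score.
# Illustrative call (the text here is just a made-up example):
#   p = get_sequence_probability(llm2, "The market closed higher today.")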

def answer_question(query):
    if is_harmful(query):
        return "This query has been flagged as unsafe."

    print("loading bm25 retriever")
    bm25_retriever = BM25Retriever.from_persist_dir("models/bm25_retriever")

    print("loading saved vector index")
    storage_context = StorageContext.from_defaults(persist_dir="models/precomputed_index")
    index = load_index_from_storage(storage_context)

    if is_not_relevant(query, index, 0.2):
        return "This query doesn't appear relevant to finance."

    # Fuse dense and sparse retrieval results into a single candidate list.
    retriever = QueryFusionRetriever(
        [
            index.as_retriever(similarity_top_k=5, verbose=True),
            bm25_retriever,
        ],
        llm=llm,
        num_queries=1,  # disable query generation; use the query as-is
        similarity_top_k=5,
        verbose=True,
    )

    # Re-rank the fused candidates with a cross-encoder before synthesis.
    reranker = SentenceTransformerRerank(
        model="cross-encoder/ms-marco-MiniLM-L-2-v2",
        top_n=5,
    )
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        node_postprocessors=[reranker],
    )

    response = query_engine.query(f"Answer in less than 100 words: \nQuery: {query}")
    response_text = str(response)

    # Score the generated answer under the raw model as a rough confidence signal.
    response_prob = get_sequence_probability(llm2, response_text)
    print(f"Output probability: {response_prob}")
    return response_text
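
# Minimal usage sketch (an assumption about how this module is driven; the
# original file does not include an entry point, and the question below is
# only a made-up example):
if __name__ == "__main__":
    if not check_if_exists():
        precompute_index("data")  # one-time build of both retrieval backends
    print(answer_question("What drives changes in a company's gross margin?"))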