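"""Local RAG pipeline built on LlamaIndex: hybrid (vector + BM25) retrieval with
cross-encoder reranking, simple safety and relevance gates, and a sequence-
probability score for the generated answer computed via llama-cpp-python."""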
import math
import os

from llama_cpp import Llama
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.retrievers.bm25 import BM25Retriever
from transformers import AutoTokenizer

# The chat template must match the model actually being served. The GGUF below
# is Llama-3.2-1B-Instruct, so load its tokenizer (a gated repo on HF, so this
# may require authentication) rather than an unrelated model's template.
_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")


def messages_to_prompt(messages):
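    """Render LlamaIndex ChatMessage objects through the model's chat template."""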
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = _tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt


def completion_to_prompt(completion):
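    """Wrap a bare completion string as a single user turn and apply the chat template."""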
    messages = [{"role": "user", "content": completion}]
    prompt = _tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt


llm = LlamaCPP(
    model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    temperature=0.1,
    max_new_tokens=128,
    context_window=16384,
    model_kwargs={"n_gpu_layers": -1, "logits_all": False},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)

# A second, raw llama-cpp handle to the same GGUF. logits_all=True keeps the
# logits for every evaluated position, which get_sequence_probability needs.
llm2 = Llama(
    model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=8000,
    logits_all=True,
)


# Local MiniLM embeddings; register the LLM and embedder globally so every
# LlamaIndex component picks them up by default.
embedding_model = HuggingFaceEmbedding(model_name="models/all-MiniLM-L6-v2")
Settings.llm = llm
Settings.embed_model = embedding_model


def check_if_exists():
    """Return True when both the vector index and the BM25 retriever are persisted."""
    index = os.path.exists("models/precomputed_index")
    bm25 = os.path.exists("models/bm25_retriever")
    return index and bm25


def precompute_index(data_folder="data"):
    """Chunk the corpus, then build and persist the vector index and BM25 retriever."""
    documents = SimpleDirectoryReader(data_folder).load_data()
    splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=50)
    nodes = splitter.get_nodes_from_documents(documents)
    index = VectorStoreIndex(nodes, show_progress=True)
    index.storage_context.persist(persist_dir="models/precomputed_index")
    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=5,
        verbose=True,
    )
    bm25_retriever.persist("models/bm25_retriever")

def is_harmful(query):
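    """Naive keyword-based safety gate (substring match, so it can over-trigger)."""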
    harmful_keywords = ["bomb", "kill", "weapon", "suicide", "terror", "attack"]
    return any(keyword in query.lower() for keyword in harmful_keywords)


def is_not_relevant(query, index, threshold=0.7):
    """Return True when the best-matching node scores at or below threshold.

    An empty retrieval result (or a missing score) is treated as not relevant,
    since the query cannot be grounded in the corpus at all.
    """
    retriever = index.as_retriever(similarity_top_k=1)
    nodes = retriever.retrieve(query)
    if not nodes or nodes[0].score is None:
        return True
    return nodes[0].score <= threshold


def get_sequence_probability(llm, input_sequence):
    """Score a text under the model as the product of next-token probabilities."""
    llm.reset()  # clear any KV-cache state left over from a previous call
    input_tokens = llm.tokenize(input_sequence.encode("utf-8"))
    llm.eval(input_tokens)
    probs = llm.logits_to_logprobs(llm.eval_logits)
    # Row i of the logits predicts token i + 1, so pair each token (from the
    # second onward) with the logprobs produced at the previous position.
    total_log_prob = 0.0
    for i in range(1, len(input_tokens)):
        total_log_prob += probs[i - 1, input_tokens[i]]
    # Note: exp() underflows to 0.0 for long sequences; the raw log-probability
    # is the more stable score if that matters downstream.
    return math.exp(total_log_prob)

    
def answer_question(query):
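    """Answer a query end-to-end: safety gate, relevance gate, hybrid retrieval,
    cross-encoder reranking, generation, and a sequence-probability score."""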
    if is_harmful(query):
        return "This query has been flagged as unsafe."

    print("loading bm25 retriever")
    bm25_retriever = BM25Retriever.from_persist_dir("models/bm25_retriever")
    print("loading saved vector index")
    storage_context = StorageContext.from_defaults(persist_dir="models/precomputed_index")
    index = load_index_from_storage(storage_context)
    
    # threshold=0.2 is deliberately low: it only rejects queries whose best
    # match has almost no similarity to the indexed finance corpus.
    if is_not_relevant(query, index, threshold=0.2):
        return "This query doesn't appear relevant to finance."
    
    # Hybrid retrieval: fuse dense (vector) and sparse (BM25) results.
    # num_queries=1 skips LLM-based query generation and just merges the lists.
    retriever = QueryFusionRetriever(
        [
            index.as_retriever(similarity_top_k=5, verbose=True),
            bm25_retriever,
        ],
        llm=llm,
        num_queries=1,
        similarity_top_k=5,
        verbose=True,
    )
    # Cross-encoder reranker re-scores the fused candidates against the query.
    reranker = SentenceTransformerRerank(
        model="cross-encoder/ms-marco-MiniLM-L-2-v2",
        top_n=5,
    )
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        node_postprocessors=[reranker],
    )
    response = query_engine.query(f"Answer in less than 100 words:\nQuery: {query}")
    response_text = str(response)
    # Unconditional sequence probability of the answer text under the raw model;
    # a rough fluency/confidence signal rather than a calibrated probability.
    response_prob = get_sequence_probability(llm2, response_text)
    print(f"Output probability: {response_prob}")
    return response_text
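

if __name__ == "__main__":
    # Usage sketch: assumes a local "data/" folder of finance documents exists
    # for the first run; the example query is illustrative, not from the corpus.
    if not check_if_exists():
        precompute_index("data")
    print(answer_question("What drives changes in a company's operating margin?"))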