File size: 4,407 Bytes
ddf1044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e95282d
ddf1044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import gradio as gr
import hnswlib
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder
import os
from together import Together
from dotenv import load_dotenv
from cryptography.fernet import Fernet
import gzip
import io

# Module-level setup: everything below runs once at import time and produces
# the globals (client, corpus, biencoder, crossencoder, index, state,
# source_label) that the functions further down rely on.

# NOTE(review): presumably silences/disables Hugging Face tokenizers
# parallelism; confirm it still has an effect when set after the imports.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load environment variables via python-dotenv before API_KEY / KEY2 are read.
load_dotenv()

# Together API client used by search_summary(); api_key is None when API_KEY
# is not set in the environment.
client = Together(api_key=os.environ.get("API_KEY"))

# Read data: the corpus is stored gzip-compressed and Fernet-encrypted.
# If KEY2 is unset, os.environ.get returns None and .encode raises
# AttributeError here.
fernet = Fernet(os.environ.get("KEY2").encode("utf-8"))

with gzip.open("corpus.gz",'rb') as f:
    bytes_enc = f.read()                  # decompressed, still encrypted bytes
    pq_bytes = fernet.decrypt(bytes_enc)  # decrypted payload, read as parquet below
    pq_file = io.BytesIO(pq_bytes)        # in-memory file object for pandas
    corpus = pd.read_parquet(pq_file)     # DataFrame that search() indexes with .iloc

# Bi-encoder: embeds the query in search(); its embedding size also defines
# the dimensionality of the HNSW index below.
biencoder = SentenceTransformer("intfloat/multilingual-e5-small", device="cpu")
embedding_size = biencoder.get_sentence_embedding_dimension()

# Cross-encoder: scores (query, text chunk) pairs in search() for reranking.
crossencoder = CrossEncoder("KennethTM/MiniLM-L6-danish-reranker", device="cpu")

# Prebuilt cosine-space HNSW index loaded from disk.
# NOTE(review): search() uses the returned labels as row positions in
# `corpus` (corpus.iloc[ids]), so the index is assumed to have been built in
# corpus row order - confirm.
index = hnswlib.Index(space = 'cosine', dim = embedding_size)
index.load_index("corpus.index")
# NOTE(review): ef is hnswlib's query-time parameter; 40 is presumably chosen
# to stay above the largest k requested by search() - confirm.
index.set_ef(40)

# Hand-over between search() (writes "context" and "query") and
# search_summary() (reads them).
# NOTE(review): this is a single module-level dict, so it is shared by every
# caller in the process; concurrent users could overwrite each other's entries.
state = {}

# Display names for the values of the corpus "source" column, used by
# format_markdown().
source_label = {"wiki": "Wikipedia", "lex": "lex.dk", "mfkn": "MFKN", "dce": "DCE"}

def format_markdown(results):
    """Render search hits as a numbered Markdown reference list.

    Args:
        results: Mapping-like object (a pandas DataFrame when called from
            ``search``) whose ``"title"``, ``"source"``, ``"url"`` and
            ``"text_chunks"`` entries are iterables that are walked in
            lockstep, one element per hit.

    Returns:
        A Markdown string consisting of a ``## Referencer:`` heading followed
        by one ``###`` entry per hit (1-based numbering, linked title, source
        label and the quoted text chunk), separated by blank lines.
    """
    columns = zip(
        results["title"],
        results["source"],
        results["url"],
        results["text_chunks"],
    )

    entries = []
    for number, (title, source, url, text) in enumerate(columns, start=1):
        # A source key missing from source_label used to raise KeyError and
        # abort the whole search; show the raw key instead.
        label = source_label.get(source, source)
        entries.append(f'### {number}. [{title}]({url}) ({label}):\n"{text}"')

    return "## Referencer:\n\n" + "\n\n".join(entries)

def format_context(results):
    """Build the plain-text context block that is passed to the language model.

    Args:
        results: Mapping-like object (a pandas DataFrame when called from
            ``search``) whose ``"text_chunks"`` entry is an iterable of texts.

    Returns:
        The chunks rendered as ``Kilde <n>:`` followed by the text on the
        next line, numbered from 1 and separated by blank lines. An empty
        string when there are no chunks.
    """
    numbered_sources = (
        f"Kilde {position}:\n{chunk}"
        for position, chunk in enumerate(results["text_chunks"], start=1)
    )
    return "\n\n".join(numbered_sources)

def search(query, top_k):
    """Retrieve, rerank and format the best-matching corpus chunks for a query.

    The query is embedded with the bi-encoder, twice ``top_k`` candidates are
    fetched from the HNSW index, the candidates are rescored with the
    cross-encoder, and the ``top_k`` highest-scoring rows are kept.

    Side effect: stores the formatted context and the query in the
    module-level ``state`` dict for ``search_summary`` to pick up.

    Args:
        query: The user's question.
        top_k: Number of references to return; converted with ``int()``.

    Returns:
        The selected references rendered as Markdown by ``format_markdown``.
    """
    n_keep = int(top_k)
    # Over-fetch from the approximate index so the reranker has more to choose from.
    n_candidates = n_keep * 2

    query_vector = biencoder.encode(query, prompt = "query: ")
    labels, _distances = index.knn_query(query_vector, k = n_candidates)

    # knn_query returns one row of labels per query; a single query was sent.
    candidates = corpus.iloc[labels[0]].copy()

    pairs = [(query, chunk) for chunk in candidates["text_chunks"]]
    candidates["scores"] = crossencoder.predict(pairs)

    ranked = candidates.sort_values("scores", ascending=False)
    best = ranked[:n_keep]

    # Format both views before touching the shared state.
    references_markdown = format_markdown(best)
    llm_context = format_context(best)

    state["context"] = llm_context
    state["query"] = query

    return references_markdown

def search_summary():
    """Stream a language-model answer for the most recent search.

    Reads the ``"context"`` and ``"query"`` entries that ``search`` stored in
    the module-level ``state`` dict, sends them to the chat-completions
    endpoint with streaming enabled, and accumulates the streamed deltas.

    Yields:
        The answer text received so far, once for every chunk of the stream
        (chunks without content re-yield the unchanged text).
    """
    sources, question = state["context"], state["query"]

    system_message = {
        "role": "system",
        "content": "Svar på spørgsmålet. Anvend kilderne i konteksten hvis de kan bruges til besvarelsen. Besvar kun på dansk.",
    }
    user_message = {
        "role": "user",
        "content": f"Kontekst:\n{sources}\n\nSpørgsmål:\n{question}",
    }

    response_stream = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[system_message, user_message],
        stream=True,
        max_tokens=1024,
    )

    answer_so_far = ""
    for event in response_stream:
        piece = event.choices[0].delta.content
        if piece:
            answer_so_far += piece
        yield answer_so_far


# Gradio user interface: a question box with a reference-count selector and a
# search button, the streamed answer, and the list of references below it.
with gr.Blocks() as demo:

    # Page title and introduction.
    gr.Markdown("# Natur og miljø BOT")
    gr.Markdown(
        "Dette er en simpel spørgsmål-svar applikation indenfor Danmarks natur og miljø. "
        "Svar genereres af en sprogmodel (LLAMA-3-8B) og anvender relevante referencer i en stor samling af dokumenter. "
        "Dette er blandt andet artikler fra [Wikipedia](https://da.wikipedia.org/wiki/Forside), "
        "rapporter fra [DCE - Nationalt Center for Miljø og Energi](https://dce.au.dk/udgivelser), "
        "[lex.dk - Den Store Danske](https://denstoredanske.lex.dk/) "
        "samt sager fra [Miljø og fødevareklagenævnet](https://mfkn.naevneneshus.dk)."
    )

    # Input row: question, number of references, search button.
    with gr.Row():
        textbox = gr.Textbox(placeholder="Søg...", lines=1, scale=8, label="Spørgsmål")
        num = gr.Number(value=5, label="Referencer", scale=1, minimum=1, maximum=10)
        btn = gr.Button(value="Søg!", size="sm", scale=2)

    # Read-only box that receives the streamed answer.
    with gr.Row():
        summary = gr.Textbox(interactive=False, lines=10, label="Svar")

    # Markdown area that receives the formatted references.
    with gr.Row():
        results = gr.Markdown()

    gr.Markdown("*Applikation lavet af Kenneth Thorø Martinsen (email: [email protected])*")

    # Clicking the button and pressing Enter in the textbox run the same
    # chain: first the search, then the streamed summary.
    for trigger in (btn.click, textbox.submit):
        retrieval = trigger(fn=search, inputs=[textbox, num], outputs=results)
        retrieval.then(search_summary, inputs=None, outputs=summary)

demo.queue().launch()