File size: 1,448 Bytes
92808fd
 
 
69427c3
7793af4
 
 
 
d6ea168
7793af4
d6ea168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92808fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ff711c
92808fd
 
 
 
 
 
 
 
 
 
f0211bc
92808fd
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import numpy as np



def query_pinecone(
    dense_vec,
    top_k,
    index,
    indices=None
):
    """Query a vector index for the nearest records flagged as answers.

    Args:
        dense_vec: Query vector, forwarded as ``vector=`` to ``index.query``.
        top_k: Maximum number of matches to request.
        index: Object exposing ``query(vector=, top_k=, filter=,
            include_metadata=)`` and returning a mapping with a ``"matches"``
            entry (presumably a Pinecone index -- confirm against caller).
        indices: Optional collection of values; when given, matches are
            further restricted to records whose ``"index"`` metadata field
            is in this collection. It is forwarded unchanged.

    Returns:
        The ``"matches"`` entry of the query response.
    """
    # Every query is restricted to records whose QA_Flag metadata is "Answer".
    metadata_filter = {"QA_Flag": {"$eq": "Answer"}}

    # Identity check rather than `!= None`: comparing a NumPy array with `!=`
    # yields an elementwise array whose truth value is ambiguous and raises.
    if indices is not None:
        metadata_filter["index"] = {"$in": indices}

    response = index.query(
        vector=dense_vec,
        top_k=top_k,
        filter=metadata_filter,
        include_metadata=True,
    )
    return response["matches"]


def format_query(query_results):
    """Reduce search matches to ``(text, score)`` pairs.

    Args:
        query_results: Iterable of match mappings, each providing
            ``match["metadata"]["Text"]`` and ``match["score"]``.

    Returns:
        A list of ``(text, score)`` tuples in the same order as the input.
    """
    pairs = []
    for match in query_results:
        passage = match["metadata"]["Text"]
        pairs.append((passage, match["score"]))
    return pairs


def format_context(context):
    """Render ``(text, score)`` pairs as display strings.

    Args:
        context: Iterable of two-item ``(text, score)`` entries.

    Returns:
        A list with one string per entry, showing the text followed by a
        blank line and the score labelled "Cosine Similarity".
    """
    return [
        f"Text: {passage}\n\nCosine Similarity: {similarity}"
        for passage, similarity in context
    ]


def get_bm25_search_hits(corpus, sparse_scores, top_n=50):
    """Return the first ``top_n`` entries of ``sparse_scores`` as plain ints.

    Args:
        corpus: Unused by the result; kept so existing callers that pass it
            positionally keep working.
        sparse_scores: Iterable of index-like values (anything ``int()``
            accepts), consumed in its own iteration order. Presumably this is
            already ranked best-first -- confirm against caller.
        top_n: Maximum number of indices to return.

    Returns:
        A list of at most ``top_n`` Python ``int`` values.
    """
    indices = []
    for idx in sparse_scores:
        # Strict bound: the previous `<=` comparison let top_n + 1 items in.
        if len(indices) >= top_n:
            break  # nothing further can be accepted, so stop scanning
        indices.append(int(idx))
    return indices


def retrieve_transcript(path="2020-Apr-28-AMD.txt"):
    """Read a transcript file and return its full contents as a string.

    Args:
        path: Location of the transcript. Defaults to ``2020-Apr-28-AMD.txt``,
            resolved relative to the current working directory, which is the
            file this function always read before the parameter existed.

    Returns:
        The entire text of the file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # The context manager closes the handle even if read() raises; the file
    # was previously opened and never closed.
    with open(path, "r") as transcript_file:
        return transcript_file.read()