from collections import defaultdict
import json

from langchain.schema import Document
import streamlit as st

import utils_mod


def group_docs(docs: list[Document]) -> list[tuple[str, list[Document]]]:
    """Group and sort docs.

    Docs are grouped by legis_id. Within each legis_id group, docs are
    sorted by start_index. The groups themselves are sorted by number of
    docs (descending), with legis_id as a tiebreaker:

    doc_grps = [
        (legis_id, start_index sorted docs),  # group with the most docs
        (legis_id, start_index sorted docs),
        ...
        (legis_id, start_index sorted docs),  # group with the fewest docs
    ]
    """
    doc_grps = defaultdict(list)

    # create legis_id groups
    for doc in docs:
        doc_grps[doc.metadata["legis_id"]].append(doc)

    # sort docs in each group by start index
    for legis_id in doc_grps:
        doc_grps[legis_id] = sorted(
            doc_grps[legis_id],
            key=lambda x: x.metadata["start_index"],
        )

    # sort groups by number of docs (desc), then by legis_id
    doc_grps = sorted(
        doc_grps.items(),
        key=lambda x: (
            -len(x[1]),  # number of chunks in the group
            x[0],  # legis_id, for a deterministic order
        ),
    )

    return doc_grps
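
# A minimal sketch of group_docs on hypothetical input (metadata values
# are illustrative only):
#
#     docs = [
#         Document(page_content="a", metadata={"legis_id": "118-hr-2", "start_index": 500}),
#         Document(page_content="b", metadata={"legis_id": "118-s-9", "start_index": 0}),
#         Document(page_content="c", metadata={"legis_id": "118-hr-2", "start_index": 0}),
#     ]
#     group_docs(docs)
#     # -> [("118-hr-2", [<c>, <a>]), ("118-s-9", [<b>])]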


def format_docs(docs: list[Document]) -> str:
    """JSON grouped"""

    doc_grps = group_docs(docs)
    out = []
    for legis_id, doc_grp in doc_grps:
        dd = {
            "legis_id": legis_id,
            "title": doc_grp[0].metadata["title"],
            "introduced_date": doc_grp[0].metadata["introduced_date"],
            "sponsor": doc_grp[0].metadata["sponsor_full_name"],
            "snippets": [doc.page_content for doc in doc_grp],
        }
        out.append(dd)
    return json.dumps(out, indent=4)
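
# A sketch of the JSON that format_docs produces (values illustrative):
#
#     [
#         {
#             "legis_id": "118-hr-2",
#             "title": "...",
#             "introduced_date": "...",
#             "sponsor": "...",
#             "snippets": ["...", "..."]
#         },
#         ...
#     ]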


def render_doc_grp(legis_id: str, doc_grp: list[Document]):
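    """Render one legis_id group of chunks as an expandable block."""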
    first_doc = doc_grp[0]

    congress_gov_url = utils_mod.get_congress_gov_url(
        first_doc.metadata["congress_num"],
        first_doc.metadata["legis_type"],
        first_doc.metadata["legis_num"],
    )
    congress_gov_link = f"[congress.gov]({congress_gov_url})"

    ref = "{} chunks from {}\n\n{}\n\n{}\n\n[{} ({}) ]({})".format(
        len(doc_grp),
        first_doc.metadata["legis_id"],
        first_doc.metadata["title"],
        congress_gov_link,
        first_doc.metadata["sponsor_full_name"],
        first_doc.metadata["sponsor_bioguide_id"],
        utils_mod.get_sponsor_url(first_doc.metadata["sponsor_bioguide_id"]),
    )
    doc_contents = [
        "[start_index={}] ".format(int(doc.metadata["start_index"])) + doc.page_content
        for doc in doc_grp
    ]
    with st.expander(ref):
        st.write(utils_mod.escape_markdown("\n\n...\n\n".join(doc_contents)))


def render_retrieved_chunks(docs: list[Document], tag: str | None = None):
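    """Render retrieved docs grouped by legis_id inside a bordered container."""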
    header = "Retrieved Chunks" if tag is None else f"Retrieved Chunks ({tag})"
    with st.container(border=True):
        doc_grps = group_docs(docs)
        st.write(f"{header}\n\nleft click to expand, right click to follow links")
        for legis_id, doc_grp in doc_grps:
            render_doc_grp(legis_id, doc_grp)
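
# A minimal sketch of how these helpers might be wired into an app
# (hypothetical retriever; assumes each returned Document carries the
# metadata fields used above):
#
#     docs = retriever.invoke(query)
#     render_retrieved_chunks(docs, tag="vectorstore")
#     prompt_context = format_docs(docs)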