File size: 2,141 Bytes
53ffb10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import json
import random

import streamlit as st
from datasets import load_from_disk


class Visualization:
    """Streamlit page used to browse the documents of a web documents dataset.

    The page loads a dataset from disk, keeps only its first documents (the
    number is chosen by the user), lets the user pick one of them by index or
    at random, and renders the texts and images of the selected document.
    """

    def __init__(self, path_web_documents_dataset):
        """Remember where the dataset lives; nothing is loaded yet.

        Args:
            path_web_documents_dataset: Path handed to ``load_from_disk`` when
                the page is rendered.
        """
        self.path_web_documents_dataset = path_web_documents_dataset
        # Filled in by `load_dataset` and `choose_document` respectively.
        self.dataset = None
        self.current_doc = None

    def visualization(self):
        """Render the whole page: title, size picker, document picker, document."""
        self.set_title()
        self.load_dataset()
        self.choose_document()
        self.display_document()

    def set_title(self):
        """Display the page title."""
        st.title("Visualization of web documents")

    @staticmethod
    def _capped_size(requested, available):
        """Return how many rows can be kept: `requested`, capped to `available`.

        The result is never negative, so it can be passed directly to `range`.
        """
        return max(0, min(requested, available))

    def load_dataset(self):
        """Load the dataset from disk and keep only its first documents.

        The number of documents to keep is chosen by the user in a select box.
        When the dataset holds fewer documents than the chosen size, all of
        them are kept instead of failing on an out-of-range selection.
        """
        st.header("Select the size of the dataset")

        self.dataset = load_from_disk(self.path_web_documents_dataset)

        opt_sizes = ["100", "300", "1000", "3000"]
        size_dataset = st.selectbox(
            "Select the size of the dataset",
            options=opt_sizes,
        )

        # Never ask for more rows than the dataset actually contains.
        num_docs = self._capped_size(int(size_dataset), self.dataset.num_rows)
        self.dataset = self.dataset.select(range(num_docs))

    def choose_document(self):
        """Let the user select a document and store it in `self.current_doc`.

        The index defaults to 0, or to a random valid index when the
        "random document" button was clicked during this run. With an empty
        dataset there is nothing to select: a warning is shown and
        `self.current_doc` is set to None.
        """
        st.header("Choose a document")

        num_rows = self.dataset.num_rows
        if num_rows == 0:
            # An index widget over an empty range would be invalid
            # (its maximum would be below its minimum).
            st.warning("The dataset is empty, there is no document to display")
            self.current_doc = None
            return

        last_idx = num_rows - 1
        if st.button("Select a random document"):
            dct_idx = random.randint(a=0, b=last_idx)
        else:
            dct_idx = 0
        idx = st.number_input(
            f"Select a document among the first {num_rows} ones",
            min_value=0,
            max_value=last_idx,
            value=dct_idx,
            step=1,
            help=f"Index between 0 and {last_idx}",
        )
        self.current_doc = self.dataset[idx]

    def display_document(self):
        """Render the selected document, item by item.

        The document's "texts", "images" and JSON-decoded "metadata" entries
        are walked in lockstep. For each position, a non-empty text is shown
        as plain text; otherwise, if an image is present, it is shown through
        a markdown image link built from the "src" field of its metadata.
        Nothing is rendered when no document is selected.
        """
        st.header("Document")
        if self.current_doc is None:
            return

        texts = self.current_doc["texts"]
        images = self.current_doc["images"]
        # "metadata" is stored as a JSON string; once decoded it is iterated
        # alongside the texts and images.
        metadata = json.loads(self.current_doc["metadata"])
        for text, image, meta in zip(texts, images, metadata):
            if text:
                st.text(f"{text}\n\n")
            elif image:
                st.markdown(f"![img]({meta['src']})\n\n")


if __name__ == "__main__":
    # Configure the page layout before rendering anything.
    st.set_page_config(layout="wide")
    # Dataset location; find at s3://m4-datasets/trash/web_docs_final/
    app = Visualization(path_web_documents_dataset="./web_docs_final")
    app.visualization()