File size: 3,840 Bytes
6849ffb
 
 
 
 
 
2c14e84
6849ffb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e9ab841
6849ffb
 
 
 
 
e9ab841
6849ffb
 
 
 
 
 
 
 
 
e9ab841
6849ffb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import re

import gradio as gr
from huggingface_hub import get_collection

# Pin the collection this app browses. The write is unconditional, so it
# replaces any COLLECTION_SLUG_OR_URL value already present in the environment.
os.environ.update(
    {
        "COLLECTION_SLUG_OR_URL": "https://huggingface.co/collections/PleIAs/finance-commons-66925e1095c7fa6e6828e26c"
    }
)

def extract_collection_id(input_text):
    """Normalise a collection URL or slug to the bare ``owner/slug`` id.

    Accepts either a full ``https://huggingface.co/collections/<owner>/<slug>``
    URL or an ``<owner>/<slug>`` string. Surrounding whitespace is ignored,
    and for URLs any query string, fragment or trailing slash is dropped.

    Args:
        input_text: The URL or slug to normalise.

    Returns:
        The ``owner/slug`` id, or ``None`` when the input does not reduce to
        exactly two ``/``-separated parts made of word characters and dashes.
    """
    url_prefix = "https://huggingface.co/collections/"
    candidate = input_text.strip()

    if candidate.startswith(url_prefix):
        candidate = candidate[len(url_prefix):]
        # Keep only the path part: a pasted link may carry "?..." or "#...".
        candidate = candidate.split("?", 1)[0].split("#", 1)[0].rstrip("/")

    # fullmatch (rather than a prefix match) so trailing junk is rejected
    # instead of being passed through as part of the id.
    return candidate if re.fullmatch(r"[\w-]+/[\w-]+", candidate) else None


def load_collection():
    """Resolve the configured collection and list the datasets it contains.

    Reads ``COLLECTION_SLUG_OR_URL`` from the environment, normalises it with
    ``extract_collection_id`` and fetches the collection with
    ``get_collection``.

    Returns:
        A ``(dataset_ids, collection_id)`` tuple, where ``dataset_ids`` holds
        the ``item_id`` of every collection item whose ``item_type`` is
        ``"dataset"``.

    Raises:
        ValueError: If the variable is unset or empty, if it cannot be turned
            into a collection id, or if the collection has no dataset items.
    """
    raw_setting = os.getenv("COLLECTION_SLUG_OR_URL")
    if not raw_setting:
        raise ValueError("COLLECTION_SLUG_OR_URL environment variable is not set.")

    slug = extract_collection_id(raw_setting)
    if not slug:
        raise ValueError(
            "Invalid collection ID or URL in COLLECTION_SLUG_OR_URL environment variable."
        )

    fetched = get_collection(slug)
    found = [entry.item_id for entry in fetched.items if entry.item_type == "dataset"]
    if not found:
        raise ValueError("No datasets found in this collection.")
    return found, slug


def display_dataset(dataset_ids, index):
    """Build the embedded dataset viewer for one entry of ``dataset_ids``.

    Args:
        dataset_ids: Sequence of dataset ids.
        index: Position in ``dataset_ids`` of the dataset to show.

    Returns:
        A ``gr.HTML`` component wrapping an ``<iframe>`` that points at the
        dataset's ``/embed/viewer`` page on huggingface.co.
    """
    viewer_markup = f"""<iframe
    src="https://huggingface.co/datasets/{dataset_ids[index]}/embed/viewer"
    frameborder="0"
    width="100%"
    height="560px"
></iframe>"""
    return gr.HTML(viewer_markup)


def navigate_dataset(dataset_ids, index, direction):
    """Step the current position through ``dataset_ids``, wrapping at the ends.

    Args:
        dataset_ids: Non-empty sequence of dataset ids.
        index: Current position.
        direction: Step to apply (e.g. ``-1`` for previous, ``1`` for next).

    Returns:
        A ``(new_index, label)`` tuple: the new integer position and a
        "Dataset i of n: <id>" label using a 1-based position.
    """
    total = len(dataset_ids)
    # The step may arrive as a float (e.g. from a numeric input component);
    # coerce to int so the result can index the sequence and the label does
    # not read "2.0".
    new_index = int(index + direction) % total
    return (
        new_index,
        f"Dataset {new_index + 1} of {total}: {dataset_ids[new_index]}",
    )


def get_display_name(collection_id):
    """Strip a trailing ``-<hex id>`` suffix from a collection id.

    The suffix must be a dash followed by at least 16 lowercase hexadecimal
    characters at the end of the string. Ids without such a suffix are
    returned unchanged.
    """
    found = re.match(r"^(.+?)-([a-f0-9]{16,})$", collection_id)
    if found is None:
        # No hex suffix present: nothing to strip.
        return collection_id
    return found.group(1)


# Build the Gradio UI at import time. Everything, including the launch call,
# sits inside one try block, so any exception raised while loading the
# collection, constructing the UI or launching is reported by the handler at
# the bottom instead of propagating.
try:
    dataset_ids, collection_id = load_collection()
    # NOTE(review): display_name is not referenced anywhere below; the page
    # heading is hard-coded instead.
    display_name = get_display_name(collection_id)

    with gr.Blocks() as demo:
        # NOTE(review): this f-string has no placeholders.
        gr.Markdown(f"<h1>Finance Commons</h1>")
        gr.Markdown(
            f"[View full collection on Hugging Face](https://huggingface.co/collections/{collection_id})"
        )

        gr.Markdown("""
        This app allows you to browse Finance Commons, the largest collection of open datasets on finance documents.""")

        # Position of the dataset currently shown; starts at the first one.
        index_state = gr.State(value=0)

        with gr.Row():
            left_btn = gr.Button("Previous")
            right_btn = gr.Button("Next")

        dataset_info = gr.Markdown(f"Dataset 1 of {len(dataset_ids)}: {dataset_ids[0]}")
        iframe_output = gr.HTML()
        gr.Markdown("""**Note**: This space currently mostly supports text formats.
        """)
        # Both buttons call navigate_dataset; the step (-1 or +1) is supplied
        # through an invisible Number component created inline. The handler's
        # outputs update the stored index and the "Dataset i of n" label.
        left_btn.click(
            navigate_dataset,
            inputs=[gr.State(dataset_ids), index_state, gr.Number(-1, visible=False)],
            outputs=[index_state, dataset_info],
        )
        right_btn.click(
            navigate_dataset,
            inputs=[gr.State(dataset_ids), index_state, gr.Number(1, visible=False)],
            outputs=[index_state, dataset_info],
        )

        # Re-render the embedded viewer from the stored index via the State
        # component's change event.
        index_state.change(
            display_dataset,
            inputs=[gr.State(dataset_ids), index_state],
            outputs=[iframe_output],
        )

        # Initialize the display with the first dataset
        demo.load(
            fn=lambda: display_dataset(dataset_ids, 0),
            inputs=None,
            outputs=[iframe_output],
        )

    # Only start the server when this file is executed as a script.
    if __name__ == "__main__":
        demo.launch()

except Exception as e:
    # Catch-all for the whole block above: the hint about the environment
    # variable is printed for every failure, whatever its actual cause.
    print(f"Error: {str(e)}")
    print(
        "Please set the COLLECTION_SLUG_OR_URL environment variable with a valid collection ID or URL."
    )