File size: 4,318 Bytes
e1c118f
 
 
 
aeb0b24
d22aa8c
 
 
357c2b5
 
 
 
f06f7e1
d22aa8c
 
e1c118f
d22aa8c
e1c118f
d22aa8c
e1c118f
 
e5f7335
e1c118f
 
 
 
f5e54ef
e1c118f
 
 
 
 
cb69299
982f5b5
 
8a7055c
67fcf7e
9eb887b
cb69299
 
 
 
 
 
 
 
1dd4e6f
cb69299
9eb887b
cb69299
 
 
5dc6c40
 
cb69299
 
 
5dc6c40
cb69299
d22aa8c
e1c118f
d22aa8c
 
e1c118f
 
 
fe27c61
48c5747
 
e1c118f
 
48c5747
e1c118f
 
71aecdf
 
 
 
357c2b5
71aecdf
 
 
 
cb69299
 
71aecdf
cb69299
e1c118f
62d1d53
e1c118f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
from huggingface_hub import hf_hub_download
import json
import gzip
import urllib

# Per-release lookup tables, keyed by The Stack version tag. Each value is the
# parsed JSON content of that release's username_to_repo.json.gz file.
usernames = {}

# (version tag, dataset revision) pairs, loaded in this order. A revision of
# None is the same as not passing the argument at all.
_STACK_REVISIONS = (("v1.2", "v1.2"), ("v1.1", "v1.1"), ("v1.0", None))

for _tag, _revision in _STACK_REVISIONS:
    filepath = hf_hub_download(
        repo_id="bigcode/the-stack-username-to-repo",
        filename="username_to_repo.json.gz",
        repo_type="dataset",
        revision=_revision,
    )
    # The file is gzip-compressed UTF-8 JSON: read the bytes, then decode.
    with gzip.open(filepath, "rb") as f:
        usernames[_tag] = json.loads(f.read().decode("utf-8"))

# Introductory Markdown rendered at the top of the page: banner image, tagline,
# heading, background paragraph and a short description of the tool.
text = """\
![](https://huggingface.co/spaces/lvwerra/in-the-stack-gr/resolve/main/banner.png)
**_The Stack is an open governance interface between the AI community and the open source community._**

# Am I in The Stack?

As part of the BigCode project, we released and maintain [The Stack](https://huggingface.co/datasets/bigcode/the-stack), a 6 TB dataset of permissively licensed source code over 300 programming languages. One of our goals in this project is to give people agency over their source code by letting them decide whether or not it should be used to develop and evaluate machine learning models, as we acknowledge that not all developers may wish to have their data used for that purpose.

This tool lets you check if a repository under a given username is part of The Stack dataset. Would you like to have your data removed from future versions of The Stack? You can opt-out following the instructions [here](https://www.bigcode-project.org/docs/about/the-stack/#how-can-i-request-that-my-data-be-removed-from-the-stack).
"""

# Markdown/HTML snippet with opt-out instructions. ``{title}`` and ``{body}``
# are filled in with ``str.format``; they are placed inside a URL query string,
# so the values substituted for them must already be percent-encoded.
# The trailing backslashes join the physical lines into a single paragraph.
opt_out_text_template = """\
### Opt-out

If you want your data to be removed from the stack and model training \
open an issue with <a href="https://github.com/bigcode-project/opt-out-v2/issues/new?title={title}&body={body}" target="_blank">this link</a> \
(if the link doesn't work try a right click and open it in a new tab) or visit [https://github.com/bigcode-project/opt-out-v2/issues/new?&template=opt-out-request.md](https://github.com/bigcode-project/opt-out-v2/issues/new?&template=opt-out-request.md) .\
"""

# Title of the pre-filled GitHub issue; ``{username}`` is substituted with
# ``str.format``.
opt_out_issue_title = "Opt-out request for {username}"

# Body of the pre-filled GitHub issue; ``{repo_list}`` is substituted with the
# bullet list of repositories. One string literal per output line.
opt_out_issue_body = (
    "I request that the following data is removed from The Stack:\n"
    "\n"
    " - Commits\n"
    " - GitHub issue\n"
    "{repo_list}\n"
    "\n"
    "_Note_: If you don't want all resources to be included just remove the elements from the list above. If you would like to exclude all repositories and resources just add a single element \"all\" to the list.\n"
)

def issue_url(username, repos):
    """Build the opt-out Markdown snippet with a pre-filled GitHub issue link.

    Args:
        username: GitHub username inserted into the issue title.
        repos: Iterable of repository names; each becomes one `` - name``
            bullet in the issue body. An empty iterable yields no bullets.

    Returns:
        ``opt_out_text_template`` with its ``title`` and ``body`` placeholders
        replaced by the percent-encoded issue title and body.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote

    bullet_list = "\n".join(f" - {repo}" for repo in repos)
    title = quote(opt_out_issue_title.format(username=username))
    body = quote(opt_out_issue_body.format(repo_list=bullet_list))
    return opt_out_text_template.format(title=title, body=body)

def check_username(username, version):
    """Report whether *username* has repositories in a release of The Stack.

    Args:
        username: GitHub username as typed by the user. Surrounding
            whitespace is ignored and ``None`` is treated as empty.
        version: Key into the module-level ``usernames`` mapping
            (for example ``"v1.2"``).

    Returns:
        A ``(result_markdown, opt_out_markdown)`` tuple. The second element
        is the empty string when no repositories are found.

    Raises:
        KeyError: If *version* is not a key of ``usernames``.
    """
    # A pasted username often carries stray spaces or a newline, which would
    # otherwise turn a real hit into a false "No".
    username = (username or "").strip()

    repos = usernames[version].get(username)
    if not repos:
        # Covers both an unknown username and a known one with an empty list.
        return "**No**, your code is not in The Stack.", ""

    repo_word = "repository" if len(repos) == 1 else "repositories"
    header = f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack:"
    # One italic line per repository, separated by blank lines.
    listing = "\n\n".join(f"_{repo}_" for repo in repos)
    return f"{header}\n\n{listing}".strip(), issue_url(username, repos)

# Page layout: one row holding a wide content column between two narrow,
# empty columns. All widgets are created inside the content column.
with gr.Blocks() as demo:
    with gr.Row():
        # Created in this order inside the row: spacer, content, spacer.
        gr.Column(scale=1)
        content_column = gr.Column(scale=6)
        gr.Column(scale=1)

        with content_column:
            gr.Markdown(text)
            version = gr.Dropdown(
                choices=["v1.2", "v1.1", "v1.0"],
                label="The Stack version:",
                value="v1.2",
            )
            username = gr.Text(value="", label="Your GitHub username:")
            check_button = gr.Button("Check!")

            # Output areas filled by check_username: the result message and
            # the opt-out instructions.
            repos = gr.Markdown()
            opt_out = gr.Markdown()

            check_button.click(
                fn=check_username,
                inputs=[username, version],
                outputs=[repos, opt_out],
            )


demo.launch()