Spaces:

CarperAI
/

pilev2_pipeline

Runtime error

App Files Files Community

reshinthadith committed on Nov 28, 2022

Commit

675e604

1 Parent(s): cf5eed6

Fix dataset loading bug

Browse files

Files changed (1) hide show

app.py +203 -177

app.py CHANGED Viewed

@@ -3,185 +3,211 @@ import matplotlib.pyplot as plt
 import numpy as np
 from functools import partial
-# ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code")
-# amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS")
-# apache_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/ASFPublicMail")
-# books3_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Books3")
-# cp_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/CPDataset")
-# dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/DMMath")
-# discourse_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Discourse")
-# wiki_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Enwiki")
-# euro_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/EuroParliamentProceedings")
-# freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/FreeLaw_Options")
-# ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubDiff")
-# ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubIssues")
-# gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Gutenberg")
-# leet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/LeetCode")
-# pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PileOfLaw")
-# pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PubMed")
-# s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/S2ORC")
-# se_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/StackExchange")
-# usenet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USENET")
-# uspto_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USPTO")
-# ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/UbuntuIRC")
-# arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/arXiv")
 dataset_data = {
-    "AI4Code": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "AMPS": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "ASFPublicMail": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "Books3": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "CPDataset": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "DMMath": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "Discourse": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "Enwiki": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "EuroParliamentProceedings": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "FreeLaw_Options": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "GitHubDiff": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "GitHubIssues": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "Gutenberg": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "LeetCode": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "PileOfLaw": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "PubMed": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "S2ORC": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "StackExchange": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "USENET": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "USPTO": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "UbuntuIRC": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-    "arXiv": {
-        # create fake data for the different ratios
-        "word_rep_ratios": np.random.randn(1000),
-        "char_rep_ratios": np.random.randn(1000),
-        "flagged_word_ratios": np.random.randn(1000),
-        "num_words": np.random.randint(0, 1000, 1000),
-    },
-}
 def plt_plot(ratio, dataset, threshold):
     x = dataset_data[dataset][ratio]

 import numpy as np
 from functools import partial
+# Load every pile-v2-small subset from the hub. Each call targets a single JSON
+# file inside the dataset repo, so the path is passed through `data_files`
+# (`data_dir` is meant for a directory, not a file path).
+ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/AI4Code/data.json")
+amps_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/AMPS/data.json")
+apache_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/ASFPublicMail/data.json")
+books3_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/Books3/data.json")
+cp_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/CPDataset/data.json")
+dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/DMMath/data.json")
+discourse_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/Discourse/data.json")
+wiki_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/Enwiki/data.json")
+euro_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/EuroParliamentProceedings/data.json")
+freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/FreeLaw_Options/data.json")
+ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/GitHubDiff/data.json")
+ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/GitHubIssues/data.json")
+gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/Gutenberg/data.json")
+leet_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/LeetCode/data.json")
+pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/PileOfLaw/data.json")
+pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/PubMed/data.json")
+s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/S2ORC/data.json")
+se_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/StackExchange/data.json")
+usenet_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/USENET/data.json")
+uspto_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/USPTO/data.json")
+ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/UbuntuIRC/data.json")
+arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/arXiv/data.json")
+# Lookup table from short subset name to that subset's "train" split; plt_plot
+# indexes it as dataset_data[dataset][ratio].
 dataset_data = {
+    "ai4code": ai4code_ds["train"],
+    "amps": amps_ds["train"],
+    "apache": apache_ds["train"],
+    "books3": books3_ds["train"],
+    "competitive_programming": cp_ds["train"],
+    "dmmath": dmmath_ds["train"],
+    "discourse": discourse_ds["train"],
+    "enwiki": wiki_ds["train"],
+    "euro": euro_ds["train"],
+    "freelaw": freelaw_ds["train"],
+    "ghdiffs": ghdiffs_ds["train"],
+    "ghissues": ghissues_ds["train"],
+    "gutenberg": gutenberg_ds["train"],
+    "leetcode": leet_ds["train"],
+    "pileoflaw": pileoflaw_ds["train"],
+    "pubmed": pubmed_ds["train"],
+    "s2orc": s2orc_ds["train"],
+    "se": se_ds["train"],
+    "usenet": usenet_ds["train"],
+    "uspto": uspto_ds["train"],
+    "ubuntuirc": ubuntuirc_ds["train"],
+    "arxiv": arxiv_ds["train"],
+}
+# dataset_data = {
+#     "AI4Code": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "AMPS": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "ASFPublicMail": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "Books3": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "CPDataset": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "DMMath": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "Discourse": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "Enwiki": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "EuroParliamentProceedings": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "FreeLaw_Options": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "GitHubDiff": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "GitHubIssues": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "Gutenberg": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "LeetCode": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "PileOfLaw": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "PubMed": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "S2ORC": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "StackExchange": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "USENET": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "USPTO": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "UbuntuIRC": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+#     "arXiv": {
+#         # create fake data for the different ratios
+#         "word_rep_ratios": np.random.randn(1000),
+#         "char_rep_ratios": np.random.randn(1000),
+#         "flagged_word_ratios": np.random.randn(1000),
+#         "num_words": np.random.randint(0, 1000, 1000),
+#     },
+# }
 def plt_plot(ratio, dataset, threshold):
     x = dataset_data[dataset][ratio]