Update to include gpt2-2b

- evaluate.py +60 -56
- results.json +260 -0

evaluate.py CHANGED

@@ -1,22 +1,29 @@
 import json
+import os
 import random
+import time
 
 import torch
-import time
-import os
 from distributed_training.data.dataset import DataLoader
-from huggingface_hub import list_repo_refs
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import create_tag, list_repo_refs, scan_cache_dir
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 device = "cuda"
 test_indices_length = 1000
 AUTOMATE = True
 
-models = [ …
+models = [
+    "distributed/optimized-gpt2-2b",
+    "distributed/optimized-gpt2-1b",
+    "distributed/optimized-gpt2-500m",
+    "distributed/optimized-gpt2-250m",
+    "distributed/optimized-gpt2-250m-v0.1.3",
+    "distributed/optimized-gpt2-250m-v0.1.1",
+    "distributed/gpt2-94m",
+]
 
 if os.path.exists("results.json"):
-    with open( …
+    with open("results.json", "r") as file:
         results = json.load(file)
 else:
     results = {}
@@ -24,39 +31,34 @@ else:
 while True:
     for model_name in [models[0]]:
 
-        if …
+        if model_name not in results.keys():
             results[model_name] = {}
 
-        tokenizer = AutoTokenizer.from_pretrained( …
+        tokenizer = AutoTokenizer.from_pretrained(
+            "distributed/optimized-gpt2-250m", trust_remote_code=True
+        )
 
         refs = list_repo_refs(model_name, repo_type="model")
         global_epoch = max([int(tag.name) for tag in refs.tags]) if refs.tags else None
 
-        if global_epoch in results[model_name][ …
+        if global_epoch in results[model_name]["main-net"].keys():
             print(f"Results for epoch {global_epoch} already calcualted")
-            time.sleep(30*60)
+            time.sleep(30 * 60)
 
-        for epoch in range(0,global_epoch, 1):
+        for epoch in range(0, global_epoch, 1):
 
-            if str(epoch) in results[model_name][ …
+            if str(epoch) in results[model_name]["main-net"].keys():
                 continue
 
-            model = AutoModelForCausalLM.from_pretrained( …
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name, revision=str(epoch), trust_remote_code=True
+            )
             model = model.to(device)
 
             search_start = random.choice(
-                range(
-                    DataLoader.max_pages
-                    - test_indices_length
-                    + 1
-                )
+                range(DataLoader.max_pages - test_indices_length + 1)
             )
-            group = [
-                i
-                for i in range(
-                    search_start, search_start + test_indices_length
-                )
-            ]
+            group = [i for i in range(search_start, search_start + test_indices_length)]
 
             dataloader = DataLoader(
                 batch_size=1,
@@ -71,7 +73,7 @@ while True:
             inputs = batch[0].to(device)
             labels = batch[1].to(device)
 
-            if …
+            if len(inputs[0]) != len(labels[0]):
                 breakpoint()
             if "optimized" in model_name:
                 outputs = model(input_ids=inputs, labels=labels)
@@ -86,37 +88,39 @@ while True:
             # Backward Pass
             model.zero_grad()
 
-            average_loss = total_loss / (index+1)
-            results[model_name][ …
+            average_loss = total_loss / (index + 1)
+            results[model_name]["main-net"][str(epoch)] = [average_loss]
             print(f"Epoch: {epoch} Average Loss: {average_loss:.2f}")
 
             with open("results.json", "w") as outfile:
-                json.dump(results, outfile, indent …
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                json.dump(results, outfile, indent=4)
+
+            current_revision = model.config._commit_hash
+            keep_recent = 1
+            try:
+                cache_info = scan_cache_dir()
+                for repo in cache_info.repos:
+                    if repo.repo_id == model_name:
+                        revisions = sorted(
+                            repo.revisions, key=lambda r: r.last_modified, reverse=True
+                        )
+                        current_index = next(
+                            (
+                                i
+                                for i, r in enumerate(revisions)
+                                if r.commit_hash == current_revision
+                            ),
+                            None,
+                        )
+                        if current_index is not None:
+                            for revision in revisions[
+                                max(current_index + 1, keep_recent) :
+                            ]:
+                                cache_info.delete_revisions(
+                                    revision.commit_hash
+                                ).execute()
+                            break
+            except:
+                print(
+                    "Failed to delete previous model version from cache. This might lead to 100% disk space utlisation in the future."
+                )

results.json CHANGED

@@ -1,4 +1,264 @@
 {
+    "distributed/optimized-gpt2-2b": {
+        "main-net": {
+            "0": [
+                24.161225109466358
+            ],
+            "1": [
+                10.691863035173064
+            ],
+            "2": [
+                10.022417357756433
+            ],
+            "3": [
+                9.55640449560465
+            ],
+            "4": [
+                9.58835850097239
+            ],
+            "5": [
+                9.462455684855833
+            ],
+            "6": [
+                9.277088693210057
+            ],
+            "7": [
+                9.301309664550528
+            ],
+            "8": [
+                9.130553578411487
+            ],
+            "9": [
+                9.198034809787515
+            ],
+            "10": [
+                9.150826927009486
+            ],
+            "11": [
+                9.101872412292888
+            ],
+            "12": [
+                9.037806239881014
+            ],
+            "13": [
+                8.92622016663717
+            ],
+            "14": [
+                8.890519196425027
+            ],
+            "15": [
+                8.851365919739123
+            ],
+            "16": [
+                8.86057827515688
+            ],
+            "17": [
+                8.770904886188791
+            ],
+            "18": [
+                8.787317971086813
+            ],
+            "19": [
+                8.762063222648823
+            ],
+            "20": [
+                8.691791485353338
+            ],
+            "21": [
+                8.612739718558705
+            ],
+            "22": [
+                8.662117434136661
+            ],
+            "23": [
+                8.569304224873378
+            ],
+            "24": [
+                8.508418809899077
+            ],
+            "25": [
+                8.416297421540703
+            ],
+            "26": [
+                8.395312497974823
+            ],
+            "27": [
+                8.361652030098822
+            ],
+            "28": [
+                8.309751656976077
+            ],
+            "29": [
+                8.271234605559991
+            ],
+            "30": [
+                8.302275388588832
+            ],
+            "31": [
+                8.172970907627247
+            ],
+            "32": [
+                8.112572425603867
+            ],
+            "33": [
+                0.0
+            ],
+            "34": [
+                8.067226740030142
+            ],
+            "35": [
+                8.015923105084333
+            ],
+            "36": [
+                8.000407182927034
+            ],
+            "37": [
+                7.897538427511851
+            ],
+            "38": [
+                7.8652859703003175
+            ],
+            "39": [
+                7.817014323654024
+            ],
+            "40": [
+                7.807054872649335
+            ],
+            "41": [
+                7.827541650510302
+            ],
+            "42": [
+                7.689037536915112
+            ],
+            "43": [
+                7.757941870595895
+            ],
+            "44": [
+                7.804858555885658
+            ],
+            "45": [
+                7.6064825819472395
+            ],
+            "46": [
+                7.611153989357136
+            ],
+            "47": [
+                7.59192221113976
+            ],
+            "48": [
+                7.578028715109523
+            ],
+            "49": [
+                7.535055926722339
+            ],
+            "50": [
+                7.4285404285591445
+            ],
+            "51": [
+                7.508890847739933
+            ],
+            "52": [
+                7.594857940802703
+            ],
+            "53": [
+                7.512502627618094
+            ],
+            "54": [
+                7.506787989576394
+            ],
+            "55": [
+                7.501947044107324
+            ],
+            "56": [
+                7.429504378702631
+            ],
+            "57": [
+                7.372085496371972
+            ],
+            "58": [
+                7.408436578101554
+            ],
+            "59": [
+                7.408653273726955
+            ],
+            "60": [
+                7.3867659356859
+            ],
+            "61": [
+                7.328268373037534
+            ],
+            "62": [
+                7.374929182813982
+            ],
+            "63": [
+                7.309664613777591
+            ],
+            "64": [
+                7.282248006827795
+            ],
+            "65": [
+                7.386888501138398
+            ],
+            "66": [
+                7.2420131648637325
+            ],
+            "67": [
+                7.3391031794848
+            ],
+            "68": [
+                7.266478459521978
+            ],
+            "69": [
+                7.2372944774106145
+            ],
+            "70": [
+                7.293267532594487
+            ],
+            "71": [
+                7.174058415324812
+            ],
+            "72": [
+                7.300561785442671
+            ],
+            "73": [
+                7.2531355329462
+            ],
+            "74": [
+                7.176742718436501
+            ],
+            "75": [
+                7.150713069236231
+            ],
+            "76": [
+                7.181416940589538
+            ],
+            "77": [
+                7.206587009709836
+            ],
+            "78": [
+                7.08934546457475
+            ],
+            "79": [
+                7.042178735546037
+            ],
+            "80": [
+                7.034408964761874
+            ],
+            "81": [
+                7.1328608132201765
+            ],
+            "82": [
+                7.020384328287156
+            ],
+            "83": [
+                6.989416580784077
+            ],
+            "84": [
+                7.075146196260734
+            ]
+        },
+        "baseline": {}
+    },
     "distributed/optimized-gpt2-1b": {
         "main-net": {
             "0": [
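
For reference, a short way to read these numbers back out of results.json; a sketch assuming the file layout shown above, where each epoch maps to a one-element list of average losses and the 0.0 recorded at epoch "33" looks like a failed run, so it is skipped.

import json

with open("results.json") as f:
    results = json.load(f)

losses = {
    int(epoch): values[0]
    for epoch, values in results["distributed/optimized-gpt2-2b"]["main-net"].items()
    if values and values[0] > 0.0  # epoch "33" recorded 0.0; skip it
}
best_epoch = min(losses, key=losses.get)
print(f"Best epoch: {best_epoch} (loss {losses[best_epoch]:.3f})")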