MasaakiKotera committed
Commit 8f79412 · 1 Parent(s): 106d320
Upload sampling.py with huggingface_hub
Browse files · sampling.py +54 -16

sampling.py CHANGED
@@ -23,6 +23,7 @@ parser.add_argument("--out_path", type=str, required=True)
 parser.add_argument("--num_samples", type=int, required=False, default=100000)
 parser.add_argument("--max_new_tokens", type=int, required=True, help="number of tokens generated in each sample")
 parser.add_argument("--strategy",type=str, required=False,default='top_k',help="should be in ['greedy_search', 'sampling', 'top_k', 'beam_search']")
+parser.add_argument("--beam_size",type=int, required=False,default=3,help="beam size for beam search")
 parser.add_argument("--temperature",type=float, required=False,default=1.0,help="1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions")
 parser.add_argument("--top_k",type=int, required=False,default=20,help="retain only the top_k most likely tokens, clamp others to have 0 probability")
 parser.add_argument("--ckpt_path",type=str, required=True,help="path to a checkpoint/model")
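A note on the two decoding knobs adjacent to this hunk: temperature rescales the next-token logits before softmax, and top_k clamps every token outside the k most likely to zero probability, exactly as the help strings describe. model.generate itself is not shown in this diff, so the following is only a minimal sketch of the conventional mechanics; sample_next_token and its signature are illustrative, not part of sampling.py.

import torch
import torch.nn.functional as F

def sample_next_token(logits: torch.Tensor, temperature: float = 1.0, top_k: int = 20) -> int:
    # < 1.0 sharpens the distribution (less random), > 1.0 flattens it (more random)
    logits = logits / temperature
    # retain only the top_k most likely tokens; the rest get probability 0
    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
    logits = logits.masked_fill(logits < v[-1], float('-inf'))
    probs = F.softmax(logits, dim=-1)
    return int(torch.multinomial(probs, num_samples=1))

With temperature=1.0 and top_k spanning the whole vocabulary this reduces to plain ancestral sampling, which is presumably what the 'sampling' strategy selects.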
@@ -30,6 +31,7 @@ parser.add_argument("--tokenizer_path",type=str, required=True,help="path to a t
 parser.add_argument("--start",type=str, required=False,default="<|endoftext|>")
 parser.add_argument("--repetition_penalty",type=float, required=False,default=1.0)
 parser.add_argument("--shuffle_token", action='store_true', help="Enable shuffling of tokens before decoding")
+parser.add_argument("--fasta", action='store_true', default=True, help="Enable writing output in FASTA format")
 
 args = parser.parse_args()
 init_from = args.init_from
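The script passes repetition_penalty straight through to model.generate without showing how it is applied. A common convention (introduced with CTRL) divides the logit of every already-generated token by the penalty, so values above 1.0 discourage repeats; whether this script follows that convention is not visible here, so the sketch below is an assumption, and apply_repetition_penalty is an illustrative name.

import torch

def apply_repetition_penalty(logits: torch.Tensor, generated: list, penalty: float) -> torch.Tensor:
    for tok in set(generated):
        # shrink positive logits and amplify negative ones, so tokens that
        # already appeared score worse whenever penalty > 1.0
        if logits[tok] > 0:
            logits[tok] /= penalty
        else:
            logits[tok] *= penalty
    return logits

With the default penalty of 1.0 this is a no-op, matching the argument's default.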
@@ -37,17 +39,20 @@ out_path = args.out_path
 num_samples = args.num_samples
 max_new_tokens = args.max_new_tokens
 strategy = args.strategy
+assert strategy in ['greedy_search', 'sampling', 'top_k', 'beam_search']
+beam_size = args.beam_size
 temperature = args.temperature
 top_k = args.top_k
 ckpt_path = args.ckpt_path
 tokenizer_path = args.tokenizer_path
 start = args.start
 repetition_penalty = args.repetition_penalty
+fasta = args.fasta
+
 
 # -----------------------------------------------------------------------------
 seed = random.randint(1,6666)
-
-device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+device = 'cuda'
 dtype = 'float32'
 # dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
 compile = False # use PyTorch 2.0 to compile the model to be faster
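The new --beam_size argument and the assert above feed a beam_search path whose implementation lives in model.generate, not in this file. For reference, here is a generic sketch of beam search over log-probabilities; step_logprobs (a callable returning a 1-D tensor of next-token log-probabilities for a token sequence) and every other name here are assumed interfaces, not the project's API.

import torch

def beam_search(step_logprobs, start_ids, max_new_tokens, beam_size=3):
    beams = [(list(start_ids), 0.0)]      # (sequence, cumulative log-prob)
    for _ in range(max_new_tokens):
        candidates = []
        for seq, score in beams:
            logprobs = step_logprobs(seq)             # shape: (vocab_size,)
            top = torch.topk(logprobs, beam_size)
            for lp, tok in zip(top.values.tolist(), top.indices.tolist()):
                candidates.append((seq + [tok], score + lp))
        # keep only the beam_size best-scoring continuations
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]
    return beams[0][0]                    # highest-scoring sequence

Unlike sampling, this search is deterministic for a fixed model, which is consistent with the hunk below generating a single sequence for beam_search/greedy_search rather than looping num_samples times.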
@@ -91,20 +96,53 @@ load_meta = False
 encode = tokenizer.encode
 decode = tokenizer.decode
 
-
-
+fasta_out_path = os.path.splitext(out_path)[0] + ".fasta" if fasta else None
+
+if strategy in["sampling", "top_k"]:
+    start_ids = encode("".join(start))
+    x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+
+
+    with open(out_path, 'a') as f:
+        with open(fasta_out_path, 'a') if fasta else nullcontext() as fasta_f:
+            with torch.no_grad():
+                with ctx:
+                    for k in tqdm(range(num_samples), desc="Generating samples"):
+                        token_sequence = model.generate(x, max_new_tokens, strategy=strategy, temperature=temperature, top_k=top_k, repetition_penalty=repetition_penalty)[0].tolist()
+
+                        # Shuffle tokens if --shuffle_token is specified
+                        if args.shuffle_token:
+                            random.shuffle(token_sequence)
+
+                        y = decode(token_sequence).replace(' ', '')
+                        # y = decode(token_sequence).replace('\n', '').replace(' ', '') + '\n'
+                        f.write(y)
+                        f.flush()
+
+
+                        if fasta:
+                            fasta_entry = f">sample_{k}\n{y.replace(' ', '')}\n"
+                            fasta_f.write(fasta_entry.strip() + '\n')
+                            fasta_f.flush()
+
+
+elif strategy in ["beam_search", "greedy_search"]:
+    with open(out_path, 'a') as f:
+        with open(fasta_out_path, 'a') if fasta else nullcontext() as fasta_f:
+            with torch.no_grad():
+                with ctx:
+                    start = '<|endoftext|>'
+                    start_ids = encode(start)
+                    x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+
+                    token_sequence = model.generate(x, max_new_tokens, strategy=strategy, temperature=temperature, top_k=top_k, repetition_penalty=repetition_penalty, beam_size=beam_size)[0].tolist()
 
+                    y = decode(token_sequence).replace(' ', '')
+                    f.write(y)
+                    f.flush()
 
-with open(out_path, 'a') as f:
-    with torch.no_grad():
-        with ctx:
-            for k in tqdm(range(num_samples), desc="Generating samples"):
-                token_sequence = model.generate(x, max_new_tokens, strategy=strategy, temperature=temperature, top_k=top_k, repetition_penalty=repetition_penalty)[0].tolist()
-
-                # Shuffle tokens if --shuffle_token is specified
-                if args.shuffle_token:
-                    random.shuffle(token_sequence)
 
-
-
-
+                    if fasta:
+                        fasta_entry = f">sample_{k}\n{y.replace(' ', '')}\n"
+                        fasta_f.write(fasta_entry.strip() + '\n')
+                        fasta_f.flush()
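Three details of this hunk are worth flagging. First, with open(fasta_out_path, 'a') if fasta else nullcontext() as fasta_f: requires from contextlib import nullcontext, and nullcontext() binds fasta_f to None when FASTA output is off. Second, --fasta combines action='store_true' with default=True, so the flag can never actually be turned off from the command line. Third, in the beam_search/greedy_search branch fasta_entry interpolates k, which is only bound by the for-loop of the sampling branch, so reaching that line raises a NameError. A self-contained sketch of the optional-file pattern, with write_fasta and maybe_path as hypothetical names not taken from sampling.py:

from contextlib import nullcontext

def write_fasta(maybe_path, header, sequence):
    # open the file only when a path is given; nullcontext() yields None
    with (open(maybe_path, 'a') if maybe_path else nullcontext()) as fh:
        if fh is not None:
            fh.write(f">{header}\n{sequence}\n")   # one FASTA record
            fh.flush()

write_fasta("samples.fasta", "sample_0", "MKTAYIAKQR")  # appends one record
write_fasta(None, "sample_0", "MKTAYIAKQR")             # no-op when disabled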