S-Dreamer committed on
Commit c59a42f · verified · 1 Parent(s): 030ec08

Update generation_fast.py

Files changed (1)
  1. generation_fast.py +42 -23
generation_fast.py CHANGED
@@ -1,30 +1,49 @@
-# generation_fast.py
 import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 
-class CodeGenerator:
-    def __init__(self, model_name="S-Dreamer/PyCodeT5"):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model.to(self.device)
+# Load model and tokenizer
+model_name = "your_model_repo"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
-    def generate_code(self, nl_input, max_length=512, num_beams=5, early_stopping=True):
-        inputs = self.tokenizer(nl_input, return_tensors="pt").to(self.device)
-        outputs = self.model.generate(
-            **inputs,
+# Ensure special tokens and preprocessing settings are applied
+if tokenizer.special_tokens_map is None:
+    tokenizer.special_tokens_map = {
+        "bos_token": "<s>",
+        "eos_token": "</s>",
+        "unk_token": "<unk>",
+        "sep_token": "</s>",
+        "pad_token": "<pad>",
+        "cls_token": "<s>",
+        "mask_token": "<mask>"
+    }
+    tokenizer.save_pretrained(model_name)
+
+preprocessor_config = {
+    "do_lower_case": False,
+    "max_length": 128,
+    "truncation": True,
+    "padding": "max_length"
+}
+
+# Define a function for text generation
+def generate_code(prompt, max_length=128, temperature=0.7, top_p=0.9):
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=preprocessor_config["max_length"])
+
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
             max_length=max_length,
-            num_beams=num_beams,
-            early_stopping=early_stopping,
-            no_repeat_ngram_size=2,  # Prevents repetition
-            length_penalty=1.0,  # Adjust length penalty
-            temperature=1.0,  # Adjust temperature for diversity
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True
         )
-        generated_code = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return generated_code
+
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
+# Example usage
 if __name__ == "__main__":
-    generator = CodeGenerator()
-    nl_input = "Write a Python function to reverse a string."
-    generated_code = generator.generate_code(nl_input)
-    print(generated_code)
+    prompt = "def quicksort(arr):"
+    generated_code = generate_code(prompt)
+    print("Generated Code:\n", generated_code)