Joash committed
Commit 6a725a8 · 1 Parent(s): 1878206

Fix tokenizer initialization and improve error handling

Files changed (1)
app.py +21 -2
app.py CHANGED
@@ -54,12 +54,21 @@ class CodeReviewer:
             login(token=HF_TOKEN, add_to_git_credential=False)
 
             logger.info("Loading tokenizer...")
+            # Initialize tokenizer with special tokens
             self.tokenizer = AutoTokenizer.from_pretrained(
                 MODEL_NAME,
                 token=HF_TOKEN,
                 trust_remote_code=True,
                 cache_dir=CACHE_DIR
             )
+            # Ensure special tokens are set
+            special_tokens = {
+                'pad_token': '[PAD]',
+                'eos_token': '</s>',
+                'bos_token': '<s>'
+            }
+            self.tokenizer.add_special_tokens(special_tokens)
+            logger.info("Tokenizer loaded successfully")
 
             logger.info("Loading model...")
             self.model = AutoModelForCausalLM.from_pretrained(
@@ -71,6 +80,8 @@ class CodeReviewer:
                 cache_dir=CACHE_DIR,
                 token=HF_TOKEN
             )
+            # Resize embeddings for special tokens
+            self.model.resize_token_embeddings(len(self.tokenizer))
             self.device = next(self.model.parameters()).device
             logger.info(f"Model loaded successfully on {self.device}")
         except Exception as e:
@@ -93,6 +104,9 @@ Code:
     @spaces.GPU
     def review_code(self, code: str, language: str) -> str:
         """Perform code review using the model."""
+        if not self.tokenizer or not self.model:
+            return "Error: Model not properly initialized. Please try again later."
+
         try:
             start_time = datetime.now()
             prompt = self.create_review_prompt(code, language)
@@ -105,7 +119,10 @@ Code:
                 truncation=True,
                 max_length=512,
                 padding=True
-            ).to(self.device)
+            )
+            if inputs is None:
+                raise ValueError("Failed to tokenize input")
+            inputs = inputs.to(self.device)
         except Exception as token_error:
             logger.error(f"Tokenization error: {token_error}")
             return "Error: Failed to process input code. Please try again."
@@ -120,7 +137,9 @@ Code:
                 temperature=0.7,
                 top_p=0.95,
                 num_beams=1,
-                early_stopping=True
+                early_stopping=True,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id
             )
         except Exception as gen_error:
             logger.error(f"Generation error: {gen_error}")
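For reference (not part of this commit), a minimal, self-contained sketch of the pattern the change adopts: register special tokens on the tokenizer, resize the model's embedding matrix to match the new vocabulary size, and pass the pad/eos token ids through to generate(). The model name and prompt below are placeholders for illustration, not the MODEL_NAME, HF_TOKEN, or prompt template used in app.py.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder; app.py loads MODEL_NAME with token/cache_dir options

tokenizer = AutoTokenizer.from_pretrained(model_name)
# add_special_tokens registers tokens the tokenizer is missing (here, a pad token)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

model = AutoModelForCausalLM.from_pretrained(model_name)
# Grow the embedding matrix to cover the newly added token ids,
# mirroring the resize_token_embeddings call added in this commit
model.resize_token_embeddings(len(tokenizer))

# Tokenize a toy prompt the same way review_code does (truncation + padding)
inputs = tokenizer(
    "Review this code:\ndef add(a, b):\n    return a + b",
    return_tensors="pt",
    truncation=True,
    max_length=512,
    padding=True,
)

with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Passing pad_token_id/eos_token_id explicitly, as the commit does, avoids the generate() warning when the model config defines no pad token; note that early_stopping generally only affects beam search, so with num_beams=1 it should have no practical effect.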