Browse Source
Update tokenizer
Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
main
1 changed file with
3 additions and
2 deletions
-
codebert.py
|
|
@@ -74,8 +74,9 @@ class CodeBert(NNOperator): |
|
|
|
def __call__(self, txt: str) -> numpy.ndarray: |
|
|
|
try: |
|
|
|
tokens = self.tokenizer.tokenize(txt) |
|
|
|
tokens = [tokenizer.cls_token, '<encoder-only>', tokenizer.sep_token] + tokens + [tokenizer.sep_token] |
|
|
|
tokens_ids = tokenizer.convert_tokens_to_ids(tokens, return_tensors='pt') |
|
|
|
tokens = [self.tokenizer.cls_token, '<encoder-only>', self.tokenizer.sep_token] + tokens + \ |
|
|
|
[self.tokenizer.sep_token] |
|
|
|
tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens, return_tensors='pt') |
|
|
|
inputs = torch.tensor(tokens_ids).to(self.device) |
|
|
|
except Exception as e: |
|
|
|
log.error(f'Invalid input for the tokenizer: {self.model_name}') |
|
|
|