logo
Browse Source

Update tokenizer

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
main
Jael Gu 2 years ago
parent
commit
5d025f739b
  1. 5
      codebert.py

5
codebert.py

@@ -74,8 +74,9 @@ class CodeBert(NNOperator):
def __call__(self, txt: str) -> numpy.ndarray: def __call__(self, txt: str) -> numpy.ndarray:
try: try:
tokens = self.tokenizer.tokenize(txt) tokens = self.tokenizer.tokenize(txt)
tokens = [tokenizer.cls_token, '<encoder-only>', tokenizer.sep_token] + tokens + [tokenizer.sep_token]
tokens_ids = tokenizer.convert_tokens_to_ids(tokens, return_tensors='pt')
tokens = [self.tokenizer.cls_token, '<encoder-only>', self.tokenizer.sep_token] + tokens + \
[self.tokenizer.sep_token]
tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens, return_tensors='pt')
inputs = torch.tensor(tokens_ids).to(self.device) inputs = torch.tensor(tokens_ids).to(self.device)
except Exception as e: except Exception as e:
log.error(f'Invalid input for the tokenizer: {self.model_name}') log.error(f'Invalid input for the tokenizer: {self.model_name}')

Loading…
Cancel
Save