Update tokenizer

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>

Branch: main

1 changed file with 3 additions and 2 deletions

codebert.py
@@ -74,8 +74,9 @@ class CodeBert(NNOperator):
     def __call__(self, txt: str) -> numpy.ndarray:
         try:
             tokens = self.tokenizer.tokenize(txt)
-            tokens = [tokenizer.cls_token, '<encoder-only>', tokenizer.sep_token] + tokens + [tokenizer.sep_token]
-            tokens_ids = tokenizer.convert_tokens_to_ids(tokens, return_tensors='pt')
+            tokens = [self.tokenizer.cls_token, '<encoder-only>', self.tokenizer.sep_token] + tokens + \
+                     [self.tokenizer.sep_token]
+            tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens, return_tensors='pt')
             inputs = torch.tensor(tokens_ids).to(self.device)
         except Exception as e:
             log.error(f'Invalid input for the tokenizer: {self.model_name}')
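
The substance of the change is a scoping fix: inside __call__, the special-token assembly and the id conversion previously referenced a bare name `tokenizer`, which is undefined in that scope, and the patch routes both through the `self.tokenizer` attribute held by the operator. Below is a minimal sketch of the patched tokenization path, assuming a Hugging Face tokenizer; the class name, checkpoint, and device handling are illustrative stand-ins, not the repository's actual code, and `convert_tokens_to_ids` is called with its plain signature.

    # Minimal sketch of the tokenization path after this commit.
    # Assumptions: a Hugging Face tokenizer, a hypothetical checkpoint name,
    # and a simplified class in place of the real CodeBert NNOperator.
    import torch
    from transformers import AutoTokenizer


    class CodeBertSketch:
        def __init__(self, model_name='microsoft/codebert-base', device='cpu'):
            self.model_name = model_name  # hypothetical checkpoint, for illustration
            self.device = device
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        def __call__(self, txt: str) -> torch.Tensor:
            tokens = self.tokenizer.tokenize(txt)
            # The fix: special tokens come from self.tokenizer, the instance
            # attribute, rather than an undefined local name `tokenizer`.
            tokens = [self.tokenizer.cls_token, '<encoder-only>', self.tokenizer.sep_token] + tokens + \
                     [self.tokenizer.sep_token]
            # convert_tokens_to_ids returns a plain list of ids; wrap it in a
            # batch dimension before moving it to the target device.
            tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            return torch.tensor([tokens_ids]).to(self.device)


    # Usage: tokenize a code snippet into a batched id tensor.
    op = CodeBertSketch()
    print(op('def add(a, b): return a + b').shape)

The '<encoder-only>' mode token is kept verbatim from the diff; whether it is in the chosen checkpoint's vocabulary depends on the model, and an unknown token simply maps to the unk id here.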