# opus-mt/opus_mt.py

# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from pathlib import Path

import torch
from torchvision import transforms

from towhee.types.image_utils import to_pil
from towhee.operator.base import NNOperator, OperatorFlag
from towhee.types.arg import arg, to_image_color
from towhee import register

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class OpusMT(NNOperator):
    """
    Opus-mt machine translation.

    Loads a tokenizer and a seq2seq model via ``transformers`` for one of the
    supported Helsinki-NLP identifiers and translates text with them.

    Args:
        model_name (str): key of one of the entries returned by
            ``configs()``, e.g. ``'opus-mt-en-zh'``.

    Raises:
        KeyError: if ``model_name`` is not one of the supported models.
    """

    # Short names of the supported models. For each of them the tokenizer and
    # the model are both loaded from the identifier 'Helsinki-NLP/<name>'.
    _MODEL_NAMES = (
        'opus-mt-en-zh',
        'opus-mt-zh-en',
        'opus-tatoeba-en-ja',
        'opus-tatoeba-ja-en',
        'opus-mt-ru-en',
        'opus-mt-en-ru',
    )

    def __init__(self, model_name: str):
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        supported = self.configs()
        try:
            config = supported[model_name]
        except KeyError:
            # Same exception type as before, but tell the caller what is valid.
            raise KeyError(
                f"unsupported model_name {model_name!r}; "
                f"expected one of {sorted(supported)}"
            ) from None

        self.tokenizer = AutoTokenizer.from_pretrained(config['tokenizer'])
        self.model = AutoModelForSeq2SeqLM.from_pretrained(config['model'])
        self.model.to(self.device)

    def __call__(self, text):
        """
        Translate ``text`` and return the decoded result.

        Args:
            text: input handed to the tokenizer (presumably a single ``str``;
                confirm against callers).

        Returns:
            The first generated sequence decoded to text with special tokens
            skipped. Only ``outputs[0]`` is decoded, so if several sequences
            are generated the remaining ones are not returned.
        """
        encoded = self.tokenizer(text, return_tensors='pt', padding=True)
        input_ids = encoded['input_ids'].to(self.device)

        # Forward the tokenizer's attention mask (when it produced one) so
        # that padded positions are explicitly masked during generation
        # instead of leaving generate() to infer the mask from input_ids.
        attention_mask = encoded.get('attention_mask')
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)

        outputs = self.model.generate(input_ids, attention_mask=attention_mask)
        decoded = self.tokenizer.decode(outputs[0].detach().cpu(), skip_special_tokens=True)
        return decoded

    def configs(self):
        """
        Return the table of supported models.

        Returns:
            dict: maps each supported model name to a dict with the keys
            ``'tokenizer'`` and ``'model'``, both holding the identifier
            ``'Helsinki-NLP/<name>'`` that is passed to ``from_pretrained``.
        """
        configs = {}
        for name in self._MODEL_NAMES:
            identifier = 'Helsinki-NLP/' + name
            configs[name] = {'tokenizer': identifier, 'model': identifier}
        return configs
init the operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`# Copyright 2021 Zilliz. All rights reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import sys`
			`from pathlib import Path`

			`import torch`
			`from torchvision import transforms`

			`from towhee.types.image_utils import to_pil`
			`from towhee.operator.base import NNOperator, OperatorFlag`
			`from towhee.types.arg import arg, to_image_color`
			`from towhee import register`

			`from transformers import AutoTokenizer, AutoModelForSeq2SeqLM`

update the opus-mt operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`class OpusMT(NNOperator):`
init the operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`"""`
			`Opus-mt machine translation`
			`"""`
			`def __init__(self, model_name: str):`
			`super().__init__()`
			`self.device = "cuda" if torch.cuda.is_available() else "cpu"`
update the opus-mt operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`config = self.configs()[model_name]`
init the operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`self.tokenizer = AutoTokenizer.from_pretrained(config['tokenizer'])`
			`self.model = AutoModelForSeq2SeqLM.from_pretrained(config['model'])`
			`self.model.to(self.device)`

update the operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`def __call__(self, text):`
			`input_ids = self.tokenizer(text, return_tensors='pt', padding=True)['input_ids'].to(self.device)`
update the opus-mt operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`outputs = self.model.generate(input_ids)`
			`decoded = self.tokenizer.decode(outputs[0].detach().cpu(), skip_special_tokens=True)`
init the operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`return decoded`
update the opus-mt operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago
			`def configs(self):`
			`configs = {}`
			`configs['opus-mt-en-zh'] = {}`
			`configs['opus-mt-en-zh']['tokenizer'] = 'Helsinki-NLP/opus-mt-en-zh'`
			`configs['opus-mt-en-zh']['model'] = 'Helsinki-NLP/opus-mt-en-zh'`

			`configs['opus-mt-zh-en'] = {}`
			`configs['opus-mt-zh-en']['tokenizer'] = 'Helsinki-NLP/opus-mt-zh-en'`
			`configs['opus-mt-zh-en']['model'] = 'Helsinki-NLP/opus-mt-zh-en'`
add en-ja model. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago
			`configs['opus-mt-zh-en'] = {}`
			`configs['opus-mt-zh-en']['tokenizer'] = 'Helsinki-NLP/opus-mt-zh-en'`
			`configs['opus-mt-zh-en']['model'] = 'Helsinki-NLP/opus-mt-zh-en'`

			`configs['opus-tatoeba-en-ja'] = {}`
			`configs['opus-tatoeba-en-ja']['tokenizer'] = 'Helsinki-NLP/opus-tatoeba-en-ja'`
			`configs['opus-tatoeba-en-ja']['model'] = 'Helsinki-NLP/opus-tatoeba-en-ja'`

			`configs['opus-tatoeba-ja-en'] = {}`
			`configs['opus-tatoeba-ja-en']['tokenizer'] = 'Helsinki-NLP/opus-tatoeba-ja-en'`
			`configs['opus-tatoeba-ja-en']['model'] = 'Helsinki-NLP/opus-tatoeba-ja-en'`
add English Russian model. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago
			`configs['opus-mt-ru-en'] = {}`
			`configs['opus-mt-ru-en']['tokenizer'] = 'Helsinki-NLP/opus-mt-ru-en'`
			`configs['opus-mt-ru-en']['model'] = 'Helsinki-NLP/opus-mt-ru-en'`

			`configs['opus-mt-en-ru'] = {}`
			`configs['opus-mt-en-ru']['tokenizer'] = 'Helsinki-NLP/opus-mt-en-ru'`
			`configs['opus-mt-en-ru']['model'] = 'Helsinki-NLP/opus-mt-en-ru'`

update the opus-mt operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`return configs`