Update CLIP operator: load the model and implement image/text embedding.

Signed-off-by: wxywb <xy.wang@zilliz.com>
4 years ago · 3dc67453f0
4 changed files with 47 additions and 18 deletions
--- a/init.py
+++ b/init.py
@ -1 +1,22 @@
-from .clip import *
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .clip import Clip
+
+def dolg(img_size=512, input_dim=3, hidden_dim=1024, output_dim=2048):
+    # NOTE(review): `Dolg` is never imported or defined in this module (only
+    # `Clip` is imported), so calling dolg() raises NameError. Presumably a
+    # leftover copied from another operator's package init -- confirm and remove.
+    return Dolg(img_size, input_dim, hidden_dim, output_dim)
+
+
+def clip(name: str, modality: str):
+    """Return a Clip operator created from `name` and `modality`."""
+    operator = Clip(name, modality)
+    return operator
--- a/clip.py
+++ b/clip.py
@ -12,42 +12,50 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
-@register(output_schema=['vec'])
 import numpy
 import towhee
 import sys
 from pathlib import Path
+import torch
 from torchvision import transforms
-from towhee.types.image_utils import to_pil

+from towhee.types.image_utils import to_pil
 from towhee.operator.base import NNOperator, OperatorFlag
 from towhee.types.arg import arg, to_image_color
 from towhee import register

+
@register(output_schema=['vec'])
 class Clip(NNOperator): 
     """
     CLIP multi-modal embedding operator
     """
-    def __init__(self, modality: str):
-        self._modality = modality
+    def __init__(self, name: str, modality: str):
+        """
+        Load the CLIP model and remember which modality to embed.
+
+        Args:
+            name: model identifier forwarded to clip_impl.load.
+            modality: 'image' or 'text'; selects the encoder used by __call__.
+        """
+        # clip_impl imports its sibling modules (clip_model, simple_tokenizer)
+        # by absolute name, so this directory must be on sys.path first.
+        sys.path.append(str(Path(__file__).parent))
+        import clip_impl
+        self.modality = modality
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self._model, self.preprocess = clip_impl.load(name, self.device)
+        self.tokenize = clip_impl.tokenize

     def __call__(self, data):
-        if self._modality == 'image'
-            emb = self._inference_from_image(data)
-        elif self._modality == 'text' 
-            emb = self._inference_from_text(data)
-        else 
-            raise ValueError("modality[{}] not implemented.".format(self._modality))
+        """
+        Embed `data` with the encoder selected by `self.modality`.
+
+        Raises:
+            ValueError: if the modality is neither 'image' nor 'text'.
+        """
+        if self.modality == 'image':
+            vec = self._inference_from_image(data)
+        elif self.modality == 'text':
+            vec = self._inference_from_text(data)
+        else:
+            # The attribute is `modality` (no underscore) since this change;
+            # the old `_modality` name would raise AttributeError here.
+            raise ValueError("modality[{}] not implemented.".format(self.modality))
+        return vec

     def _inference_from_text(self, text):
-        return text
+        """Tokenize `text` and return the model's encode_text output."""
+        text = self.tokenize(text).to(self.device)
+        text_features = self._model.encode_text(text)
+        return text_features

     @arg(1, to_image_color('RGB'))
     def _inference_from_image(self, img):
-        return img
-
-
-
+        """Preprocess `img` and return the model's encode_image output."""
+        # unsqueeze(0) adds a leading dimension of size 1 (presumably the
+        # batch axis expected by the model).
+        image = self.preprocess(to_pil(img)).unsqueeze(0).to(self.device)
+        image_features = self._model.encode_image(image)
+        return image_features

--- a/clip_impl.py
+++ b/clip_impl.py
@ -10,8 +10,8 @@ from PIL import Image
 from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
 from tqdm import tqdm

-from .model import build_model
-from .simple_tokenizer import SimpleTokenizer as _Tokenizer
+from clip_model import build_model
+from simple_tokenizer import SimpleTokenizer as _Tokenizer

 try:
    from torchvision.transforms import InterpolationMode
--- a/clip_model.py
+++ b/clip_model.py