# Video-Text Retrieval Embedding with MDMMT

*author: Chen Zhang*

<br />

## Description

This operator extracts features for video or text with [MDMMT: Multidomain Multimodal Transformer for Video Retrieval](https://arxiv.org/pdf/2103.10699.pdf), which generates embeddings for text and video by jointly training a video encoder and a text encoder to maximize the cosine similarity between matching video-text pairs.

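Because the two encoders map into a shared space, retrieval comes down to ranking candidates by cosine similarity between their embeddings. The snippet below is a minimal sketch of that ranking step; the embedding dimension and the random vectors are placeholders standing in for real operator outputs.

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Cosine similarity between two embedding vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

# Placeholder embeddings; in practice these come from the mdmmt operator
# (one video embedding and several candidate caption embeddings).
dim = 512  # illustrative only
video_emb = np.random.rand(dim).astype(np.float32)
text_embs = [np.random.rand(dim).astype(np.float32) for _ in range(3)]

# Rank candidate captions by similarity to the video embedding.
scores = [cosine_similarity(video_emb, t) for t in text_embs]
best = int(np.argmax(scores))
print(f"best caption index: {best}, score: {scores[best]:.4f}")
```

In a real pipeline the video side comes from the video-modality operator and the text side from the text-modality operator, both loaded from the same checkpoint.
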
<br />

## Code Example

Load video embeddings extracted from different upstream expert networks, such as video, RGB, and audio experts, to generate a video embedding.

Read the text to generate a text embedding.

*Write the pipeline code*:

```python
import towhee
import torch

torch.manual_seed(42)

# features are embeddings extracted from the upstream expert models.
features = {
    "VIDEO": torch.rand(30, 2048),
    "CLIP": torch.rand(30, 512),
    "tf_vggish": torch.rand(30, 128),
}

# features_t holds the timestamps of the features, usually uniformly sampled.
features_t = {
    "VIDEO": torch.linspace(1, 30, steps=30),
    "CLIP": torch.linspace(1, 30, steps=30),
    "tf_vggish": torch.linspace(1, 30, steps=30),
}

# features_ind is the mask of the features (1 = valid, 0 = padding).
features_ind = {
    "VIDEO": torch.as_tensor([1] * 25 + [0] * 5),
    "CLIP": torch.as_tensor([1] * 25 + [0] * 5),
    "tf_vggish": torch.as_tensor([1] * 25 + [0] * 5),
}

video_input_dict = {"features": features, "features_t": features_t, "features_ind": features_ind}

towhee.dc([video_input_dict]).video_text_embedding.mdmmt(modality='video', device='cpu').show()

towhee.dc(['Hello world.']).video_text_embedding.mdmmt(modality='text', device='cpu').show()
```

*Write the same pipeline with explicit input and output name specifications:*

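The sketch below assumes Towhee's named-schema DataCollection syntax (column names in brackets plus `select`); the column names `'vid'`, `'text'` and `'vec'` are only illustrative.

```python
import towhee
import torch

torch.manual_seed(42)

# Reuse the same placeholder inputs as in the block above.
features = {
    "VIDEO": torch.rand(30, 2048),
    "CLIP": torch.rand(30, 512),
    "tf_vggish": torch.rand(30, 128),
}
features_t = {k: torch.linspace(1, 30, steps=30) for k in features}
features_ind = {k: torch.as_tensor([1] * 25 + [0] * 5) for k in features}
video_input_dict = {"features": features, "features_t": features_t, "features_ind": features_ind}

towhee.dc['vid']([video_input_dict]) \
      .video_text_embedding.mdmmt['vid', 'vec'](modality='video', device='cpu') \
      .select['vec']() \
      .show()

towhee.dc['text'](['Hello world.']) \
      .video_text_embedding.mdmmt['text', 'vec'](modality='text', device='cpu') \
      .select['text', 'vec']() \
      .show()
```

Naming the columns only changes how intermediate results are addressed and displayed; the underlying operator call is the same as in the first example.
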
<br />

## Factory Constructor

Create the operator via the following factory method:

***mdmmt(modality: str)***

**Parameters:**

   ***modality:*** *str*

   Which modality (*video* or *text*) is used to generate the embedding.

   ***weight_path:*** *Optional[str]*

   Path to the pretrained model weights. If not given, the checkpoint bundled with the operator (`mdmmt_3mod.pth`) is used.

   ***device:*** *Optional[str]*

   Device on which to run the model, 'cpu' or 'cuda'. If not given, CUDA is used when available, otherwise CPU.

   ***mmtvid_params:*** *Optional[dict]*

   MMTVID model parameters for building a custom video model.

   ***mmttxt_params:*** *Optional[dict]*

   MMTTXT model parameters for building a custom text model.

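For reference, the operator can also be constructed from the factory and called on its own. The sketch below assumes it resolves through Towhee's `ops` interface under the `video_text_embedding` namespace used in the pipeline examples; the exact resolution call is an assumption, not a documented guarantee.

```python
from towhee import ops

# Assumed direct resolution of the operator through Towhee's `ops` interface;
# the namespace follows the pipeline examples above.
text_op = ops.video_text_embedding.mdmmt(modality='text', device='cpu')

text_vec = text_op('Hello world.')  # numpy.ndarray text embedding
print(text_vec.shape)
```

The same pattern applies for the video modality with `modality='video'` and a feature dict like `video_input_dict` from the code example.
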

<br />

## Interface

For the video modality, load video embeddings extracted from different upstream expert networks, such as video, RGB, and audio experts.
For the text modality, read the text to generate a text embedding.

**Parameters:**

   ***data:*** *dict* or *str*

   The dict of embeddings extracted from different upstream expert networks, or the text string, depending on the specified modality.

**Returns:** *numpy.ndarray*

   The data embedding extracted by the model.

# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .mdmmt import MDMMT


def mdmmt(modality: str, **kwargs):
    return MDMMT(modality, **kwargs)

# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from typing import Dict, Union
from towhee.models.mdmmt.mmt import MMTVID, MMTTXT
from towhee.operator.base import NNOperator
from towhee import register
from pathlib import Path
from transformers.models.bert.modeling_bert import BertModel as TxtBertModel
from transformers import AutoTokenizer

import warnings
warnings.filterwarnings('ignore')

@register(output_schema=['vec'])
class MDMMT(NNOperator):
    """
    MDMMT multi-modal embedding operator.
    """

    def __init__(self, modality: str, weight_path: str = None, device: str = None, mmtvid_params: Dict = None,
                 mmttxt_params: Dict = None):
        super().__init__()
        self.modality = modality
        if weight_path is None:
            # Default to the checkpoint bundled next to this file.
            weight_path = str(Path(__file__).parent / 'mdmmt_3mod.pth')
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device
        self.mmtvid_model = None
        self.mmttxt_model = None
        state = torch.load(weight_path, map_location='cpu')
        if self.modality == 'video':
            if mmtvid_params is None:
                expert_dims = {
                    "VIDEO": {"dim": 2048, "idx": 1, "max_tok": 30},
                    "CLIP": {"dim": 512, "idx": 2, "max_tok": 30},
                    "tf_vggish": {"dim": 128, "idx": 3, "max_tok": 30},
                }
                vid_bert_params = {
                    "vocab_size_or_config_json_file": 10,
                    "hidden_size": 512,
                    "num_hidden_layers": 9,
                    "intermediate_size": 3072,
                    "hidden_act": "gelu",
                    "hidden_dropout_prob": 0.2,
                    "attention_probs_dropout_prob": 0.2,
                    "max_position_embeddings": 32,
                    "type_vocab_size": 19,
                    "initializer_range": 0.02,
                    "layer_norm_eps": 1e-12,
                    "num_attention_heads": 8,
                }

                class Struct:
                    def __init__(self, **entries):
                        self.__dict__.update(entries)

                config = Struct(**vid_bert_params)
                self.mmtvid_model = MMTVID(
                    expert_dims=expert_dims,
                    same_dim=512,
                    hidden_size=512,
                    vid_bert_config=config
                )
            else:
                self.mmtvid_model = MMTVID(**mmtvid_params)
            self.mmtvid_model.load_state_dict(state['vid_state_dict'])
            self.mmtvid_model.to(self.device)
            self.mmtvid_model.eval()
        elif self.modality == 'text':
            if mmttxt_params is None:
                txt_bert_params = {
                    'hidden_dropout_prob': 0.2,
                    'attention_probs_dropout_prob': 0.2,
                }
                self.mmttxt_model = MMTTXT(
                    txt_bert=TxtBertModel.from_pretrained('bert-base-cased', **txt_bert_params),
                    tokenizer=AutoTokenizer.from_pretrained('bert-base-cased'),
                    max_length=30,
                    modalities=["CLIP", "tf_vggish", "VIDEO"],
                    add_special_tokens=True,
                    add_dot=True,
                    same_dim=512,
                    dout_prob=0.2,
                )
            else:
                self.mmttxt_model = MMTTXT(**mmttxt_params)
            self.mmttxt_model.load_state_dict(state['txt_state_dict'])
            self.mmttxt_model.to(self.device)
            self.mmttxt_model.eval()

    def __call__(self, data: Union[Dict, str]):
        if self.modality == 'video':
            # data is a dict: {"features": ..., "features_t": ..., "features_ind": ...}
            vec = self._inference_from_video(**data)
        elif self.modality == 'text':
            # data is a plain string
            vec = self._inference_from_text(data)
        else:
            raise ValueError("modality[{}] not implemented.".format(self.modality))
        return vec

    def _inference_from_text(self, text: str):
        self.mmttxt_model.eval()
        output = self.mmttxt_model([text])
        return output.detach().flatten().cpu().numpy()

    def _inference_from_video(self, features, features_t, features_ind):
        self.mmtvid_model.eval()
        output = self.mmtvid_model(
            features=self._preprocess_video_input(features),
            features_t=self._preprocess_video_input(features_t),
            features_ind=self._preprocess_video_input(features_ind),
            features_maxp=None,
        )
        return output.detach().flatten().cpu().numpy()

    def _preprocess_video_input(self, data: Dict):
        # Add a batch dimension and move each expert tensor to the target device.
        for k, v in data.items():
            data[k] = v.unsqueeze(0).to(self.device)
        return data
transformers
torch
towhee.models
towhee