modifty

4 years ago · 879cbb40d1
7 changed files with 127 additions and 72 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,111 @@
 # clip4clip
 # Video-Text Retrieval Embedding with CLIP4Clip
 *author: Chen Zhang*
 <br />
 ## Description
 This operator extracts features for video or text with [CLIP4Clip](https://arxiv.org/abs/2104.08860), which generates embeddings for text and video by jointly training a video encoder and a text encoder to maximize the cosine similarity.
 <br />
 ## Code Example
 Load a video from path './demo_video.mp4' to generate a video embedding. 
 Read the text 'kids feeding and playing with the horse' to generate a text embedding. 
 *Write the pipeline in simplified style*:
 ```python
 import towhee
 towhee.dc(['./demo_video.mp4']) \
        .video_decode.ffmpeg(sample_type='uniform_temporal_subsample', args={'num_samples': 12}) \
        .runas_op(func=lambda x: [y[0] for y in x]) \
        .clip4clip(model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1') \
        .show()
 towhee.dc(['kids feeding and playing with the horse']) \
      .clip4clip(model_name='clip_vit_b32', modality='text', weight_path='./pytorch_model.bin.1') \
      .show()
 ```
 ![](vect_simplified_video.png)   
 ![](vect_simplified_text.png)   
 *Write the same pipeline with explicit input/output name specifications:*
 ```python
 import towhee
 towhee.dc['path'](['./demo_video.mp4']) \
        .video_decode.ffmpeg['path', 'frames'](sample_type='uniform_temporal_subsample', args={'num_samples': 12}) \
        .runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x]) \
        .clip4clip['frames', 'vec'](model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1') \
        .show()
 towhee.dc['text'](["kids feeding and playing with the horse"]) \
      .clip4clip['text','vec'](model_name='clip_vit_b32', modality='text', weight_path='./pytorch_model.bin.1') \
      .select['text', 'vec']() \
      .show()
 ```
 ![](vect_explicit_video.png)    
 ![](vect_explicit_text.png)   
 <br />
 ## Factory Constructor
 Create the operator via the following factory method
 ***clip4clip(model_name, modality, weight_path)***
 **Parameters:**
   ***model_name:*** *str*
   The model name of CLIP. Supported model names: 
 - clip_vit_b32
   ***modality:*** *str*
   Which modality (*video* or *text*) is used to generate the embedding. 
   ***weight_path:*** *str*
   Path to the pretrained model weights.  
 <br />
 ## Interface
 A video-text embedding operator takes a list of [towhee image](link/to/towhee/image/api/doc) or a string as input and generates an embedding as an ndarray.
 **Parameters:**
 	***data:*** *List[towhee.types.Image]*  or *str*
  The data to generate an embedding from: a list of images uniformly subsampled from a video, or a text string, depending on the specified modality.
 **Returns:** *numpy.ndarray*
   The data embedding extracted by model.
--- a/clip4clip.py
+++ b/clip4clip.py
@ -11,31 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import random
 import sys
 from pathlib import Path
 import numpy as np
 import torch
 import towhee
 from typing import List, Union
 from torchvision import transforms
 from towhee.models.clip4clip import convert_tokens_to_id
 from towhee.types.image_utils import to_pil
 from towhee.operator.base import NNOperator, OperatorFlag
 from towhee.types.arg import arg, to_image_color
 from towhee.operator.base import NNOperator
 from towhee import register
 from towhee.models import clip4clip
 from towhee.utils.ndarray_utils import to_ndarray
 from PIL import Image as PILImage
 from towhee.types.image import Image
@register(name='clip4clip', output_schema=['vec'])
@register(output_schema=['vec'])
 class CLIP4Clip(NNOperator):
    """
    CLIP multi-modal embedding operator
    CLIP4Clip multi-modal embedding operator
    """
    def __init__(self, model_name: str, modality: str, weight_path: str = None):
        super().__init__()
        self.modality = modality
@ -52,11 +48,11 @@ class CLIP4Clip(NNOperator):
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(
               (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
            ])
                (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])
        self.model.eval()
    def __call__(self, data):
    def __call__(self, data: Union[str, List[Image]]):
        if self.modality == 'video':
            vec = self._inference_from_video(data)
        elif self.modality == 'text':
@ -64,43 +60,33 @@ class CLIP4Clip(NNOperator):
        else:
            raise ValueError("modality[{}] not implemented.".format(self._modality))
        return vec
    #
    # Encode a text string into a unit-length embedding.
    #
    # NOTE(review): this span comes from a diff rendering and appears to
    # interleave the pre-change and post-change lines: there are two `def`
    # headers and two `return` statements. Presumably the first of each pair
    # is the old version and the second the new one -- confirm against the
    # actual file before relying on either.
    def _inference_from_text(self, text):
    def _inference_from_text(self, text: str):
        # Calls eval() on the model before inference (presumably a
        # torch.nn.Module, so this selects evaluation mode -- TODO confirm).
        self.model.eval()
        # text = self.tokenize(text)
        # `convert_tokens_to_id` is imported from towhee.models.clip4clip;
        # presumably it tokenizes `text` with `self.tokenize` and returns a
        # sequence of token ids -- verify against that helper.
        text_ids = convert_tokens_to_id(self.tokenize, text)
        # Debug output of the raw ids.
        print(text_ids)
        # Wrap the ids in a tensor, add a leading dimension of size 1, and move
        # the tensor to self.device.
        text_ids = torch.tensor(text_ids).unsqueeze(0).to(self.device)
        text_features = self.model.get_sequence_output(text_ids)
        # Divide by the norm taken over the last dimension, so each feature
        # vector has unit length.
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        # print(text_features.norm(dim=-1, keepdim=True))
        # First variant: returns the normalized torch tensor as-is.
        return text_features#.unsqueeze(0).cpu().numpy()
        # Second variant: detaches from the autograd graph, flattens to 1-D,
        # moves to CPU and converts to a numpy array.
        return text_features.detach().flatten().cpu().numpy()
    def _inference_from_video(self, img_list):
    def _inference_from_video(self, img_list: List[Image]):
        self.model.eval()
        # video = self.tfms(video)
        max_frames = 12
        video = np.zeros((1, max_frames, 1, 3, 224, 224), dtype=np.float)
        video = np.zeros((1, max_frames, 1, 3, 224, 224), dtype=np.float64)
        slice_len = len(img_list)
        max_video_length = 0 if 0 > slice_len else slice_len
        for i, img in enumerate(img_list):
            pil_img = PILImage.fromarray(img.to_ndarray(), img.mode)
            tfmed_img = self.tfms(pil_img).unsqueeze(0).to(self.device)
            print('tfmed_img.shape', tfmed_img.shape)
            if slice_len >= 1:
                video[0, i, ...] = tfmed_img
        video_mask = np.zeros((1, max_frames), dtype=np.long)
        video_mask = np.zeros((1, max_frames), dtype=np.int32)
        video_mask[0, :max_video_length] = [1] * max_video_length
        video = torch.as_tensor(video).float()
        pair, bs, ts, channel, h, w = video.shape
        video = video.view(pair * bs * ts, channel, h, w)
        video_mask = torch.as_tensor(video_mask).float()
        #     video_list.append(video)
        #     video_mask_list.append(video_mask)
        # video_list_tensor = torch.cat(video_list, dim=0)
        # video_mask_list_tensor = torch.cat(video_mask_list, dim=0)
        visual_output = self.model.get_visual_output(video, video_mask, shaped=True)
@ -114,44 +100,4 @@ class CLIP4Clip(NNOperator):
        visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
        return visual_output#.unsqueeze(0).cpu().numpy()
    #
    # @arg(1, to_image_color('RGB'))
    # def _inference_from_video(self, img):
    #     img = to_pil(img)
    #     image = self.tfms(img).unsqueeze(0).to(self.device)
    #     image_features = self.model.encode_image(image)
    #     return image_features
 # Ad-hoc manual demo, executed only when the module is run as a script.
 # It builds a towhee pipeline over one hard-coded video file and displays
 # the resulting embedding.
 if __name__ == '__main__':
    # Earlier experiment (disabled): call the operator directly in text mode.
    # op = CLIP4Clip('clip_vit_b32', 'text', './pytorch_model.bin.1')
    # res = op('kids feeding and playing with the horse')
    # print(res.shape)
    # Earlier experiment (disabled): decode frames by hand, then call the
    # operator in video mode.
    # from towhee import ops
    # op = CLIP4Clip('clip_vit_b32', 'video', './pytorch_model.bin.1')
    # d = ops.video_decode.ffmpeg(sample_type='uniform_temporal_subsample',
    #                             args={'num_samples': 12})
    # # ops.video_decode.get_video_duration()
    # NOTE(review): machine-specific absolute path; the demo only works where
    # this file exists.
    video_path = '/Users/zilliz/dataset/MSRVTT/MSRVTT/videos/all/video9451.mp4'
    # img_list = []
    # for frame in d(video_path):
    #     print(frame)
    #     img_list.append(frame[0])
    # res = op(img_list)
    # print(res.shape)
    # Pipeline stages, in order:
    #   1. 'path'   -> 'frames': ffmpeg decode with uniform temporal
    #      subsampling, num_samples=12.
    #   2. 'frames' -> 'frames': keep element 0 of every decoded item
    #      (presumably the image part of each decoded frame -- TODO confirm).
    #   3. 'frames' -> 'vec': clip4clip operator in 'video' modality, with
    #      weights loaded from './pytorch_model.bin.1'.
    dc = (
        towhee.dc['path']([video_path])
            .video_decode.ffmpeg['path', 'frames'](
                sample_type='uniform_temporal_subsample',
                args={'num_samples': 12})
            .runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x])
            .clip4clip['frames', 'vec'](model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1')
    )
    # Display the collected results.
    dc.show()
        return visual_output.detach().flatten().cpu().numpy()
--- a/demo_video.mp4
+++ b/demo_video.mp4
--- a/vect_explicit_text.png
+++ b/vect_explicit_text.png
--- a/vect_explicit_video.png
+++ b/vect_explicit_video.png
--- a/vect_simplified_text.png
+++ b/vect_simplified_text.png
--- a/vect_simplified_video.png
+++ b/vect_simplified_video.png