diff --git a/README.md b/README.md
index cb0cfea..39e773f 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,111 @@
-# clip4clip
+# Video-Text Retrieval Embedding with CLIP4Clip
+
+*author: Chen Zhang*
+
+
+<br />
+
+
+
+## Description
+
+This operator extracts features for video or text with [CLIP4Clip](https://arxiv.org/abs/2104.08860), which generates embeddings for text and video by jointly training a video encoder and a text encoder to maximize the cosine similarity between matching pairs.
+
+
+<br />
+
+
+## Code Example
+
+Load a video from path './demo_video.mp4' to generate a video embedding.
+
+Read the text 'kids feeding and playing with the horse' to generate a text embedding.
+
+Each video is uniformly subsampled to 12 frames, matching the operator's internal `max_frames`.
+
+*Write the pipeline in simplified style*:
+
+```python
+import towhee
+
+towhee.dc(['./demo_video.mp4']) \
+    .video_decode.ffmpeg(sample_type='uniform_temporal_subsample', args={'num_samples': 12}) \
+    .runas_op(func=lambda x: [y[0] for y in x]) \
+    .clip4clip(model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1') \
+    .show()
+
+towhee.dc(['kids feeding and playing with the horse']) \
+    .clip4clip(model_name='clip_vit_b32', modality='text', weight_path='./pytorch_model.bin.1') \
+    .show()
+```
+![](vect_simplified_video.png)
+![](vect_simplified_text.png)
+
+*Write the same pipeline with explicit input/output name specifications:*
+
+```python
+import towhee
+
+towhee.dc['path'](['./demo_video.mp4']) \
+    .video_decode.ffmpeg['path', 'frames'](sample_type='uniform_temporal_subsample', args={'num_samples': 12}) \
+    .runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x]) \
+    .clip4clip['frames', 'vec'](model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1') \
+    .show()
+
+towhee.dc['text'](["kids feeding and playing with the horse"]) \
+    .clip4clip['text', 'vec'](model_name='clip_vit_b32', modality='text', weight_path='./pytorch_model.bin.1') \
+    .select['text', 'vec']() \
+    .show()
+```
+
+![](vect_explicit_video.png)
+![](vect_explicit_text.png)
+
+<br />
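+
+Since both modalities are embedded into the same space, a text query can be scored against a video with cosine similarity. The snippet below is a minimal sketch (not part of the operator); the names `video_vec` and `text_vec` stand for the two `numpy.ndarray` embeddings produced above:
+
+```python
+import numpy as np
+
+def cosine_similarity(video_vec: np.ndarray, text_vec: np.ndarray) -> float:
+    # The operator L2-normalizes its outputs, so the dot product alone is
+    # already the cosine similarity; dividing by the norms keeps the sketch
+    # valid for un-normalized vectors as well.
+    return float(np.dot(video_vec, text_vec)
+                 / (np.linalg.norm(video_vec) * np.linalg.norm(text_vec)))
+```
+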
+
+
+
+## Factory Constructor
+
+Create the operator via the following factory method:
+
+***clip4clip(model_name, modality, weight_path)***
+
+**Parameters:**
+
+***model_name:*** *str*
+
+The name of the CLIP model. Supported model names:
+- clip_vit_b32
+
+
+***modality:*** *str*
+
+Which modality (*video* or *text*) is used to generate the embedding.
+
+***weight_path:*** *str*
+
+Path to the pretrained model weights.
+
+<br />
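+
+Outside of a towhee pipeline, the operator class in this repository can also be constructed directly with the same arguments. A minimal sketch (the module path and the weight file location are assumptions):
+
+```python
+from clip4clip import CLIP4Clip  # operator class defined in this repository
+
+op = CLIP4Clip(model_name='clip_vit_b32', modality='text',
+               weight_path='./pytorch_model.bin.1')
+vec = op('kids feeding and playing with the horse')
+print(vec.shape)  # 1-D numpy array, e.g. (512,) for clip_vit_b32
+```
+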
+
+
+
+## Interface
+
+A video-text embedding operator takes a list of [towhee images](link/to/towhee/image/api/doc) or a string as input and generates an embedding as a numpy ndarray.
+
+
+**Parameters:**
+
+***data:*** *List[towhee.types.Image]* or *str*
+
+The data to embed: a list of images uniformly subsampled from a video, or a text string, depending on the specified modality.
+
+
+
+**Returns:** *numpy.ndarray*
+
+The data embedding extracted by the model.
+
+
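+For the *video* modality, the list of towhee images can be produced by the same decode step used in the pipelines above. A minimal sketch (the standalone use of `ops.video_decode.ffmpeg` and the file paths are assumptions):
+
+```python
+from towhee import ops
+from clip4clip import CLIP4Clip  # operator class defined in this repository
+
+# Uniformly subsample 12 frames, matching the operator's internal max_frames.
+decode = ops.video_decode.ffmpeg(sample_type='uniform_temporal_subsample',
+                                 args={'num_samples': 12})
+frames = [frame[0] for frame in decode('./demo_video.mp4')]
+
+op = CLIP4Clip(model_name='clip_vit_b32', modality='video',
+               weight_path='./pytorch_model.bin.1')
+vec = op(frames)
+print(vec.shape)  # 1-D numpy array embedding the whole clip
+```
+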
diff --git a/clip4clip.py b/clip4clip.py
index d66af59..74a4ecd 100644
--- a/clip4clip.py
+++ b/clip4clip.py
@@ -11,31 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import random
-import sys
-from pathlib import Path
 import numpy as np
 import torch
 import towhee
+
+from typing import List, Union
 from torchvision import transforms
 from towhee.models.clip4clip import convert_tokens_to_id
-
-from towhee.types.image_utils import to_pil
-from towhee.operator.base import NNOperator, OperatorFlag
-from towhee.types.arg import arg, to_image_color
+from towhee.operator.base import NNOperator
 from towhee import register
 from towhee.models import clip4clip
-from towhee.utils.ndarray_utils import to_ndarray
 from PIL import Image as PILImage
+from towhee.types.image import Image
 
 
-@register(name='clip4clip', output_schema=['vec'])
+@register(output_schema=['vec'])
 class CLIP4Clip(NNOperator):
     """
-    CLIP multi-modal embedding operator
+    CLIP4Clip multi-modal embedding operator
     """
+
     def __init__(self, model_name: str, modality: str, weight_path: str = None):
         super().__init__()
         self.modality = modality
@@ -52,11 +48,11 @@ class CLIP4Clip(NNOperator):
             transforms.CenterCrop(224),
             transforms.ToTensor(),
             transforms.Normalize(
-                (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
-        ])
+                (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+            ])
         self.model.eval()
 
-    def __call__(self, data):
+    def __call__(self, data: Union[str, List[Image]]):
         if self.modality == 'video':
             vec = self._inference_from_video(data)
         elif self.modality == 'text':
@@ -64,43 +60,33 @@ class CLIP4Clip(NNOperator):
         else:
             raise ValueError("modality[{}] not implemented.".format(self._modality))
         return vec
-    #
-    def _inference_from_text(self, text):
+
+    def _inference_from_text(self, text: str):
         self.model.eval()
-        # text = self.tokenize(text)
         text_ids = convert_tokens_to_id(self.tokenize, text)
-        print(text_ids)
         text_ids = torch.tensor(text_ids).unsqueeze(0).to(self.device)
         text_features = self.model.get_sequence_output(text_ids)
         text_features = text_features / text_features.norm(dim=-1, keepdim=True)
-        # print(text_features.norm(dim=-1, keepdim=True))
-        return text_features#.unsqueeze(0).cpu().numpy()
+        return text_features.detach().flatten().cpu().numpy()
 
-    def _inference_from_video(self, img_list):
+    def _inference_from_video(self, img_list: List[Image]):
         self.model.eval()
-        # video = self.tfms(video)
         max_frames = 12
-        video = np.zeros((1, max_frames, 1, 3, 224, 224), dtype=np.float)
+        video = np.zeros((1, max_frames, 1, 3, 224, 224), dtype=np.float64)
         slice_len = len(img_list)
         max_video_length = 0 if 0 > slice_len else slice_len
         for i, img in enumerate(img_list):
             pil_img = PILImage.fromarray(img.to_ndarray(), img.mode)
             tfmed_img = self.tfms(pil_img).unsqueeze(0).to(self.device)
-            print('tfmed_img.shape', tfmed_img.shape)
-
             if slice_len >= 1:
                 video[0, i, ...] = tfmed_img
 
-        video_mask = np.zeros((1, max_frames), dtype=np.long)
+        video_mask = np.zeros((1, max_frames), dtype=np.int32)
         video_mask[0, :max_video_length] = [1] * max_video_length
         video = torch.as_tensor(video).float()
         pair, bs, ts, channel, h, w = video.shape
         video = video.view(pair * bs * ts, channel, h, w)
 
         video_mask = torch.as_tensor(video_mask).float()
-        # video_list.append(video)
-        # video_mask_list.append(video_mask)
-        # video_list_tensor = torch.cat(video_list, dim=0)
-        # video_mask_list_tensor = torch.cat(video_mask_list, dim=0)
 
         visual_output = self.model.get_visual_output(video, video_mask, shaped=True)
 
@@ -114,44 +100,4 @@ class CLIP4Clip(NNOperator):
 
         visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
 
-        return visual_output#.unsqueeze(0).cpu().numpy()
-
-
-    #
-    # @arg(1, to_image_color('RGB'))
-    # def _inference_from_video(self, img):
-    #     img = to_pil(img)
-    #     image = self.tfms(img).unsqueeze(0).to(self.device)
-    #     image_features = self.model.encode_image(image)
-    #     return image_features
-if __name__ == '__main__':
-    # op = CLIP4Clip('clip_vit_b32', 'text', './pytorch_model.bin.1')
-    # res = op('kids feeding and playing with the horse')
-    # print(res.shape)
-
-
-
-
-    # from towhee import ops
-    # op = CLIP4Clip('clip_vit_b32', 'video', './pytorch_model.bin.1')
-    # d = ops.video_decode.ffmpeg(sample_type='uniform_temporal_subsample',
-    #                             args={'num_samples': 12})
-    # # ops.video_decode.get_video_duration()
-    video_path = '/Users/zilliz/dataset/MSRVTT/MSRVTT/videos/all/video9451.mp4'
-    # img_list = []
-    # for frame in d(video_path):
-    #     print(frame)
-    #     img_list.append(frame[0])
-    # res = op(img_list)
-    # print(res.shape)
-
-    dc = (
-        towhee.dc['path']([video_path])
-        .video_decode.ffmpeg['path', 'frames'](
-            sample_type='uniform_temporal_subsample',
-            args={'num_samples': 12})
-        .runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x])
-        .clip4clip['frames', 'vec'](model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1')
-    )
-    dc.show()
-
+        return visual_output.detach().flatten().cpu().numpy()
diff --git a/demo_video.mp4 b/demo_video.mp4
new file mode 100755
index 0000000..e6fb645
Binary files /dev/null and b/demo_video.mp4 differ
diff --git a/vect_explicit_text.png b/vect_explicit_text.png
new file mode 100644
index 0000000..9569f6f
Binary files /dev/null and b/vect_explicit_text.png differ
diff --git a/vect_explicit_video.png b/vect_explicit_video.png
new file mode 100644
index 0000000..c534085
Binary files /dev/null and b/vect_explicit_video.png differ
diff --git a/vect_simplified_text.png b/vect_simplified_text.png
new file mode 100644
index 0000000..7ce111d
Binary files /dev/null and b/vect_simplified_text.png differ
diff --git a/vect_simplified_video.png b/vect_simplified_video.png
new file mode 100644
index 0000000..a4a3235
Binary files /dev/null and b/vect_simplified_video.png differ