
modifty

Branch: main
Author: ChengZi, 2 years ago
Commit: 879cbb40d1
  1. README.md (111 changed lines)
  2. clip4clip.py (88 changed lines)
  3. demo_video.mp4 (BIN)
  4. vect_explicit_text.png (BIN)
  5. vect_explicit_video.png (BIN)
  6. vect_simplified_text.png (BIN)
  7. vect_simplified_video.png (BIN)

README.md (111 changed lines)

@@ -1,2 +1,111 @@
# clip4clip
# Video-Text Retrieval Embedding with CLIP4Clip
*author: Chen Zhang*
<br />
## Description
This operator extracts features for video or text with [CLIP4Clip](https://arxiv.org/abs/2104.08860), which generates embeddings for text and video by jointly training a video encoder and a text encoder to maximize the cosine similarity.
<br />
## Code Example
Load a video from path './demo_video.mp4' to generate a video embedding.
Read the text 'kids feeding and playing with the horse' to generate a text embedding.
*Write the pipeline in simplified style*:
```python
import towhee
towhee.dc(['./demo_video.mp4']) \
.video_decode.ffmpeg(sample_type='uniform_temporal_subsample', args={'num_samples': 12}) \
.runas_op(func=lambda x: [y[0] for y in x]) \
.clip4clip(model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1') \
.show()
towhee.dc(['kids feeding and playing with the horse']) \
.clip4clip(model_name='clip_vit_b32', modality='text', weight_path='./pytorch_model.bin.1') \
.show()
```
![](vect_simplified_video.png)
![](vect_simplified_text.png)
*Write the same pipeline with explicit input/output name specifications:*
```python
import towhee
towhee.dc['path'](['./demo_video.mp4']) \
.video_decode.ffmpeg['path', 'frames'](sample_type='uniform_temporal_subsample', args={'num_samples': 12}) \
.runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x]) \
.clip4clip['frames', 'vec'](model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1') \
.show()
towhee.dc['text'](["kids feeding and playing with the horse"]) \
.clip4clip['text','vec'](model_name='clip_vit_b32', modality='text', weight_path='./pytorch_model.bin.1') \
.select['text', 'vec']() \
.show()
```
![](vect_explicit_video.png)
![](vect_explicit_text.png)
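The `.show()` calls above only render the results. To work with the embeddings programmatically, the DataCollection can be materialized instead; the following is a minimal sketch, assuming the `to_list()` method of the towhee DataCollection API used above and that each collected entity exposes the `vec` field as an attribute:
```python
import towhee

# Collect the entities instead of rendering them; each entity carries the 'vec'
# field produced by the clip4clip operator.
res = (
    towhee.dc['text'](['kids feeding and playing with the horse'])
    .clip4clip['text', 'vec'](model_name='clip_vit_b32', modality='text', weight_path='./pytorch_model.bin.1')
    .to_list()
)
text_vec = res[0].vec  # numpy.ndarray text embedding
```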
<br />
## Factory Constructor
Create the operator via the following factory method (a usage sketch follows the parameter list):
***clip4clip(model_name, modality, weight_path)***
**Parameters:**
***model_name:*** *str*
​ The model name of CLIP. Supported model names:
- clip_vit_b32
***modality:*** *str*
​ Which modality (*video* or *text*) is used to generate the embedding.
***weight_path:*** *str*
​ Path to the pretrained model weights.
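The following usage sketch mirrors the test code at the bottom of `clip4clip.py`; the checkpoint path `./pytorch_model.bin.1` is only an example and must point to a local CLIP4Clip weight file:
```python
from clip4clip import CLIP4Clip  # assumes this repository is on the Python path

# Text modality: embed a sentence into the shared video-text space.
op = CLIP4Clip(model_name='clip_vit_b32', modality='text', weight_path='./pytorch_model.bin.1')
vec = op('kids feeding and playing with the horse')
print(vec.shape)  # a 1-D numpy.ndarray
```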
<br />
## Interface
A video-text embedding operator takes a list of [towhee images](link/to/towhee/image/api/doc) or a string as input and generates an embedding in ndarray.
**Parameters:**
***data:*** *List[towhee.types.Image]* or *str*
​ The data to embed: a list of images (uniformly subsampled from a video) or a text string, depending on the specified modality.
**Returns:** *numpy.ndarray*
​ The data embedding extracted by the model.
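Because video and text are embedded into the same space, retrieval reduces to comparing the returned vectors. Below is a minimal NumPy sketch; the random vectors stand in for the operator's outputs (with `clip_vit_b32` the embedding dimension is expected to be 512):
```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two 1-D embeddings."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Stand-ins for the video and text embeddings returned by the pipelines above.
video_vec = np.random.rand(512).astype(np.float32)
text_vec = np.random.rand(512).astype(np.float32)

print(cosine_similarity(video_vec, text_vec))  # higher score = closer video-text match
```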

clip4clip.py (88 changed lines)

@@ -11,31 +11,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import sys
from pathlib import Path
import numpy as np
import torch
import towhee
from typing import List, Union
from torchvision import transforms
from towhee.models.clip4clip import convert_tokens_to_id
from towhee.types.image_utils import to_pil
from towhee.operator.base import NNOperator, OperatorFlag
from towhee.types.arg import arg, to_image_color
from towhee.operator.base import NNOperator
from towhee import register
from towhee.models import clip4clip
from towhee.utils.ndarray_utils import to_ndarray
from PIL import Image as PILImage
from towhee.types.image import Image
@register(name='clip4clip', output_schema=['vec'])
@register(output_schema=['vec'])
class CLIP4Clip(NNOperator):
"""
CLIP multi-modal embedding operator
CLIP4Clip multi-modal embedding operator
"""
def __init__(self, model_name: str, modality: str, weight_path: str = None):
super().__init__()
self.modality = modality
@@ -52,11 +48,11 @@ class CLIP4Clip(NNOperator):
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
(0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])
(0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])
self.model.eval()
def __call__(self, data):
def __call__(self, data: Union[str, List[Image]]):
if self.modality == 'video':
vec = self._inference_from_video(data)
elif self.modality == 'text':
@@ -64,43 +60,33 @@ class CLIP4Clip(NNOperator):
else:
raise ValueError("modality[{}] not implemented.".format(self._modality))
return vec
#
def _inference_from_text(self, text):
def _inference_from_text(self, text: str):
self.model.eval()
# text = self.tokenize(text)
text_ids = convert_tokens_to_id(self.tokenize, text)
print(text_ids)
text_ids = torch.tensor(text_ids).unsqueeze(0).to(self.device)
text_features = self.model.get_sequence_output(text_ids)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
# print(text_features.norm(dim=-1, keepdim=True))
return text_features#.unsqueeze(0).cpu().numpy()
return text_features.detach().flatten().cpu().numpy()
def _inference_from_video(self, img_list):
def _inference_from_video(self, img_list: List[Image]):
self.model.eval()
# video = self.tfms(video)
max_frames = 12
video = np.zeros((1, max_frames, 1, 3, 224, 224), dtype=np.float)
video = np.zeros((1, max_frames, 1, 3, 224, 224), dtype=np.float64)
slice_len = len(img_list)
max_video_length = 0 if 0 > slice_len else slice_len
for i, img in enumerate(img_list):
pil_img = PILImage.fromarray(img.to_ndarray(), img.mode)
tfmed_img = self.tfms(pil_img).unsqueeze(0).to(self.device)
print('tfmed_img.shape', tfmed_img.shape)
if slice_len >= 1:
video[0, i, ...] = tfmed_img
video_mask = np.zeros((1, max_frames), dtype=np.long)
video_mask = np.zeros((1, max_frames), dtype=np.int32)
video_mask[0, :max_video_length] = [1] * max_video_length
video = torch.as_tensor(video).float()
pair, bs, ts, channel, h, w = video.shape
video = video.view(pair * bs * ts, channel, h, w)
video_mask = torch.as_tensor(video_mask).float()
# video_list.append(video)
# video_mask_list.append(video_mask)
# video_list_tensor = torch.cat(video_list, dim=0)
# video_mask_list_tensor = torch.cat(video_mask_list, dim=0)
visual_output = self.model.get_visual_output(video, video_mask, shaped=True)
@@ -114,44 +100,4 @@ class CLIP4Clip(NNOperator):
visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
return visual_output#.unsqueeze(0).cpu().numpy()
#
# @arg(1, to_image_color('RGB'))
# def _inference_from_video(self, img):
# img = to_pil(img)
# image = self.tfms(img).unsqueeze(0).to(self.device)
# image_features = self.model.encode_image(image)
# return image_features
if __name__ == '__main__':
# op = CLIP4Clip('clip_vit_b32', 'text', './pytorch_model.bin.1')
# res = op('kids feeding and playing with the horse')
# print(res.shape)
# from towhee import ops
# op = CLIP4Clip('clip_vit_b32', 'video', './pytorch_model.bin.1')
# d = ops.video_decode.ffmpeg(sample_type='uniform_temporal_subsample',
# args={'num_samples': 12})
# # ops.video_decode.get_video_duration()
video_path = '/Users/zilliz/dataset/MSRVTT/MSRVTT/videos/all/video9451.mp4'
# img_list = []
# for frame in d(video_path):
# print(frame)
# img_list.append(frame[0])
# res = op(img_list)
# print(res.shape)
dc = (
towhee.dc['path']([video_path])
.video_decode.ffmpeg['path', 'frames'](
sample_type='uniform_temporal_subsample',
args={'num_samples': 12})
.runas_op['frames', 'frames'](func=lambda x: [y[0] for y in x])
.clip4clip['frames', 'vec'](model_name='clip_vit_b32', modality='video', weight_path='./pytorch_model.bin.1')
)
dc.show()
return visual_output.detach().flatten().cpu().numpy()

BIN demo_video.mp4 (binary file not shown)

BIN vect_explicit_text.png (binary file not shown, 36 KiB)

BIN vect_explicit_video.png (binary file not shown, 36 KiB)

BIN vect_simplified_text.png (binary file not shown, 17 KiB)

BIN vect_simplified_video.png (binary file not shown, 18 KiB)
