Add files

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 1c1303dec4
10 changed files with 254 additions and 2 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,4 +1,3 @@
-
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@ -1,2 +1,104 @@
-# actionclip
+# Action Classification with ActionClip

+*Author: [Jael Gu](https://github.com/jaelgu)*
+
+<br />
+
+## Description
+
+An action classification operator generates labels of human activities (with corresponding scores) and extracts features for the input video.
+It transforms the video into frames and loads pre-trained models by model names.
+This operator has implemented pre-trained models from [ActionClip](https://arxiv.org/abs/2109.08472)
+and maps vectors with labels provided by datasets used for pre-training.
+
+<br />
+
+## Code Example
+
+Use the pretrained ActionClip model to classify and generate a vector for the given video path './archery.mp4' 
+([download](https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4)).
+
+ *Write the pipeline in simplified style*:
+
+- Predict labels (default):
+```python
+import towhee
+
+(
+    towhee.glob('./archery.mp4') 
+          .video_decode.ffmpeg()
+          .action_classification.actionclip(model_name='clip_vit_b16')
+          .show()
+)
+```
+
+<img src="./result1.png" width="800px"/>
+
+*Write the same pipeline with explicit input/output name specifications:*
+
+```python
+import towhee
+
+(
+    towhee.glob['path']('./archery.mp4')
+      .video_decode.ffmpeg['path', 'frames']()
+      .action_classification.actionclip['frames', ('labels', 'scores', 'features')](model_name='clip_vit_b16')
+      .select['path', 'labels', 'scores', 'features']()
+      .show(formatter={'path': 'video_path'})
+)
+```
+
+<img src="./result2.png" width="800px"/>
+
+<br />
+
+## Factory Constructor
+
+Create the operator via the following factory method
+
+***action_classification.actionclip(model_name='clip_vit_b16', skip_preprocess=False, classmap=None, topk=5)***
+
+**Parameters:**
+
+	***model_name***: *str*
+
+	The name of pre-trained clip model.
+
+    Supported model names:
+- clip_vit_b16
+- clip_vit_b32
+
+	***skip_preprocess***: *bool*
+
+	Flag to control whether to skip video transforms, defaults to False.
+If set to True, the step to transform videos will be skipped.
+In this case, the user should guarantee that all the input video frames are already preprocessed properly,
+and thus can be fed to model directly.
+
+	***classmap***: *Dict[int, str]*

+	Dictionary that maps class indices to class names (its values are used as the candidate labels).
+If not given, the operator will load the default class map dictionary.
+
+	***topk***: *int*
+
+	The topk labels & scores to present in result. The default value is 5.
+
+## Interface
+
+Given the frames of an input video, the operator generates a list of predicted class labels,
+their corresponding scores, and a video embedding as a numpy.ndarray.
+
+**Parameters:**
+
+	***frames***: *List[VideoFrame]*
+
+	Video frames in towhee.types.video_frame.VideoFrame.
+
+**Returns**:
+
+   ***labels, scores, features***: *Tuple(List[str], List[float], numpy.ndarray)*
+
+- labels: predicted class names.
+- scores: scores of the predicted labels, ranked from high to low.
+- features: a video embedding in shape of (num_features,) representing features extracted by model.
--- a/init.py
+++ b/init.py
@ -0,0 +1,19 @@
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .action_clip import ActionClip
+
+
+def actionclip(**kwargs):
+    return ActionClip(**kwargs)
--- a/action_clip.py
+++ b/action_clip.py
@ -0,0 +1,123 @@
+import logging
+import os
+import json
+from pathlib import Path
+from typing import List, Union
+
+import torch
+import numpy
+
+from towhee import register
+from towhee.types.video_frame import VideoFrame
+from towhee.operator.base import NNOperator
+from towhee.models.utils.video_transforms import transform_video, get_configs
+from towhee.models import action_clip
+
+log = logging.getLogger()
+
+
+@register(output_schema=['label', 'vec'])
+class ActionClip(NNOperator):
+    """
+    Generate a list of class labels given a video input data.
+    Default labels are from [Kinetics400 Dataset](https://deepmind.com/research/open-source/kinetics).
+
+    Args:
+        model_name (`str`):
+            Clip model name to be used in ActionClip
+        weights_path (`str`):
+            Pretrained model weights
+        skip_preprocess (`bool=False`):
+            If or not skip video transforms.
+        classmap (`str=None`):
+            Path of the json file to match class names.
+        topk (`int=5`):
+            The number of classification labels to be returned (ordered by possibility from high to low).
+    """
+    def __init__(self,
+                 model_name: str = 'clip_vit_b16',
+                 weights_path: str = None,
+                 skip_preprocess: bool = False,
+                 classmap: dict = None,
+                 topk: int = 5
+                 ):
+        super().__init__(framework='pytorch')
+        self.device = 'cpu'  # todo: self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.model_name = model_name
+        self.skip_preprocess = skip_preprocess
+        self.topk = topk
+        if classmap is None:
+            class_file = os.path.join(str(Path(__file__).parent), 'kinetics_400.json')
+            with open(class_file, 'r') as f:
+                kinetics_classes = json.load(f)
+            self.classmap = {}
+            for k, v in kinetics_classes.items():
+                self.classmap[v] = str(k).replace('"', '')
+        else:
+            self.classmap = classmap
+
+        if weights_path is None:
+            weights_path = os.path.join(str(Path(__file__).parent), 'saved_model', 'action_' + model_name + '.pth')
+        checkpoints = torch.load(weights_path, map_location=self.device)
+        self.model = action_clip.create_model(
+            clip_model=model_name,
+            pretrained=True,
+            jit=True,
+            checkpoints=checkpoints
+            )
+
+        self.transform_cfgs = get_configs(
+            side_size=224,
+            crop_size=224,
+            num_frames=8,
+            mean=[0.48145466, 0.4578275, 0.40821073],
+            std=[0.26862954, 0.26130258, 0.27577711],
+            )
+
+    def __call__(self, data: Union[List[VideoFrame], List[str]]):
+        """
+        Args:
+            data (`Union[List[str], List[VideoFrame]]`):
+                Input video data or text data
+
+        Returns:
+            - (labels, scores)
+                A tuple of lists (labels, scores).
+            - a video embedding
+            - a list of text embeddings
+
+        """
+        # Convert list of towhee.types.Image to numpy.ndarray in float32
+        video = numpy.stack([img.astype(numpy.float32) / 255. for img in data], axis=0)
+        assert len(video.shape) == 4
+        video = video.transpose(3, 0, 1, 2)  # twhc -> ctwh
+
+        if self.skip_preprocess:
+            self.transform_cfgs.update(num_frames=None)
+        video = transform_video(
+            video=video,
+            **self.transform_cfgs
+            )
+        video = video.to(self.device)[None, ...].transpose(1, 2)
+        visual_features = self.encode_video(video)
+        features = visual_features.to('cpu').squeeze(0).detach().numpy()
+
+        kinetic_classes = list(self.classmap.values())
+        if self.model_name in ['clip_vit_b16', 'clip_vit_b32']:
+            saved_text_features = os.path.join(str(Path(__file__).parent), 'kinetics400_' + self.model_name + '.npz')
+            text_features = torch.from_numpy(numpy.load(saved_text_features)['arr_0'])
+        else:
+            text_features = self.encode_text(kinetic_classes)
+
+        num_text_aug = int(text_features.size(0) / len(kinetic_classes))
+        similarity = action_clip.get_similarity(text_features, visual_features, num_text_augs=num_text_aug)
+        values_k, indices_k = similarity.topk(self.topk, dim=-1)
+        labels = [kinetic_classes[int(i)] for i in indices_k[0]]
+        scores = [round(float(x), 5) for x in values_k[0]]
+        return labels, scores, features
+
+    def encode_text(self, text: List[str]):
+        return self.model.encode_text(text)
+
+    def encode_video(self, video: List[VideoFrame]):
+        return self.model.encode_video(video)
--- a/kinetics400_clip_vit_b16.npz
+++ b/kinetics400_clip_vit_b16.npz
--- a/kinetics_400.json
+++ b/kinetics_400.json
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
+# towhee
+# towhee.models
+torch
+torchvision
+scipy
--- a/result1.png
+++ b/result1.png
--- a/result2.png
+++ b/result2.png
--- a/saved_model/action_clip_vit_b16.pth
+++ b/saved_model/action_clip_vit_b16.pth