Add files

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
3 years ago · 257075a76b
5 changed files with 314 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,112 @@
-# pytorchvideo
+# Video Classification with Pytorchvideo

+*Author: [Jael Gu](https://github.com/jaelgu)*
+
+<br />
+
+## Description
+
+A video classification operator predicts labels (with their corresponding scores)
+and extracts features for a given input video.
+It preprocesses video frames with video transforms and then loads pre-trained models by model names.
+This operator has implemented pre-trained models from [Pytorchvideo](https://github.com/facebookresearch/pytorchvideo)
+and maps vectors with labels provided by the [Kinetics400 Dataset](https://deepmind.com/research/open-source/kinetics).
+
+<br />
+
+## Code Example
+
+Use the pretrained SLOWFAST model ('slowfast_r50')
+to classify and generate a vector for the given video path './archery.mp4' ([download](https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4)).
+
+ *Write the pipeline in simplified style*:
+
+```python
+import towhee
+
+(
+    towhee.glob('./archery.mp4')
+          .video_decode.ffmpeg()
+          .video_classification.pytorchvideo(model_name='slowfast_r50')
+          .to_list()
+)
+```
+
+*Write the same pipeline with explicit input/output name specifications:*
+
+```python
+import towhee
+
+(
+    towhee.glob['path']('./archery.mp4')
+          .video_decode.ffmpeg['path', 'frames']()
+          .video_classification.pytorchvideo['frames', ('labels', 'scores', 'features')](
+                model_name='slowfast_r50')
+          .select['labels', 'scores', 'features']()
+          .show()
+)
+```
+
+
+<br />
+
+## Factory Constructor
+
+Create the operator via the following factory method
+
+***video_classification.pytorchvideo(
+model_name='x3d_xs', skip_preprocess=False, classmap=None, topk=5)***
+
+**Parameters:**
+
+	***model_name***: *str*
+
+	The name of pre-trained model from pytorchvideo hub.
+
+    Supported model names:
+- c2d_r50
+- i3d_r50
+- slow_r50
+- slowfast_r50
+- slowfast_r101
+- x3d_xs
+- x3d_s
+- x3d_m
+- mvit_base_16x4
+- mvit_base_32x3
+
+	***skip_preprocess***: *bool*
+
+	Flag to control whether to skip UniformTemporalSubsample in video transforms, defaults to False.
+If set to True, the step of UniformTemporalSubsample will be skipped.
+In this case, the user should guarantee that all the input video frames are already preprocessed properly,
+and thus can be fed to model directly.
+
+	***classmap***: *Dict[int, str]*
+
+	Dictionary that maps class indices (as predicted by the model) to class names.
+If not given, the operator will load the default class map dictionary.
+
+	***topk***: *int*
+
+	The topk labels & scores to present in result. The default value is 5.
+
+## Interface
+
+Given video data, the video classification operator predicts a list of class labels with their scores
+and generates a video embedding as a numpy.ndarray.
+
+**Parameters:**
+
+	***frames***: *List[VideoFrame]*
+
+	Video frames in towhee.types.video_frame.VideoFrame.
+
+
+**Returns**:
+
+   ***labels, scores, features***: *Tuple(List[str], List[float], numpy.ndarray)*
+
+- labels: predicted class names.
+- scores: probability scores, ranked from high to low, corresponding to the predicted labels.
+- features: a video embedding in shape of (num_features,) representing features extracted by model.
--- a/__init__.py
+++ b/__init__.py
@ -0,0 +1,19 @@
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .pytorchvideo import PytorchVideo
+
+
+def pytorchvideo(**kwargs):
+    """Factory entry point: build a `PytorchVideo` operator from keyword arguments.
+
+    All keyword arguments are forwarded unchanged to the `PytorchVideo` constructor.
+    """
+    operator = PytorchVideo(**kwargs)
+    return operator
--- a/kinetics_400.json
+++ b/kinetics_400.json
--- a/pytorchvideo.py
+++ b/pytorchvideo.py
@ -0,0 +1,179 @@
+import logging
+import os
+import json
+from pathlib import Path
+from typing import List, Union, Iterable, Callable
+
+import torch
+from torch import nn
+import numpy
+
+from towhee import register
+from towhee.types import VideoFrame
+from towhee.operator.base import NNOperator
+from towhee.models.utils.video_transforms import transform_video
+
+# Module-scoped logger: a library module should not grab (and risk reconfiguring) the root logger.
+log = logging.getLogger(__name__)
+
+
+@register(output_schema=['labels', 'scores', 'features'])
+class PytorchVideo(NNOperator):
+    """
+    Predict class labels (with scores) and extract a feature embedding for a video.
+    Default labels are from [Kinetics400 Dataset](https://deepmind.com/research/open-source/kinetics).
+
+    Args:
+        model_name (`str`):
+            The pretrained model name from torch hub.
+            Supported model names:
+            - c2d_r50
+            - i3d_r50
+            - slow_r50
+            - slowfast_r50
+            - slowfast_r101
+            - x3d_xs
+            - x3d_s
+            - x3d_m
+            - mvit_base_16x4
+            - mvit_base_32x3
+        framework (`str`):
+            Framework name forwarded to the `NNOperator` base class.
+        skip_preprocess (`bool`):
+            If True, `transform_video` is called with `num_frames=None`
+            (presumably disabling temporal subsampling -- confirm against `transform_video`).
+        classmap (`Union[str, dict]=None`):
+            Either a dictionary mapping class indices (`int`) to class names, or the path of a
+            json file mapping class names to class indices (the layout read from the bundled
+            `kinetics_400.json`). If None, the bundled `kinetics_400.json` is used.
+        topk (`int=5`):
+            The number of classification labels to be returned (ordered by probability from high to low).
+    """
+
+    def __init__(
+            self,
+            model_name: str = 'x3d_xs',
+            framework: str = 'pytorch',
+            skip_preprocess: bool = False,
+            classmap: Union[str, dict] = None,
+            topk: int = 5,
+    ) -> None:
+        super().__init__(framework=framework)
+        self.model_name = model_name
+        self.skip_preprocess = skip_preprocess
+        self.topk = topk
+        if classmap is None:
+            # Fall back to the label file shipped next to this module.
+            classmap = Path(__file__).parent / 'kinetics_400.json'
+        if isinstance(classmap, (str, os.PathLike)):
+            # A path: load the json file and invert it into index -> name.
+            self.classmap = self._load_classmap(classmap)
+        else:
+            # A ready-made index -> name mapping is used as given.
+            self.classmap = classmap
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+        self.model = torch.hub.load('facebookresearch/pytorchvideo', model=model_name, pretrained=True)
+        self.model.eval()
+        self.model.to(self.device)
+
+    @staticmethod
+    def _load_classmap(path) -> dict:
+        """
+        Load a json file of `{class name: class index}` and return `{class index: class name}`.
+
+        Args:
+            path (`Union[str, os.PathLike]`):
+                Path of the json file.
+
+        Returns:
+            Dictionary mapping each class index to its class name.
+        """
+        with open(path, 'r', encoding='utf-8') as f:
+            name_to_index = json.load(f)
+        # Any literal double-quote characters inside the names are dropped.
+        return {index: str(name).replace('"', '') for name, index in name_to_index.items()}
+
+    def __call__(self, frames: List[VideoFrame]):
+        """
+        Args:
+            frames (`List[VideoFrame]`):
+                Video frames in towhee.types.video_frame.VideoFrame.
+
+        Returns:
+            labels, scores:
+                Two lists of length `topk`: class names and their softmax scores (rounded to 5 decimals).
+            video embedding:
+                A video embedding in numpy.ndarray.
+
+        Raises:
+            ValueError: if the stacked frames do not form a 4-dimensional array.
+        """
+        # Stack the frames into one float32 array, dividing by 255 (assumes 8-bit pixel values).
+        video = numpy.stack([img.astype(numpy.float32) / 255. for img in frames], axis=0)
+        if video.ndim != 4:
+            # Explicit check instead of `assert`, which is stripped under `python -O`.
+            raise ValueError(f'Expected frames to stack into a 4-d array, got shape {video.shape}')
+        # Move the last axis to the front; presumably (T, H, W, C) -> (C, T, H, W).
+        video = video.transpose(3, 0, 1, 2)
+
+        if self.skip_preprocess:
+            data = transform_video(
+                video=video,
+                model_name=self.model_name,
+                num_frames=None
+            )
+        else:
+            data = transform_video(
+                video=video,
+                model_name=self.model_name
+            )
+        # `[None, ...]` adds a leading batch axis; slowfast models take a list of two tensors.
+        if self.model_name.startswith('slowfast'):
+            inputs = [data[0].to(self.device)[None, ...], data[1].to(self.device)[None, ...]]
+        else:
+            inputs = data.to(self.device)[None, ...]
+
+        # Inference only: do not record an autograd graph for the forward pass.
+        with torch.no_grad():
+            feats, outs = self.new_forward(inputs)
+            preds = torch.softmax(outs, dim=1)
+        features = feats.to('cpu').squeeze(0).detach().numpy()
+
+        pred_scores, pred_classes = preds.topk(k=self.topk)
+        labels = [self.classmap[int(i)] for i in pred_classes[0]]
+        scores = [round(float(x), 5) for x in pred_scores[0]]
+        return labels, scores, features
+
+    def new_forward(self, x: Union[torch.Tensor, list]):
+        """
+        Run the model once and capture the output of an intermediate layer as features.
+
+        For `x3d*` and `mvit*` models the hooked layer is the first child of the last block;
+        for all other models it is the second-to-last block.
+
+        Args:
+            x (`Union[torch.Tensor, list]`):
+                tensor or list of input video after transforms
+
+        Returns:
+            A tuple `(features, outs)`: the flattened features of the hooked layer
+            (average-pooled first when they are 5-dimensional) and the model outputs.
+        """
+        blocks = list(self.model.children())
+        if len(blocks) == 1:
+            # The model wraps all of its blocks in a single container; look inside it.
+            blocks = blocks[0]
+        if self.model_name.startswith(('x3d', 'mvit')):
+            sub_blocks = list(blocks[-1].children())
+            extractor = FeatureExtractor(self.model, sub_blocks, layer=0)
+        else:
+            extractor = FeatureExtractor(self.model, blocks, layer=-2)
+        features, outs = extractor(x)
+        if features.dim() == 5:
+            # Collapse the last three axes to size 1 before flattening.
+            global_pool = nn.AdaptiveAvgPool3d(1)
+            features = global_pool(features)
+        return features.flatten(), outs
+
+    def get_model_name(self):
+        """Return the alphabetically sorted list of supported model names."""
+        return sorted([
+            'c2d_r50',
+            'i3d_r50',
+            'slow_r50',
+            'slowfast_r50',
+            'slowfast_r101',
+            'x3d_xs',
+            'x3d_s',
+            'x3d_m',
+            'mvit_base_16x4',
+            'mvit_base_32x3',
+        ])
+
+
+class FeatureExtractor(nn.Module):
+    """
+    Run a model once and capture the output of one of its layers through a forward hook.
+
+    The hook is registered on `blocks[layer]` at construction time and removed at the end of
+    the first `forward` call, so an instance is meant for a single forward pass.
+
+    Args:
+        model (`nn.Module`):
+            The model to run.
+        blocks (`List[nn.Module]`):
+            Candidate layers of `model`.
+        layer (`int`):
+            Index into `blocks` of the layer whose output is captured.
+    """
+
+    def __init__(self, model: nn.Module, blocks: List[nn.Module], layer: int):
+        super().__init__()
+        self.model = model
+        # Output of the hooked layer; filled in by the hook during `forward`.
+        self.features = None
+
+        target_layer = blocks[layer]
+        self.handler = target_layer.register_forward_hook(self.save_outputs_hook())
+
+    def save_outputs_hook(self) -> Callable:
+        """Return a forward hook that stores the hooked layer's output in `self.features`."""
+        def fn(_, __, output):
+            self.features = output
+        return fn
+
+    def forward(self, x):
+        """
+        Run the wrapped model on `x`.
+
+        Returns:
+            A tuple `(features, outs)`: the captured output of the hooked layer and the model outputs.
+        """
+        try:
+            outs = self.model(x)
+        finally:
+            # Always detach the hook, even when the model raises: the hooked layer belongs to
+            # the caller's model, and a leftover hook would stay attached to it.
+            self.handler.remove()
+        return self.features, outs
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
+# torch>=1.8.0
+# torchvision>=0.9.0
+# pytorchvideo
+# towhee>=0.6.0