
add UNIFORM_TEMPORAL_SUBSAMPLE

Signed-off-by: junjie.jiang <junjie.jiang@zilliz.com>
Branch: main
Commit: 31e0e5d491 by junjie.jiang
3 changed files:
- README.md (119)
- requirements.txt (1)
- video_decoder.py (37)

README.md

@@ -1,2 +1,119 @@
-# VPF
+# Operator: video-decoder
Author: JunJie Jiang
## Overview
Decodes video using https://github.com/NVIDIA/VideoProcessingFramework (VPF).
- Users need to install the VPF package themselves.
- GPU decoding only supports H.264, H.265 and VP9; other formats fall back to CPU decoding (see the sketch below).
- GPU-decoded output differs from CPU-decoded output by roughly 4%.
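
As a rough illustration of the codec rule above (a sketch only, assuming PyAV; `can_gpu_decode` is an illustrative helper, not part of this operator):

```python
import av

# Codecs the GPU path accepts, per the note above; PyAV names H.265 'hevc'.
GPU_CODECS = {'h264', 'hevc', 'vp9'}

def can_gpu_decode(video_path: str) -> bool:
    # Probe the container and check the first video stream's codec name.
    with av.open(video_path) as container:
        return container.streams.video[0].codec_context.name in GPU_CODECS
```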
## Interface
```python
__init__(self, gpu_id, start_time=None, end_time=None, sample_type=None, args=None)

Args:
- gpu_id: int
    id of the GPU to use, >= 0
- start_time: float
    start of the clip, in seconds
- end_time: float
    end of the clip, in seconds; frames are decoded from start_time to end_time
- sample_type: str
    one of `uniform_temporal_subsample` or `time_step_sample`
- args: dict
    if sample_type is `uniform_temporal_subsample`:
        num_samples: int
    if sample_type is `time_step_sample`:
        time_step: int
```
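
For intuition, the two modes select frames roughly as follows (a standalone sketch of the index math; `fps`, `start` and `end` are made-up example values):

```python
import numpy as np

fps, start, end = 30.0, 10.0, 20.0              # assumed 10 s clip at 30 fps
total_frames = int(round((end - start) * fps))  # 300

# uniform_temporal_subsample: num_samples evenly spaced frame indices
uniform_idx = np.linspace(0, total_frames - 1, 10).astype(int)
print(uniform_idx)  # [  0  33  66  99 132 166 199 232 265 299]

# time_step_sample: one frame every time_step seconds
step_times = np.arange(start, end, 1.0)
print(step_times)   # [10. 11. 12. 13. 14. 15. 16. 17. 18. 19.]
```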
```python
__call__(self, video_path: str)
```
Args:
- video_path: str
    supports local paths and http/https URLs

Returns:
- a generator of towhee.types.VideoFrame
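
Each yielded `VideoFrame` behaves like an ndarray with extra metadata attached; a minimal consumption sketch (the `mode`, `timestamp` and `key_frame` attribute names are inferred from the sample output in the next section):

```python
from towhee import ops

d = ops.video_decode.ffmpeg(gpu_id=0)
for frame in d('./video.mp4'):
    # shape/mode/timestamp/key_frame, as printed in the sample output below
    print(frame.shape, frame.mode, frame.timestamp, frame.key_frame)
```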
## Requirements
https://github.com/NVIDIA/VideoProcessingFramework
## How it works
```python
from towhee import pipe, ops, DataCollection

p = (
    pipe.input('video_file')
        .flat_map('video_file', 'frame',
                  ops.video_decode.ffmpeg(gpu_id=0, start_time=10.0, end_time=15.0,
                                          sample_type='time_step_sample', args={'time_step': 1}))
        .output('frame')
)

DataCollection(p('./video.mp4')).show(limit=1)
```
```python
from towhee import ops

video_path = './video.mp4'  # local path or http/https URL

d = ops.video_decode.ffmpeg(gpu_id=0, start_time=10.0, end_time=20.0,
                            sample_type='uniform_temporal_subsample', args={'num_samples': 10})
for frame in d(video_path):
    print(frame)

print('#' * 50)

d = ops.video_decode.ffmpeg(gpu_id=0, start_time=10.0, end_time=20.0,
                            sample_type='time_step_sample', args={'time_step': 1})
for frame in d(video_path):
    print(frame)
```

Result:

```
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 10010, key_frame: 1
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 11078, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 12145, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 13280, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 14348, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 15482, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 16550, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 17684, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 18752, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 19887, key_frame: 0
##################################################
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 10010, key_frame: 1
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 11011, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 12012, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 13013, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 14014, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 15015, key_frame: 1
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 16015, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 17017, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 18018, key_frame: 0
VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 19019, key_frame: 0
```

requirements.txt

@@ -1 +1,2 @@
av
+numpy

video_decoder.py

@@ -1,5 +1,6 @@
import logging
import av
import numpy as np
from towhee.operator.base import PyOperator
from cpu_decode import PyAVDecode
@@ -31,7 +32,8 @@ class VideoDecoder(PyOperator):
        super().__init__()
        self._gpu_id = gpu_id
        self._start_time = start_time if start_time is not None else 0
-        self._end_time = end_time * 1000 if end_time is not None else None
+        self._end_time = end_time if end_time is not None else None
+        self._end_time_ms = end_time * 1000 if end_time is not None else None
        self._sample_type = sample_type.lower() if sample_type else None
        self._args = args if args is not None else {}
@@ -61,12 +63,34 @@ class VideoDecoder(PyOperator):
            logger.warn('GPU decode failed, only supports [h264,h265,vp9] format, will use CPU')
            yield from self._cpu_time_step_decode(video_path, time_step)
    def _uniform_temporal_subsample(self, frames, num_samples, total_frames):
        # num_samples evenly spaced frame indices over the whole clip;
        # indices repeat when num_samples > total_frames.
        indexs = np.linspace(0, total_frames - 1, num_samples).astype('int')
        cur_index = 0
        count = 0
        for frame in frames:
            if cur_index >= len(indexs):
                return
            while cur_index < len(indexs) and indexs[cur_index] <= count:
                cur_index += 1
                yield frame
            count += 1
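
A quick standalone check of the selection walk above (illustrative only, with made-up frame counts): it picks exactly `num_samples` evenly spaced frame numbers, and repeats frames when `num_samples` exceeds `total_frames`.

```python
import numpy as np

def pick(total_frames, num_samples):
    # Same walk as _uniform_temporal_subsample, recording picked frame
    # numbers instead of yielding decoded frames.
    indexs = np.linspace(0, total_frames - 1, num_samples).astype('int')
    cur_index, picked = 0, []
    for count in range(total_frames):
        if cur_index >= len(indexs):
            break
        while cur_index < len(indexs) and indexs[cur_index] <= count:
            cur_index += 1
            picked.append(count)
    return picked

print(pick(300, 10))  # [0, 33, 66, 99, 132, 166, 199, 232, 265, 299]
print(pick(3, 6))     # [0, 0, 0, 1, 1, 2] -- frames repeat
```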
    def _filter(self, frames):
        for f in frames:
-            if self._end_time and f.timestamp > self._end_time:
+            if self._end_time_ms and f.timestamp > self._end_time_ms:
                break
            yield f
    def frame_nums(self, video_path):
        with av.open(video_path) as c:
            video = c.streams.video[0]
            start = self._start_time if self._start_time is not None else 0
            # container duration is reported in microseconds
            duration = c.duration / 1000000
            end = self._end_time if self._end_time and self._end_time <= duration else duration
            return int(round((end - start) * video.average_rate))
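
For instance, assuming a 30 fps stream and the README's 10 s window (made-up numbers), the estimate works out as:

```python
start, end, fps = 10.0, 20.0, 30.0       # assumed example values
print(int(round((end - start) * fps)))   # 300 -- frames fed to the subsampler
```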
    def __call__(self, video_path: str):
        if self._sample_type is None:
            yield from self._filter(self.decode(video_path))
@@ -75,7 +99,10 @@ class VideoDecoder(PyOperator):
            if time_step is None:
                raise RuntimeError('time_step_sample sample lost args time_step')
            yield from self._filter(self.time_step_decode(video_path, time_step))
-        elif self._sample_type == SAMPLE_TYPE.TIME_STEP_SAMPLE:
-            pass
+        elif self._sample_type == SAMPLE_TYPE.UNIFORM_TEMPORAL_SUBSAMPLE:
+            num_samples = self._args.get('num_samples')
+            if num_samples is None:
+                raise RuntimeError('uniform_temporal_subsample lost args num_samples')
+            yield from self._uniform_temporal_subsample(self.decode(video_path), num_samples, self.frame_nums(video_path))
        else:
            raise RuntimeError('Unknown sample type, only supports: [%s|%s]' % (SAMPLE_TYPE.TIME_STEP_SAMPLE, SAMPLE_TYPE.UNIFORM_TEMPORAL_SUBSAMPLE))
