diff --git a/README.md b/README.md
index 13470b4..8961bb4 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,119 @@
-# VPF
+# Operator: video-decoder
+
+Author: JunJie Jiang
+
+## Overview
+
+Decode video using https://github.com/NVIDIA/VideoProcessingFramework
+
+- Users need to install the VPF package themselves.
+
+- GPU decode only supports H.264, H.265 and VP9; other codecs fall back to CPU decode.
+
+- Decoded frames differ from the CPU-decode results by roughly 4%.
+
+
+## Interface
+
+```python
+__init__(self, gpu_id, start_time=None, end_time=None, sample_type=None, args=None)
+```
+
+Args:
+
+- gpu_id: int, >= 0
+
+- start_time: float
+
+- end_time: float
+
+  Decode the video from start_time to end_time (in seconds).
+
+- sample_type: str
+
+  One of `uniform_temporal_subsample` or `time_step_sample`.
+
+- args: dict
+
+  If sample_type is `uniform_temporal_subsample`:
+  - num_samples: int
+
+  If sample_type is `time_step_sample`:
+  - time_step: int
+
+
+```python
+__call__(self, video_path: str)
+```
+
+Args:
+
+- video_path: str
+
+  Supports local paths and http/https URLs.
+
+Returns:
+
+- towhee.types.VideoFrame (frames are yielded one at a time)
+
+## Requirements
+
+https://github.com/NVIDIA/VideoProcessingFramework
+
+
+## How it works
+
+```python
+from towhee import pipe, ops, DataCollection
+
+p = (
+    pipe.input('video_file')
+    .flat_map('video_file', 'frame',
+              ops.video_decode.ffmpeg(gpu_id=0, start_time=10.0, end_time=15.0,
+                                      sample_type='time_step_sample', args={'time_step': 1}))
+    .output('frame')
+)
+
+DataCollection(p('./video.mp4')).show(limit=1)
+```
+
+```python
+from towhee import ops
+
+video_path = './video.mp4'
+
+d = ops.video_decode.ffmpeg(gpu_id=0, start_time=10.0, end_time=20.0,
+                            sample_type='uniform_temporal_subsample', args={'num_samples': 10})
+for frame in d(video_path):
+    print(frame)
+
+print('#' * 50)
+
+d = ops.video_decode.ffmpeg(gpu_id=0, start_time=10.0, end_time=20.0,
+                            sample_type='time_step_sample', args={'time_step': 1})
+for frame in d(video_path):
+    print(frame)
+```
+
+Result:
+
+```
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 10010, key_frame: 1
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 11078, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 12145, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 13280, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 14348, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 15482, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 16550, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 17684, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 18752, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 19887, key_frame: 0
+##################################################
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 10010, key_frame: 1
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 11011, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 12012, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 13013, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 14014, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 15015, key_frame: 1
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 16015, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 17017, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 18018, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 19019, key_frame: 0
+```
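+
+Roughly speaking, the two sampling modes differ only in which decoded frames
+are kept. `uniform_temporal_subsample` keeps evenly spaced frame indices
+across the clip, mirroring the `np.linspace` call in `video_decoder.py`. A
+minimal sketch of just that index selection (the frame and sample counts
+below are made-up example values, not operator defaults):
+
+```python
+import numpy as np
+
+# Hypothetical clip: 150 decodable frames, subsampled down to 10.
+total_frames = 150
+num_samples = 10
+
+# Evenly spaced indices with both endpoints included, as in the operator.
+indices = np.linspace(0, total_frames - 1, num_samples).astype(int)
+print(indices)  # [  0  16  33  49  66  82  99 115 132 149]
+```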
+
+
+## Reference
+
+- https://github.com/NVIDIA/VideoProcessingFramework
diff --git a/requirements.txt b/requirements.txt
index dc1ce6e..6b69008 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
 av
+numpy
diff --git a/video_decoder.py b/video_decoder.py
index 9aa3a55..c681fa1 100644
--- a/video_decoder.py
+++ b/video_decoder.py
@@ -1,5 +1,6 @@
 import logging
-
+import av
+import numpy as np
 from towhee.operator.base import PyOperator
 from cpu_decode import PyAVDecode
 
@@ -31,7 +32,8 @@ class VideoDecoder(PyOperator):
         super().__init__()
         self._gpu_id = gpu_id
         self._start_time = start_time if start_time is not None else 0
-        self._end_time = end_time * 1000 if end_time is not None else None
+        self._end_time = end_time
+        self._end_time_ms = end_time * 1000 if end_time is not None else None
         self._sample_type = sample_type.lower() if sample_type else None
         self._args = args if args is not None else {}
 
@@ -61,12 +63,34 @@ class VideoDecoder(PyOperator):
             logger.warning('GPU decode failed, only supports [h264,h265,vp9] format, will use CPU')
             yield from self._cpu_time_step_decode(video_path, time_step)
 
+    def _uniform_temporal_subsample(self, frames, num_samples, total_frames):
+        # Pick num_samples evenly spaced frame indices over the whole clip.
+        indices = np.linspace(0, total_frames - 1, num_samples).astype(int)
+        cur_index = 0
+        count = 0
+        for frame in frames:
+            if cur_index >= len(indices):
+                return
+
+            # Yield the frame once per selected index it covers; indices can
+            # repeat when num_samples exceeds total_frames.
+            while cur_index < len(indices) and indices[cur_index] <= count:
+                cur_index += 1
+                yield frame
+            count += 1
+
     def _filter(self, frames):
         for f in frames:
-            if self._end_time and f.timestamp > self._end_time:
+            if self._end_time_ms and f.timestamp > self._end_time_ms:
                 break
             yield f
 
+    def frame_nums(self, video_path):
+        # Estimate the frame count between start_time and end_time from the
+        # container duration (in microseconds) and the average frame rate.
+        with av.open(video_path) as c:
+            video = c.streams.video[0]
+            start = self._start_time if self._start_time is not None else 0
+            duration = c.duration / 1000000
+            end = self._end_time if self._end_time and self._end_time <= duration else duration
+            return int(round((end - start) * video.average_rate))
+
     def __call__(self, video_path: str):
         if self._sample_type is None:
             yield from self._filter(self.decode(video_path))
@@ -75,7 +99,10 @@
             if time_step is None:
                 raise RuntimeError('time_step_sample requires the arg time_step')
             yield from self._filter(self.time_step_decode(video_path, time_step))
-        elif self._sample_type == SAMPLE_TYPE.TIME_STEP_SAMPLE:
-            pass
+        elif self._sample_type == SAMPLE_TYPE.UNIFORM_TEMPORAL_SUBSAMPLE:
+            num_samples = self._args.get('num_samples')
+            if num_samples is None:
+                raise RuntimeError('uniform_temporal_subsample requires the arg num_samples')
+            yield from self._uniform_temporal_subsample(self.decode(video_path), num_samples, self.frame_nums(video_path))
         else:
             raise RuntimeError('Unknown sample type, only supports: [%s|%s]' % (SAMPLE_TYPE.TIME_STEP_SAMPLE, SAMPLE_TYPE.UNIFORM_TEMPORAL_SUBSAMPLE))
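
Note on the sampling change above: `frame_nums` only estimates the frame count
from the container duration and the stream's average frame rate, so the
index-based subsampling has to tolerate the decoder yielding a few more or
fewer frames than estimated. A quick standalone check of the selection logic
(a local sketch that mirrors `_uniform_temporal_subsample`; plain integers
stand in for decoded frames):

```python
import numpy as np

def uniform_temporal_subsample(frames, num_samples, total_frames):
    # Evenly spaced indices, endpoints included; indices repeat when
    # num_samples > total_frames, so a frame may be yielded more than once.
    indices = np.linspace(0, total_frames - 1, num_samples).astype(int)
    cur_index = 0
    for count, frame in enumerate(frames):
        if cur_index >= len(indices):
            return
        while cur_index < len(indices) and indices[cur_index] <= count:
            cur_index += 1
            yield frame

print(list(uniform_temporal_subsample(range(10), 4, 10)))  # [0, 3, 6, 9]
print(list(uniform_temporal_subsample(range(3), 5, 3)))    # [0, 0, 1, 1, 2]
```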