From 9c2ed27ce0c0f5ca74c993ae23d4324da3051337 Mon Sep 17 00:00:00 2001 From: video-decode Date: Fri, 20 May 2022 14:38:46 +0800 Subject: [PATCH] update --- README.md | 50 +++++++++++++++++++++++++++++++++++------------- video_decoder.py | 38 +++++++++++++++++++++++++----------- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 7a097e8..7c24f0c 100644 --- a/README.md +++ b/README.md @@ -23,10 +23,15 @@ Args: - sample_type: str uniform_temporal_subsample + time_step_sample - args: dict - + sample_type is `uniform_temporal_subsample` num_samples: int + + sample_type is `time_step_sample` + time_step: int + ``` @@ -41,7 +46,7 @@ Args: Returns: -- Image +- towhee.types.video_frame.VideoFrame ## Requirements av @@ -56,18 +61,37 @@ d = ops.video_decode.ffmpeg(start_time=10.0, end_time=20.0, sample_type='uniform for frame in d(video_path): print(frame) +print('#' * 50) + +d = ops.video_decode.ffmpeg(start_time=10.0, end_time=20.0, sample_type='time_step_sample', args={'time_step': 1}) +for frame in d(video_path): + print(frame) + + result: -Outputs(image=, TIMESTAMP=10010) -Outputs(image=, TIMESTAMP=11078) -Outputs(image=, TIMESTAMP=12145) -Outputs(image=, TIMESTAMP=13280) -Outputs(image=, TIMESTAMP=14348) -Outputs(image=, TIMESTAMP=15482) -Outputs(image=, TIMESTAMP=16550) -Outputs(image=, TIMESTAMP=17684) -Outputs(image=, TIMESTAMP=18752) -Outputs(image=, TIMESTAMP=19887) -``` +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 10010, key_frame: 1 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 11078, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 12145, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 13280, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 14348, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 15482, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 16550, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, 
timestamp: 17684, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 18752, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 19887, key_frame: 0 +################################################## +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 10010, key_frame: 1 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 11011, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 12012, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 13013, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 14014, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 15015, key_frame: 1 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 16015, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 17017, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 18018, key_frame: 0 +VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 19019, key_frame: 0 +``` + ## Reference diff --git a/video_decoder.py b/video_decoder.py index e28a950..2ee17de 100644 --- a/video_decoder.py +++ b/video_decoder.py @@ -8,16 +8,16 @@ import logging import av import numpy as np -from towhee.types.image import Image +from towhee.types.video_frame import VideoFrame from towhee.operator.base import PyOperator -VideoOutput = NamedTuple("Outputs", [("image", Image), ("TIMESTAMP", int)]) logger = logging.getLogger() class SAMPLE_TYPE: UNIFORM_TEMPORAL_SUBSAMPLE = 'uniform_temporal_subsample' + TIME_STEP_SAMPLE = 'time_step_sample' class VideoDecoder(PyOperator): @@ -35,17 +35,22 @@ class VideoDecoder(PyOperator): def get_sample(self, stream): if self._sample_type is None: - return self._no_smaple + return self._no_sample elif self._sample_type.lower() == SAMPLE_TYPE.UNIFORM_TEMPORAL_SUBSAMPLE: duration = VideoDecoder.get_video_duration(stream) end_time = self._end_time if self._end_time is not None and self._end_time <= duration else duration start_time = 
self._start_time if self._start_time is not None else 0 nums = int(stream.rate * (end_time - start_time)) return partial(self._uniform_temporal_subsample, total_frames=nums) + elif self._sample_type.lower() == SAMPLE_TYPE.TIME_STEP_SAMPLE: + duration = VideoDecoder.get_video_duration(stream) + start_time = self._start_time if self._start_time is not None else 0 + end_time = self._end_time if self._end_time is not None and self._end_time <= duration else duration + return partial(self._time_step_sample, start_time=start_time, end_time=end_time) else: raise RuntimeError('Unkown sample type: %s' % self._sample_type) - def _no_smaple(self, frame_iter): + def _no_sample(self, frame_iter): if self._end_time is None: yield from frame_iter else: @@ -53,6 +58,20 @@ class VideoDecoder(PyOperator): frame.time < self._end_time yield frame + def _time_step_sample(self, frame_iter, start_time, end_time): + time_step = self._args.get('time_step') + if time_step is None: + raise RuntimeError('time_step_sample sample lost args time_step') + + time_index = start_time + for frame in frame_iter: + if time_index >= end_time: # clamped end from get_sample; self._end_time may be None + break + + if frame.time >= time_index: + time_index += time_step + yield frame + def _uniform_temporal_subsample(self, frame_iter, total_frames): num_samples = self._args.get('num_samples') if num_samples is None: @@ -101,9 +120,6 @@ class VideoDecoder(PyOperator): def __call__(self, video_path: str) -> Generator: with av.open(video_path) as container: stream = container.streams.video[0] - width = stream.width - height = stream.height - channel = 3 image_format = 'RGB' frame_gen = VideoDecoder._decdoe(stream, container, self._start_time) @@ -111,8 +127,8 @@ class VideoDecoder(PyOperator): for frame in sample_function(frame_gen): timestamp = int(frame.time * 1000) ndarray = frame.to_ndarray(format='rgb24') - img = Image(ndarray.tobytes(), width, height, channel, image_format, None, key_frame=frame.key_frame) - yield VideoOutput(img, timestamp) + img = 
VideoFrame(ndarray, image_format, timestamp, frame.key_frame) + yield img @@ -123,10 +139,10 @@ class VideoDecoder(PyOperator): # video_path3 = "/home/zhangchen/zhangchen_workspace/dataset/MSRVTT/msrvtt_data/MSRVTT_Videos/video9991.mp4" # def d(video_path): -# d = VideoDecoder(10, 11, 'uniform_temporal_subsample', {'num_samples': 30}) +# d = VideoDecoder(10, 17, 'time_step_sample', {'time_step': 1}) # fs = d(video_path) # for f in fs: -# print(f.TIMESTAMP) +# print(f.mode, f.key_frame, f.timestamp) # d(video_path1) # print('#' * 100)