From 9c2ed27ce0c0f5ca74c993ae23d4324da3051337 Mon Sep 17 00:00:00 2001
From: video-decode <video-decode@towhee.io>
Date: Fri, 20 May 2022 14:38:46 +0800
Subject: [PATCH] update

---
 README.md        | 50 +++++++++++++++++++++++++++++++++++-------------
 video_decoder.py | 38 +++++++++++++++++++++++++-----------
 2 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 7a097e8..7c24f0c 100644
--- a/README.md
+++ b/README.md
@@ -23,10 +23,15 @@ Args:
 -  sample_type: str
   
         uniform_temporal_subsample
+        time_step_sample
 
 -  args: dict
-  
+   sample_type is `uniform_temporal_subsample`
       num_samples: int
+
+   sample_type is `time_step_sample`
+      time_step: int  (seconds between sampled frames)
+    
 ```      
 
 
@@ -41,7 +46,7 @@ Args:
 
 Returns:
 
-- Image
+- towhee.types.video_frame.VideoFrame
 
 ## Requirements
 av
@@ -56,18 +61,37 @@ d = ops.video_decode.ffmpeg(start_time=10.0, end_time=20.0, sample_type='uniform
 for frame in d(video_path):
     print(frame)
 
+print('#' * 50)
+
+d = ops.video_decode.ffmpeg(start_time=10.0, end_time=20.0, sample_type='time_step_sample', args={'time_step': 1})
+for frame in d(video_path):
+    print(frame)
+
+
 result:
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776310>, TIMESTAMP=10010)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776700>, TIMESTAMP=11078)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776310>, TIMESTAMP=12145)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776700>, TIMESTAMP=13280)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776310>, TIMESTAMP=14348)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776700>, TIMESTAMP=15482)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776310>, TIMESTAMP=16550)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776700>, TIMESTAMP=17684)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776310>, TIMESTAMP=18752)
-Outputs(image=<towhee.types.image.Image object at 0x7fa444776700>, TIMESTAMP=19887)
-```
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 10010, key_frame: 1
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 11078, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 12145, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 13280, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 14348, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 15482, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 16550, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 17684, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 18752, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 19887, key_frame: 0
+##################################################
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 10010, key_frame: 1
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 11011, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 12012, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 13013, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 14014, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 15015, key_frame: 1
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 16015, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 17017, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 18018, key_frame: 0
+VideoFrame shape: (360, 480, 3), mode: RGB, timestamp: 19019, key_frame: 0
+```
+
 
 ## Reference
 
diff --git a/video_decoder.py b/video_decoder.py
index e28a950..2ee17de 100644
--- a/video_decoder.py
+++ b/video_decoder.py
@@ -8,16 +8,16 @@ import logging
 import av
 import numpy as np
 
-from towhee.types.image import Image
+from towhee.types.video_frame import VideoFrame
 from towhee.operator.base import PyOperator
 
-VideoOutput = NamedTuple("Outputs", [("image", Image), ("TIMESTAMP", int)])
 
 logger = logging.getLogger()
 
 
 class SAMPLE_TYPE:
     UNIFORM_TEMPORAL_SUBSAMPLE = 'uniform_temporal_subsample'
+    TIME_STEP_SAMPLE = 'time_step_sample'
 
 
 class VideoDecoder(PyOperator):
@@ -35,17 +35,22 @@ class VideoDecoder(PyOperator):
 
     def get_sample(self, stream):
         if self._sample_type is None:
-            return self._no_smaple
+            return self._no_sample
         elif self._sample_type.lower() == SAMPLE_TYPE.UNIFORM_TEMPORAL_SUBSAMPLE:
             duration = VideoDecoder.get_video_duration(stream)
             end_time = self._end_time if self._end_time is not None and self._end_time <= duration else duration
             start_time = self._start_time if self._start_time is not None else 0
             nums = int(stream.rate * (end_time - start_time))
             return partial(self._uniform_temporal_subsample, total_frames=nums)
+        elif self._sample_type.lower() == SAMPLE_TYPE.TIME_STEP_SAMPLE:
+            duration = VideoDecoder.get_video_duration(stream)
+            start_time = self._start_time if self._start_time is not None else 0
+            end_time = self._end_time if self._end_time is not None and self._end_time <= duration else duration
+            return partial(self._time_step_sample, start_time=start_time, end_time=end_time)
         else:
             raise RuntimeError('Unkown sample type: %s' % self._sample_type)
 
-    def _no_smaple(self, frame_iter):
+    def _no_sample(self, frame_iter):
         if self._end_time is None:
             yield from frame_iter
         else:
@@ -53,6 +58,20 @@ class VideoDecoder(PyOperator):
                 frame.time < self._end_time
                 yield frame
 
+    def _time_step_sample(self, frame_iter, start_time, end_time):
+        """Yield the first frame at/after each time_step tick, from start_time until end_time."""
+        time_step = self._args.get('time_step')
+        if time_step is None:
+            raise RuntimeError('time_step_sample sample lost args time_step')
+        time_index = start_time
+        for frame in frame_iter:
+            # Use the end_time argument (caller clamps it to the duration); self._end_time may be None.
+            if time_index >= end_time:
+                break
+            if frame.time >= time_index:
+                time_index += time_step
+                yield frame
+
     def _uniform_temporal_subsample(self, frame_iter, total_frames):
         num_samples = self._args.get('num_samples')
         if num_samples is None:
@@ -101,9 +120,6 @@ class VideoDecoder(PyOperator):
     def __call__(self, video_path: str) -> Generator:
         with av.open(video_path) as container:
             stream = container.streams.video[0]
-            width = stream.width
-            height = stream.height
-            channel = 3
             image_format = 'RGB'
 
             frame_gen = VideoDecoder._decdoe(stream, container, self._start_time)
@@ -111,8 +127,8 @@ class VideoDecoder(PyOperator):
             for frame in sample_function(frame_gen):
                 timestamp = int(frame.time * 1000)
                 ndarray = frame.to_ndarray(format='rgb24')
-                img = Image(ndarray.tobytes(), width, height, channel, image_format, None, key_frame=frame.key_frame)
-                yield VideoOutput(img, timestamp)
+                img = VideoFrame(ndarray, image_format, timestamp, frame.key_frame)
+                yield img
 
 
 
@@ -123,10 +139,10 @@ class VideoDecoder(PyOperator):
 #     video_path3 = "/home/zhangchen/zhangchen_workspace/dataset/MSRVTT/msrvtt_data/MSRVTT_Videos/video9991.mp4"
 
 #     def d(video_path):
-#         d = VideoDecoder(10, 11, 'uniform_temporal_subsample', {'num_samples': 30})
+#         d = VideoDecoder(10, 17, 'time_step_sample', {'time_step': 1})
 #         fs = d(video_path)
 #         for f in fs:
-#             print(f.TIMESTAMP)
+#             print(f.mode, f.key_frame, f.timestamp)
 
 #     d(video_path1)
 #     print('#' * 100)