ffmpeg
              
                 
                
            
          copied
				 5 changed files with 193 additions and 1 deletions
			
			
		| @ -1,2 +1,44 @@ | |||
| # ffmpeg | |||
| # Operator: video-decoder | |||
| 
 | |||
| Author: JunJie Jiang | |||
| 
 | |||
| ## Overview | |||
| 
 | |||
| 
 | |||
| 
 | |||
| ## Interface | |||
| 
 | |||
| ```python | |||
| __init__(self, key_frame: bool) | |||
| ``` | |||
| 
 | |||
| Args: | |||
| 
 | |||
| - key_frame: | |||
| 
 | |||
|   True: Only return key frame | |||
|   False: Return all image frames of video | |||
| 
 | |||
| ```python | |||
| __call__(self, video_path: str) | |||
| ``` | |||
| 
 | |||
| Args: | |||
| 
 | |||
| - video_path: | |||
|   support local path and http/https url. | |||
| 
 | |||
| Returns: | |||
| 
 | |||
| - Image | |||
| 
 | |||
| ## Requirements | |||
| av | |||
| 
 | |||
| 
 | |||
| ## How it works | |||
| 
 | |||
| 
 | |||
| 
 | |||
| ## Reference | |||
| 
 | |||
|  | |||
| @ -0,0 +1,5 @@ | |||
| from .video_decoder import VideoDecoder | |||
| 
 | |||
| 
 | |||
| def ffmpeg(start_time=None, end_time=None, sample_type=None, args=None): | |||
|     return VideoDecoder(start_time, end_time, sample_type, args) | |||
| @ -0,0 +1 @@ | |||
| av | |||
| @ -0,0 +1,134 @@ | |||
| 
 | |||
| from typing import Generator, NamedTuple | |||
| from functools import partial, reduce | |||
| 
 | |||
| import math | |||
| import logging | |||
| 
 | |||
| import av | |||
| import numpy as np | |||
| 
 | |||
| from towhee.types.image import Image | |||
| from towhee.operator.base import PyOperator | |||
| 
 | |||
| VideoOutput = NamedTuple("Outputs", [("image", Image), ("TIMESTAMP", int)]) | |||
| 
 | |||
| logger = logging.getLogger() | |||
| 
 | |||
| 
 | |||
| class SAMPLE_TYPE: | |||
|     UNIFORM_TEMPORAL_SUBSAMPLE = 'uniform_temporal_subsample' | |||
| 
 | |||
| 
 | |||
| class VideoDecoder(PyOperator): | |||
|     ''' | |||
|     VideoDecoder | |||
|         Return images with RGB format. | |||
|     ''' | |||
| 
 | |||
|     def __init__(self, start_time=None, end_time=None, sample_type=None, args=None) -> None: | |||
|         super().__init__() | |||
|         self._start_time = start_time | |||
|         self._end_time = end_time | |||
|         self._sample_type = sample_type | |||
|         self._args = args if args is not None else {} | |||
| 
 | |||
|     def get_sample(self, stream): | |||
|         if self._sample_type is None: | |||
|             return self._no_smaple | |||
|         elif self._sample_type.lower() == SAMPLE_TYPE.UNIFORM_TEMPORAL_SUBSAMPLE: | |||
|             duration = VideoDecoder.get_video_duration(stream) | |||
|             end_time = self._end_time if self._end_time is not None and self._end_time <= duration else duration | |||
|             start_time = self._start_time if self._start_time is not None else 0 | |||
|             nums = int(stream.rate * (end_time - start_time)) | |||
|             return partial(self._uniform_temporal_subsample, total_frames=nums) | |||
|         else: | |||
|             raise RuntimeError('Unkown sample type: %s' % self._sample_type) | |||
| 
 | |||
|     def _no_smaple(self, frame_iter): | |||
|         if self._end_time is None: | |||
|             yield from frame_iter | |||
|         else: | |||
|             for frame in frame_iter: | |||
|                 frame.time < self._end_time | |||
|                 yield frame | |||
| 
 | |||
|     def _uniform_temporal_subsample(self, frame_iter, total_frames): | |||
|         num_samples = self._args.get('num_samples') | |||
|         if num_samples is None: | |||
|             raise RuntimeError('uniform_temporal_subsample lost args num_samples') | |||
|          | |||
|         indexs = np.linspace(0, total_frames - 1, num_samples).astype('int') | |||
|         cur_index = 0 | |||
|         count = 0 | |||
|         for frame in frame_iter: | |||
|             if cur_index >= len(indexs): | |||
|                 return | |||
| 
 | |||
|             while cur_index < len(indexs) and indexs[cur_index] <= count: | |||
|                 cur_index += 1 | |||
|                 yield frame | |||
|             count += 1 | |||
|      | |||
|     @staticmethod | |||
|     def _decdoe(video, container, start_time): | |||
|         if start_time is not None: | |||
|             start_offset = int(math.floor(start_time * (1 / video.time_base))) | |||
|         else: | |||
|             start_offset = 0 | |||
|         seek_offset = start_offset | |||
|         seek_offset = max(seek_offset - 1, 0) | |||
|         try: | |||
|             container.seek(seek_offset, any_frame=False, backward=True, stream=video) | |||
|         except av.AVError as e: | |||
|             logger.error('Seek to start_time: %s sec failed, the offset is %s, errors: %s' % (start_time, seek_offset, str(e))) | |||
|             raise RuntimeError from e | |||
| 
 | |||
|         for frame in container.decode(video): | |||
|             if frame.time < start_time: | |||
|                 continue | |||
|             yield frame | |||
| 
 | |||
|     def get_video_duration(video): | |||
|         if video.duration is not None: | |||
|             return float(video.duration * video.time_base) | |||
|         elif video.metadata.get('DURATION') is not None: | |||
|             time_str = video.metadata['DURATION'] | |||
|             return reduce(lambda x, y: float(x) * 60 + float(y), time_str.split(':')) | |||
|         else: | |||
|             return None | |||
| 
 | |||
|     def __call__(self, video_path: str) -> Generator: | |||
|         with av.open(video_path) as container: | |||
|             stream = container.streams.video[0] | |||
|             width = stream.width | |||
|             height = stream.height | |||
|             channel = 3 | |||
|             image_format = 'RGB' | |||
| 
 | |||
|             frame_gen = VideoDecoder._decdoe(stream, container, self._start_time) | |||
|             sample_function = self.get_sample(stream) | |||
|             for frame in sample_function(frame_gen): | |||
|                 timestamp = int(frame.time * 1000) | |||
|                 ndarray = frame.to_ndarray(format='rgb24') | |||
|                 img = Image(ndarray.tobytes(), width, height, channel, image_format, None, key_frame=frame.key_frame) | |||
|                 yield VideoOutput(img, timestamp) | |||
| 
 | |||
| 
 | |||
| 
 | |||
| # if __name__ == '__main__': | |||
| #     video_path = "/home/junjie.jiangjjj/workspace/video/[The Rock] [1996] [Trailer] [#2]-16-l-rO5B64.mkv" | |||
| #     video_path1 = "/home/junjie.jiangjjj/workspace/video/'Eagle Eye' Trailer (2008)-_wkqo_Rd3_Q.mp4" | |||
| #     video_path2 = "/home/junjie.jiangjjj/workspace/video/2001 -  A Space Odyssey - Trailer [1968] HD-Z2UWOeBcsJI.webm" | |||
| #     video_path3 = "/home/zhangchen/zhangchen_workspace/dataset/MSRVTT/msrvtt_data/MSRVTT_Videos/video9991.mp4" | |||
| 
 | |||
| #     def d(video_path): | |||
| #         d = VideoDecoder(10, 11, 'uniform_temporal_subsample', {'num_samples': 30}) | |||
| #         fs = d(video_path) | |||
| #         for f in fs: | |||
| #             print(f.TIMESTAMP) | |||
| 
 | |||
| #     d(video_path1) | |||
| #     print('#' * 100) | |||
| #     d(video_path2)     | |||
| 
 | |||
| @ -0,0 +1,10 @@ | |||
| name: 'video-decoder' | |||
| labels:  | |||
| operator: 'towhee/video-decoder' | |||
| init: | |||
|   key_frame: bool | |||
| call: | |||
|   input: | |||
|     video_path: str | |||
|   output: | |||
|     image: Image | |||
					Loading…
					
					
				
		Reference in new issue
	
	