diff --git a/README.md b/README.md
index 0bd8c96..8bcfe56 100644
--- a/README.md
+++ b/README.md
@@ -25,42 +25,35 @@ Read the text 'kids feeding and playing with the horse' to generate a text embed
 *Write the pipeline in simplified style*:
 ```python
-import towhee
+from towhee.dc2 import pipe, ops, DataCollection
-towhee.dc(['./demo_video.mp4']) \
-      .video_decode.ffmpeg(sample_type='uniform_temporal_subsample', args={'num_samples': 12}) \
-      .runas_op(func=lambda x: [y for y in x]) \
-      .video_text_embedding.drl(base_encoder='clip_vit_b32', modality='video', device='cpu') \
-      .show()
+p = (
+    pipe.input('text') \
+        .map('text', 'vec', ops.video_text_embedding.drl(base_encoder='clip_vit_b32', modality='text', device='cuda:0')) \
+        .output('text', 'vec')
+)
-towhee.dc(['kids feeding and playing with the horse']) \
-      .video_text_embedding.drl(base_encoder='clip_vit_b32', modality='text', device='cpu') \
-      .show()
+DataCollection(p('kids feeding and playing with the horse')).show()
 ```
-![](vect_simplified_video.png)
-![](vect_simplified_text.png)
+![](text_emb_result.png)
-*Write a same pipeline with explicit inputs/outputs name specifications:*
 ```python
-import towhee
-
-towhee.dc['path'](['./demo_video.mp4']) \
-      .video_decode.ffmpeg['path', 'frames'](sample_type='uniform_temporal_subsample', args={'num_samples': 12}) \
-      .runas_op['frames', 'frames'](func=lambda x: [y for y in x]) \
-      .video_text_embedding.drl['frames', 'vec'](base_encoder='clip_vit_b32', modality='video', device='cpu') \
-      .show(formatter={'path': 'video_path'})
-
-towhee.dc['text'](['kids feeding and playing with the horse']) \
-      .video_text_embedding.drl['text','vec'](base_encoder='clip_vit_b32', modality='text', device='cpu') \
-      .select['text', 'vec']() \
-      .show()
-```
+from towhee.dc2 import pipe, ops, DataCollection
+
+p = (
+    pipe.input('video_path') \
+        .map('video_path', 'flame_gen', ops.video_decode.ffmpeg(sample_type='uniform_temporal_subsample', args={'num_samples': 12})) \
+        .map('flame_gen', 'flame_list', lambda x: [y for y in x]) \
+        .map('flame_list', 'vec', ops.video_text_embedding.drl(base_encoder='clip_vit_b32', modality='video', device='cuda:0')) \
+        .output('video_path', 'flame_list', 'vec')
+)
-![](vect_explicit_video.png)
-![](vect_explicit_text.png)
+DataCollection(p('./demo_video.mp4')).show()
+```
+![](video_emb_result.png)
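For context, here is a minimal end-to-end sketch of how the two new-style pipelines in the README could be used together to score a text query against a video. This is an illustration, not part of the patch: it assumes `towhee` with the `dc2` API plus the `video_decode.ffmpeg` and `video_text_embedding.drl` operators are installed, that the pipeline result exposes a `.get()` accessor as in other Towhee examples, and the intermediate column names are my own.

```python
import numpy as np
from towhee.dc2 import pipe, ops

# Text branch: text -> DRL text embedding.
text_pipe = (
    pipe.input('text')
        .map('text', 'vec', ops.video_text_embedding.drl(base_encoder='clip_vit_b32', modality='text', device='cpu'))
        .output('vec')
)

# Video branch: path -> uniformly sampled frames -> DRL video embedding.
video_pipe = (
    pipe.input('video_path')
        .map('video_path', 'frame_gen', ops.video_decode.ffmpeg(sample_type='uniform_temporal_subsample', args={'num_samples': 12}))
        .map('frame_gen', 'frames', lambda x: [y for y in x])
        .map('frames', 'vec', ops.video_text_embedding.drl(base_encoder='clip_vit_b32', modality='video', device='cpu'))
        .output('vec')
)

text_vec = text_pipe('kids feeding and playing with the horse').get()[0]
video_vec = video_pipe('./demo_video.mp4').get()[0]

# Cosine similarity between the text and video embeddings.
t = np.asarray(text_vec).ravel()
v = np.asarray(video_vec).ravel()
score = float(np.dot(t, v) / (np.linalg.norm(t) * np.linalg.norm(v)))
print(score)
```

Both branches run on CPU here so the sketch works without a GPU; swap `device='cpu'` for `device='cuda:0'` to match the README examples.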
diff --git a/drl.py b/drl.py
index 20505b7..4905da3 100644
--- a/drl.py
+++ b/drl.py
@@ -41,7 +41,7 @@ class DRL(NNOperator):
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
         else:
             self.device = device
-        self.model = drl.create_model(base_encoder=base_encoder, pretrained=True, cdcr=0, weights_path=weight_path)
+        self.model = drl.create_model(base_encoder=base_encoder, pretrained=True, cdcr=0, weights_path=weight_path, device=device)
         self.tokenize = clip4clip.SimpleTokenizer()
         self.tfms = transforms.Compose([
diff --git a/text_emb_result.png b/text_emb_result.png
new file mode 100644
index 0000000..a8bf088
Binary files /dev/null and b/text_emb_result.png differ
diff --git a/vect_explicit_text.png b/vect_explicit_text.png
deleted file mode 100644
index 521ea68..0000000
Binary files a/vect_explicit_text.png and /dev/null differ
diff --git a/vect_explicit_video.png b/vect_explicit_video.png
deleted file mode 100644
index 5aabbba..0000000
Binary files a/vect_explicit_video.png and /dev/null differ
diff --git a/vect_simplified_text.png b/vect_simplified_text.png
deleted file mode 100644
index 5e2eacb..0000000
Binary files a/vect_simplified_text.png and /dev/null differ
diff --git a/vect_simplified_video.png b/vect_simplified_video.png
deleted file mode 100644
index ec824f4..0000000
Binary files a/vect_simplified_video.png and /dev/null differ
diff --git a/video_emb_result.png b/video_emb_result.png
new file mode 100644
index 0000000..fb4fc55
Binary files /dev/null and b/video_emb_result.png differ
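The `drl.py` change forwards the caller's `device` argument into `drl.create_model`, so the weights are built on the requested device rather than whatever the factory defaults to. Below is a minimal sketch of that resolve-then-forward pattern, assuming a torch-style device string; `resolve_device` is a hypothetical helper used only for illustration, and the factory call is left as a comment because it needs the operator's own weights file.

```python
import torch

def resolve_device(device=None):
    # Mirrors the fallback shown in the hunk context:
    # an explicit device wins, otherwise prefer CUDA when available.
    if device is None:
        return "cuda" if torch.cuda.is_available() else "cpu"
    return device

device = resolve_device(None)   # e.g. 'cuda' on a GPU machine, else 'cpu'
# The patched factory call then receives the same value, e.g.:
# model = drl.create_model(base_encoder='clip_vit_b32', pretrained=True,
#                          cdcr=0, weights_path=weight_path, device=device)
```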