Browse Source
l
Signed-off-by: xujinling <jinling.xu@zilliz.com>
main
xujinling
3 years ago
6 changed files with
3 additions and
4 deletions
-
README.md
-
bridge_former.py
-
BIN
result1.png
-
BIN
result2.png
-
BIN
result3.png
-
BIN
result4.png
|
|
@ -47,8 +47,7 @@ towhee.dc(['kids feeding and playing with the horse']) \ |
|
|
|
import towhee |
|
|
|
|
|
|
|
towhee.dc['path'](['./demo_video.mp4']) \ |
|
|
|
.video_decode.ffmpeg['path', 'frames'](sample_type='uniform_temporal_subsample', args={'num_samples': 4}) \ |
|
|
|
.runas_op['frames', 'frames'](func=lambda x: [y for y in x]) \ |
|
|
|
.video_decode.ffmpeg['path', 'frames']() \ |
|
|
|
.video_text_embedding.bridge_former['frames', 'vec'](model_name='frozen_model', modality='video') \ |
|
|
|
.select['path', 'vec']() \ |
|
|
|
.show(formatter={'path': 'video_path'}) |
|
|
|
|
|
@ -82,12 +82,12 @@ class BridgeFormer(NNOperator): |
|
|
|
return vec |
|
|
|
|
|
|
|
def _inference_from_text(self, text: List[str]): |
|
|
|
text_data = self.tokenizer(text, return_tensors='pt') |
|
|
|
|
|
|
|
text_data = text_data.to(self.device) |
|
|
|
if self.model_name == "clip_initialized_model": |
|
|
|
text_data = self.tokenizer(text, return_tensors='pt', padding='max_length', max_length=77).to(self.device) |
|
|
|
text_features = self.model.encode_text(text_data["input_ids"]) |
|
|
|
else: |
|
|
|
text_data = self.tokenizer(text, return_tensors='pt').to(self.device) |
|
|
|
text_features = self.model.compute_text(text_data) |
|
|
|
return text_features.squeeze(0).detach().flatten().cpu().numpy() |
|
|
|
|
|
|
|
Width:
|
Height:
|
Size: 12 KiB
|
Width:
|
Height:
|
Size: 12 KiB
|
Width:
|
Height:
|
Size: 115 KiB
|
Width:
|
Height:
|
Size: 6.7 KiB
|