l

Signed-off-by: xujinling <jinling.xu@zilliz.com>
3 years ago · c051de2314
6 changed files with 3 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -47,8 +47,7 @@ towhee.dc(['kids feeding and playing with the horse']) \
 import towhee

 towhee.dc['path'](['./demo_video.mp4']) \
-      .video_decode.ffmpeg['path', 'frames'](sample_type='uniform_temporal_subsample', args={'num_samples': 4}) \
-      .runas_op['frames', 'frames'](func=lambda x: [y for y in x]) \
+      .video_decode.ffmpeg['path', 'frames']() \
      .video_text_embedding.bridge_former['frames', 'vec'](model_name='frozen_model', modality='video') \
      .select['path', 'vec']() \
      .show(formatter={'path': 'video_path'})
--- a/bridge_former.py
+++ b/bridge_former.py
@@ -82,12 +82,12 @@ class BridgeFormer(NNOperator):
        return vec

    def _inference_from_text(self, text: List[str]):
-        text_data = self.tokenizer(text, return_tensors='pt')

-        text_data = text_data.to(self.device)
        if self.model_name == "clip_initialized_model":
+            text_data = self.tokenizer(text, return_tensors='pt', padding='max_length', max_length=77).to(self.device)
            text_features = self.model.encode_text(text_data["input_ids"])
        else:
+            text_data = self.tokenizer(text, return_tensors='pt').to(self.device)
            text_features = self.model.compute_text(text_data)
        return text_features.squeeze(0).detach().flatten().cpu().numpy()

--- a/result1.png
+++ b/result1.png
--- a/result2.png
+++ b/result2.png
--- a/result3.png
+++ b/result3.png
--- a/result4.png
+++ b/result4.png