diff --git a/README.md b/README.md
index df590ef..5e12456 100644
--- a/README.md
+++ b/README.md
@@ -47,8 +47,7 @@ towhee.dc(['kids feeding and playing with the horse']) \
 import towhee

 towhee.dc['path'](['./demo_video.mp4']) \
-    .video_decode.ffmpeg['path', 'frames'](sample_type='uniform_temporal_subsample', args={'num_samples': 4}) \
-    .runas_op['frames', 'frames'](func=lambda x: [y for y in x]) \
+    .video_decode.ffmpeg['path', 'frames']() \
     .video_text_embedding.bridge_former['frames', 'vec'](model_name='frozen_model', modality='video') \
     .select['path', 'vec']() \
     .show(formatter={'path': 'video_path'})
diff --git a/bridge_former.py b/bridge_former.py
index 14fcedb..ba99614 100644
--- a/bridge_former.py
+++ b/bridge_former.py
@@ -82,12 +82,12 @@ class BridgeFormer(NNOperator):
         return vec

     def _inference_from_text(self, text: List[str]):
-        text_data = self.tokenizer(text, return_tensors='pt')
-        text_data = text_data.to(self.device)
         if self.model_name == "clip_initialized_model":
+            text_data = self.tokenizer(text, return_tensors='pt', padding='max_length', max_length=77).to(self.device)
             text_features = self.model.encode_text(text_data["input_ids"])
         else:
+            text_data = self.tokenizer(text, return_tensors='pt').to(self.device)
             text_features = self.model.compute_text(text_data)

         return text_features.squeeze(0).detach().flatten().cpu().numpy()

diff --git a/result1.png b/result1.png
new file mode 100644
index 0000000..d083d3e
Binary files /dev/null and b/result1.png differ
diff --git a/result2.png b/result2.png
new file mode 100644
index 0000000..73d17f6
Binary files /dev/null and b/result2.png differ
diff --git a/result3.png b/result3.png
new file mode 100644
index 0000000..22afb2b
Binary files /dev/null and b/result3.png differ
diff --git a/result4.png b/result4.png
new file mode 100644
index 0000000..7a88200
Binary files /dev/null and b/result4.png differ