diff --git a/README.md b/README.md
index df590ef..5e12456 100644
--- a/README.md
+++ b/README.md
@@ -47,8 +47,7 @@ towhee.dc(['kids feeding and playing with the horse']) \
 import towhee

 towhee.dc['path'](['./demo_video.mp4']) \
-    .video_decode.ffmpeg['path', 'frames'](sample_type='uniform_temporal_subsample', args={'num_samples': 4}) \
-    .runas_op['frames', 'frames'](func=lambda x: [y for y in x]) \
+    .video_decode.ffmpeg['path', 'frames']() \
     .video_text_embedding.bridge_former['frames', 'vec'](model_name='frozen_model', modality='video') \
     .select['path', 'vec']() \
     .show(formatter={'path': 'video_path'})
diff --git a/bridge_former.py b/bridge_former.py
index 14fcedb..ba99614 100644
--- a/bridge_former.py
+++ b/bridge_former.py
@@ -82,12 +82,12 @@ class BridgeFormer(NNOperator):
         return vec

     def _inference_from_text(self, text: List[str]):
-        text_data = self.tokenizer(text, return_tensors='pt')
-        text_data = text_data.to(self.device)
         if self.model_name == "clip_initialized_model":
+            text_data = self.tokenizer(text, return_tensors='pt', padding='max_length', max_length=77).to(self.device)
             text_features = self.model.encode_text(text_data["input_ids"])
         else:
+            text_data = self.tokenizer(text, return_tensors='pt').to(self.device)
             text_features = self.model.compute_text(text_data)

         return text_features.squeeze(0).detach().flatten().cpu().numpy()

diff --git a/result1.png b/result1.png
new file mode 100644
index 0000000..d083d3e
Binary files /dev/null and b/result1.png differ
diff --git a/result2.png b/result2.png
new file mode 100644
index 0000000..73d17f6
Binary files /dev/null and b/result2.png differ
diff --git a/result3.png b/result3.png
new file mode 100644
index 0000000..22afb2b
Binary files /dev/null and b/result3.png differ
diff --git a/result4.png b/result4.png
new file mode 100644
index 0000000..7a88200
Binary files /dev/null and b/result4.png differ