diff --git a/action_clip.py b/action_clip.py
index 938738f..409cd27 100644
--- a/action_clip.py
+++ b/action_clip.py
@@ -42,7 +42,7 @@ class ActionClip(NNOperator):
         topk: int = 5
     ):
         super().__init__(framework='pytorch')
-        self.device = 'cpu' # todo: self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.model_name = model_name
         self.skip_preprocess = skip_preprocess
         self.topk = topk
@@ -63,7 +63,8 @@ class ActionClip(NNOperator):
             clip_model=model_name,
             pretrained=True,
             jit=True,
-            checkpoints=checkpoints
+            checkpoints=checkpoints,
+            device=self.device
         )
 
         self.transform_cfgs = get_configs(
@@ -99,7 +100,7 @@ class ActionClip(NNOperator):
             **self.transform_cfgs
         )
         video = video.to(self.device)[None, ...].transpose(1, 2)
-        visual_features = self.encode_video(video)
+        visual_features = self.encode_video(video).float()
         features = visual_features.to('cpu').squeeze(0).detach().numpy()
 
         kinetic_classes = list(self.classmap.values())
@@ -109,6 +110,7 @@ class ActionClip(NNOperator):
         else:
             text_features = self.encode_text(kinetic_classes)
 
+        text_features = text_features.float().to(self.device)
         num_text_aug = int(text_features.size(0) / len(kinetic_classes))
         similarity = action_clip.get_similarity(text_features, visual_features, num_text_augs=num_text_aug)
         values_k, indices_k = similarity.topk(self.topk, dim=-1)
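
Note on the dtype/device changes above: a JIT-compiled CLIP model typically emits float16 features on CUDA, and the similarity computation needs both tensors in the same dtype on the same device, hence the `.float()` casts and the explicit `device=self.device`. Below is a minimal sketch of that alignment, not the patched code itself: `num_classes`, `num_text_aug`, and `dim` are illustrative assumptions, the random tensors stand in for `encode_text`/`encode_video` outputs, and the scoring shown is one plausible reading of `action_clip.get_similarity` (cosine scores, softmax over classes, mean over prompt-augmentation views), not its actual implementation.

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Illustrative sizes (assumptions): Kinetics-400 classes, 16 prompt
# templates, 512-dim CLIP embeddings. Real values come from the model.
num_classes, num_text_aug, dim = 400, 16, 512

# Stand-ins for encode_text / encode_video output; JIT CLIP on CUDA is fp16.
text_features = torch.randn(num_text_aug * num_classes, dim, dtype=torch.float16)
visual_features = torch.randn(1, dim, dtype=torch.float16)

# Mirror the patch: cast to float32 and co-locate both tensors on one device.
text_features = text_features.float().to(device)
visual_features = visual_features.float().to(device)

# Cosine similarity, softmax over classes, mean over the augmentation views
# (assumes an aug-major layout of the stacked text features).
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
visual_features = visual_features / visual_features.norm(dim=-1, keepdim=True)
logits = 100.0 * visual_features @ text_features.t()  # (1, num_text_aug * num_classes)
similarity = logits.view(1, num_text_aug, num_classes).softmax(dim=-1).mean(dim=1)

values_k, indices_k = similarity.topk(5, dim=-1)

Casting before the similarity matmul also guards the `else:` branch shown in the last hunk, where `text_features` may arrive from a different source than the on-the-fly encoder and so cannot be assumed to already match the visual features' dtype or device.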