diff --git a/action_clip.py b/action_clip.py
index 938738f..409cd27 100644
--- a/action_clip.py
+++ b/action_clip.py
@@ -42,7 +42,7 @@ class ActionClip(NNOperator):
         topk: int = 5
     ):
         super().__init__(framework='pytorch')
-        self.device = 'cpu' # todo: self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.model_name = model_name
         self.skip_preprocess = skip_preprocess
         self.topk = topk
@@ -63,7 +63,8 @@ class ActionClip(NNOperator):
             clip_model=model_name,
             pretrained=True,
             jit=True,
-            checkpoints=checkpoints
+            checkpoints=checkpoints,
+            device=self.device
         )
 
         self.transform_cfgs = get_configs(
@@ -99,7 +100,7 @@ class ActionClip(NNOperator):
             **self.transform_cfgs
         )
         video = video.to(self.device)[None, ...].transpose(1, 2)
-        visual_features = self.encode_video(video)
+        visual_features = self.encode_video(video).float()
         features = visual_features.to('cpu').squeeze(0).detach().numpy()
 
         kinetic_classes = list(self.classmap.values())
@@ -109,6 +110,7 @@ class ActionClip(NNOperator):
         else:
             text_features = self.encode_text(kinetic_classes)
 
+        text_features = text_features.float().to(self.device)
         num_text_aug = int(text_features.size(0) / len(kinetic_classes))
         similarity = action_clip.get_similarity(text_features, visual_features, num_text_augs=num_text_aug)
         values_k, indices_k = similarity.topk(self.topk, dim=-1)
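
Note on the dtype/device changes above: a JIT-compiled CLIP model typically emits float16 features on CUDA, and the similarity computation needs both tensors in the same dtype on the same device, hence the `.float()` casts and the explicit `device=self.device`. Below is a minimal sketch of that alignment, not the patched code itself: `num_classes`, `num_text_aug`, and `dim` are illustrative assumptions, the random tensors stand in for `encode_text`/`encode_video` outputs, and the scoring shown is one plausible reading of `action_clip.get_similarity` (cosine scores, softmax over classes, mean over prompt-augmentation views), not its actual implementation.

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Illustrative sizes (assumptions): Kinetics-400 classes, 16 prompt
# templates, 512-dim CLIP embeddings. Real values come from the model.
num_classes, num_text_aug, dim = 400, 16, 512

# Stand-ins for encode_text / encode_video output; JIT CLIP on CUDA is fp16.
text_features = torch.randn(num_text_aug * num_classes, dim, dtype=torch.float16)
visual_features = torch.randn(1, dim, dtype=torch.float16)

# Mirror the patch: cast to float32 and co-locate both tensors on one device.
text_features = text_features.float().to(device)
visual_features = visual_features.float().to(device)

# Cosine similarity, softmax over classes, mean over the augmentation views
# (assumes an aug-major layout of the stacked text features).
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
visual_features = visual_features / visual_features.norm(dim=-1, keepdim=True)
logits = 100.0 * visual_features @ text_features.t()  # (1, num_text_aug * num_classes)
similarity = logits.view(1, num_text_aug, num_classes).softmax(dim=-1).mean(dim=1)

values_k, indices_k = similarity.topk(5, dim=-1)

Casting before the similarity matmul also guards the `else:` branch shown in the last hunk, where `text_features` may arrive from a different source than the on-the-fly encoder and so cannot be assumed to already match the visual features' dtype or device.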