|
|
@ -42,7 +42,7 @@ class ActionClip(NNOperator): |
|
|
|
topk: int = 5 |
|
|
|
): |
|
|
|
super().__init__(framework='pytorch') |
|
|
|
self.device = 'cpu' # todo: self.device = 'cuda' if torch.cuda.is_available() else 'cpu' |
|
|
|
self.device = 'cuda:2' if torch.cuda.is_available() else 'cpu' |
|
|
|
self.model_name = model_name |
|
|
|
self.skip_preprocess = skip_preprocess |
|
|
|
self.topk = topk |
|
|
@ -63,7 +63,8 @@ class ActionClip(NNOperator): |
|
|
|
clip_model=model_name, |
|
|
|
pretrained=True, |
|
|
|
jit=True, |
|
|
|
checkpoints=checkpoints |
|
|
|
checkpoints=checkpoints, |
|
|
|
device=self.device |
|
|
|
) |
|
|
|
|
|
|
|
self.transform_cfgs = get_configs( |
|
|
@ -99,7 +100,7 @@ class ActionClip(NNOperator): |
|
|
|
**self.transform_cfgs |
|
|
|
) |
|
|
|
video = video.to(self.device)[None, ...].transpose(1, 2) |
|
|
|
visual_features = self.encode_video(video) |
|
|
|
visual_features = self.encode_video(video).float() |
|
|
|
features = visual_features.to('cpu').squeeze(0).detach().numpy() |
|
|
|
|
|
|
|
kinetic_classes = list(self.classmap.values()) |
|
|
@ -109,6 +110,7 @@ class ActionClip(NNOperator): |
|
|
|
else: |
|
|
|
text_features = self.encode_text(kinetic_classes) |
|
|
|
|
|
|
|
text_features = text_features.float().to(self.device) |
|
|
|
num_text_aug = int(text_features.size(0) / len(kinetic_classes)) |
|
|
|
similarity = action_clip.get_similarity(text_features, visual_features, num_text_augs=num_text_aug) |
|
|
|
values_k, indices_k = similarity.topk(self.topk, dim=-1) |
|
|
|