From fe3c1620cc4fb9ed0ee8f2d17bb38e8450fcc17a Mon Sep 17 00:00:00 2001
From: wxywb
Date: Thu, 13 Oct 2022 17:28:19 +0800
Subject: [PATCH] init the operator.

Signed-off-by: wxywb
---
 .gitignore                                    |    1 +
 __init__.py                                   |   18 +
 japanese_clip/__init__.py                     |   19 +
 japanese_clip/auto_model.py                   |   95 ++
 japanese_clip/clip/__init__.py                |   16 +
 japanese_clip/clip/configuration_clip.py      |  219 ++++
 japanese_clip/clip/modeling_clip.py           |  815 +++++++++++++
 japanese_clip/cloob/__init__.py               |   16 +
 japanese_clip/cloob/configuration_cloob.py    |  203 ++++
 japanese_clip/cloob/loss.py                   |   58 +
 japanese_clip/cloob/modeling_cloob.py         |  783 +++++++++++++
 japanese_clip/tokenizer.py                    |   63 +
 japanese_clip/utils/__init__.py               |    0
 japanese_clip/utils/callbacks.py              |   96 ++
 japanese_clip/utils/imagenet_zeroshot_data.py | 1043 +++++++++++++++++
 .../utils/imagenet_zeroshot_data_en.py        |  248 ++++
 japanese_clip/version.py                      |   16 +
 jclip.py                                      |   79 ++
 requirements.txt                              |    0
 19 files changed, 3788 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 __init__.py
 create mode 100644 japanese_clip/__init__.py
 create mode 100644 japanese_clip/auto_model.py
 create mode 100644 japanese_clip/clip/__init__.py
 create mode 100644 japanese_clip/clip/configuration_clip.py
 create mode 100644 japanese_clip/clip/modeling_clip.py
 create mode 100644 japanese_clip/cloob/__init__.py
 create mode 100644 japanese_clip/cloob/configuration_cloob.py
 create mode 100644 japanese_clip/cloob/loss.py
 create mode 100644 japanese_clip/cloob/modeling_cloob.py
 create mode 100644 japanese_clip/tokenizer.py
 create mode 100644 japanese_clip/utils/__init__.py
 create mode 100644 japanese_clip/utils/callbacks.py
 create mode 100644 japanese_clip/utils/imagenet_zeroshot_data.py
 create mode 100644 japanese_clip/utils/imagenet_zeroshot_data_en.py
 create mode 100644 japanese_clip/version.py
 create mode 100644 jclip.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..9d2895f
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .jclip import Jaclip
+
+def jclip(model_name: str, modality: str):
+    return Jaclip(model_name, modality)
diff --git a/japanese_clip/__init__.py b/japanese_clip/__init__.py
new file mode 100644
index 0000000..5b60b75
--- /dev/null
+++ b/japanese_clip/__init__.py
@@ -0,0 +1,19 @@
+# coding=utf-8
+# Copyright 2022 rinna Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .clip import CLIPModel, CLIPConfig
+from .cloob import CLOOBModel, CLOOBConfig
+from .auto_model import load, available_models
+from .tokenizer import load_tokenizer, tokenize
diff --git a/japanese_clip/auto_model.py b/japanese_clip/auto_model.py
new file mode 100644
index 0000000..eddfd44
--- /dev/null
+++ b/japanese_clip/auto_model.py
@@ -0,0 +1,95 @@
+# coding=utf-8
+# Copyright 2022 rinna Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union
+import json
+import torch
+from torchvision import transforms as T
+from huggingface_hub import hf_hub_url, cached_download
+import os
+
+from .clip import CLIPModel
+from .cloob import CLOOBModel
+
+# TODO: Fill in repo_ids
+MODELS = {
+    'rinna/japanese-clip-vit-b-16': {
+        'repo_id': 'rinna/japanese-clip-vit-b-16',
+        'model_class': CLIPModel,
+    },
+    'rinna/japanese-cloob-vit-b-16': {
+        'repo_id': 'rinna/japanese-cloob-vit-b-16',
+        'model_class': CLOOBModel,
+    }
+}
+MODEL_CLASSES = {
+    "cloob": CLOOBModel,
+    "clip": CLIPModel,
+}
+MODEL_FILE = "pytorch_model.bin"
+CONFIG_FILE = "config.json"
+
+
+def available_models():
+    return list(MODELS.keys())
+
+
+def _convert_to_rgb(image):
+    return image.convert('RGB')
+
+
+def _transform(image_size):
+    return T.Compose([
+        T.Resize(image_size, interpolation=T.InterpolationMode.BILINEAR),
+        T.CenterCrop(image_size),
+        _convert_to_rgb,
+        T.ToTensor(),
+        T.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711),)
+    ])
+
+
+def _download(repo_id: str, cache_dir: str):
+    config_file_url = hf_hub_url(repo_id=repo_id, filename=CONFIG_FILE)
+    cached_download(config_file_url, cache_dir=cache_dir)
+    model_file_url = hf_hub_url(repo_id=repo_id, filename=MODEL_FILE)
+    cached_download(model_file_url, cache_dir=cache_dir)
+
+
+def load(
+    model_name: str,
+    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
+    **kwargs
+):
+    """
+    Args:
+        model_name: model unique name or path to pre-downloaded model
+        device: device to put the loaded model
+        kwargs: kwargs for huggingface pretrained model class
+    Return:
+        (torch.nn.Module, A torchvision transform)
+    """
+    if model_name in MODELS.keys():
+        ModelClass = CLIPModel if 'clip' in model_name else CLOOBModel
+    elif os.path.exists(model_name):
+        assert os.path.exists(os.path.join(model_name, CONFIG_FILE))
+        with open(os.path.join(model_name, CONFIG_FILE), "r", encoding="utf-8") as f:
+            j = json.load(f)
+        ModelClass = MODEL_CLASSES[j["model_type"]]
+    else:
+        raise RuntimeError(f"Model {model_name} not found; available models = {available_models()}")
+
+    model = ModelClass.from_pretrained(model_name, **kwargs)
+    model = model.eval().requires_grad_(False).to(device)
+    return model, _transform(model.config.vision_config.image_size)
diff --git a/japanese_clip/clip/__init__.py b/japanese_clip/clip/__init__.py
new file mode 100644
index 0000000..c377d55
--- /dev/null
+++
b/japanese_clip/clip/__init__.py @@ -0,0 +1,16 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .modeling_clip import * +from .configuration_clip import * diff --git a/japanese_clip/clip/configuration_clip.py b/japanese_clip/clip/configuration_clip.py new file mode 100644 index 0000000..3e5f071 --- /dev/null +++ b/japanese_clip/clip/configuration_clip.py @@ -0,0 +1,219 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CLIP model configuration""" +import logging +import copy +import os +from typing import Union + +import numpy as np +from transformers import AutoConfig, PretrainedConfig + + +logger = logging.getLogger(__name__) + + +class CLIPTextConfig(PretrainedConfig): + model_type = "clip_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPVisionConfig(PretrainedConfig): + model_type = "clip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPConfig(PretrainedConfig): + r""" + [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate + CLIP model according to the specified arguments, defining the text model and vision model configs. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config_dict (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPTextConfig`]. + vision_config_dict (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation. + kwargs (*optional*): + Dictionary of keyword arguments. 
+ """ + + model_type = "clip" + is_composition = True + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + logit_scale_init_value=None, + **kwargs + ): + super().__init__(text_config=text_config, vision_config=vision_config, **kwargs) + + if vision_config is None: + raise ValueError("`vision_config` can not be `None`.") + + if text_config is None: + raise ValueError("`text_config` can not be `None`.") + + vision_model_type = vision_config.pop("model_type") + text_model_type = text_config.pop("model_type") + + if vision_model_type == "clip_vision_model": + self.vision_config = CLIPVisionConfig(**vision_config) + else: + self.vision_config = AutoConfig.for_model( + vision_model_type, **vision_config + ) + + if text_model_type == "clip_text_model": + self.text_config = CLIPTextConfig(**text_config) + else: + self.text_config = AutoConfig.for_model( + text_model_type, **text_config + ) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value if logit_scale_init_value is not None else np.log(1 / 0.07) + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model + configuration. + + Returns: + [`CLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/japanese_clip/clip/modeling_clip.py b/japanese_clip/clip/modeling_clip.py new file mode 100644 index 0000000..16a713f --- /dev/null +++ b/japanese_clip/clip/modeling_clip.py @@ -0,0 +1,815 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers import AutoModel +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel, ModelOutput +from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig + + +logger = logging.getLogger(__name__) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +def clip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.T) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class CLIPOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class CLIPVisionEmbeddings(nn.Module): + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class CLIPTextEmbeddings(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = 
nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class CLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if 
attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class CLIPMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class CLIPEncoderLayer(nn.Module): + def __init__(self, config: CLIPConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = CLIPMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class CLIPPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CLIPConfig + base_model_prefix = "clip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLIPTextEmbeddings): + module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, CLIPVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, CLIPAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, CLIPMLP): + factor = self.config.initializer_factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, CLIPModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLIPEncoder): + module.gradient_checkpointing = value + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLIPEncoderLayer`]. 
+ Args: + config: CLIPConfig + """ + + def __init__(self, config: CLIPConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class CLIPTextTransformer(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLIPTextEmbeddings(config) + self.encoder = CLIPEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + bsz, seq_len = input_shape + # CLIP's text model uses causal mask, prepare it here. 
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len).to(hidden_states.device) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(bsz, seq_len, seq_len) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +class CLIPTextModel(CLIPPreTrainedModel): + config_class = CLIPTextConfig + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + self.text_model = CLIPTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPVisionTransformer(nn.Module): + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.encoder = CLIPEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None 
else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class CLIPVisionModel(CLIPPreTrainedModel): + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPVisionConfig): + super().__init__(config) + self.vision_model = CLIPVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPModel(CLIPPreTrainedModel): + config_class = CLIPConfig + + def __init__(self, config: CLIPConfig): + super().__init__(config) + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + if isinstance(text_config, CLIPTextConfig): + text_model = CLIPTextTransformer(text_config) + else: + text_model = AutoModel.from_config(config.text_config, add_pooling_layer=False) + + if isinstance(config.vision_config, CLIPVisionConfig): + vision_model = CLIPVisionModel(config.vision_config) + else: + vision_model = AutoModel.from_config(config.vision_config, add_pooling_layer=False) + + self.text_model = text_model + self.vision_model = vision_model + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value) + + # Initialize weights and apply final processing + self.post_init() + + def encode_text(self, input_ids, **kwargs): + return self.get_text_features(input_ids=input_ids, **kwargs) + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = text_outputs.last_hidden_state[:, 0, :] + text_features = self.text_projection(pooled_output) + + return text_features + + def encode_image(self, pixel_values, **kwargs): + return self.get_image_features(pixel_values=pixel_values, **kwargs) + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = vision_outputs.last_hidden_state[:, 0, :] + image_features = self.visual_projection(pooled_output) + + return image_features + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPOutput]: + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs.last_hidden_state[:, 0, :] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs.last_hidden_state[:, 0, :] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.T + + loss = None + if return_loss: + loss = clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return CLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + diff --git a/japanese_clip/cloob/__init__.py b/japanese_clip/cloob/__init__.py new file mode 100644 index 0000000..5266914 --- /dev/null +++ b/japanese_clip/cloob/__init__.py @@ -0,0 +1,16 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .configuration_cloob import * +from .modeling_cloob import * diff --git a/japanese_clip/cloob/configuration_cloob.py b/japanese_clip/cloob/configuration_cloob.py new file mode 100644 index 0000000..215015c --- /dev/null +++ b/japanese_clip/cloob/configuration_cloob.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" CLOOB model configuration""" +import logging +import copy +import os +from typing import Union + +from transformers import AutoConfig, PretrainedConfig + + +logger = logging.getLogger(__name__) + + +class CLOOBTextConfig(PretrainedConfig): + model_type = "cloob_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLOOBVisionConfig(PretrainedConfig): + model_type = "cloob_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLOOBConfig(PretrainedConfig): + model_type = "cloob" + is_composition = True + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + init_inv_tau=30.0, + scale_hopfield=15.0, + **kwargs + ): + super().__init__(text_config=text_config, vision_config=vision_config, **kwargs) + + if vision_config is None: + raise ValueError("`vision_config` can not be `None`.") + + if text_config is None: + raise ValueError("`text_config` can not be `None`.") + + vision_model_type = vision_config.pop("model_type") + text_model_type = text_config.pop("model_type") + + if vision_model_type == "cloob_vision_model": + self.vision_config = CLOOBVisionConfig(**vision_config) + else: + self.vision_config = AutoConfig.for_model( + vision_model_type, **vision_config + ) + + if text_model_type == "cloob_text_model": + self.text_config = CLOOBTextConfig(**text_config) + else: + self.text_config = AutoConfig.for_model( + text_model_type, **text_config + ) + + self.projection_dim = projection_dim + self.initializer_factor = 1.0 + self.init_inv_tau = init_inv_tau + self.scale_hopfield = scale_hopfield + + + @classmethod + def from_text_vision_configs(cls, text_config: CLOOBTextConfig, vision_config: CLOOBVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model + configuration. + + Returns: + [`CLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output + + diff --git a/japanese_clip/cloob/loss.py b/japanese_clip/cloob/loss.py new file mode 100644 index 0000000..5050309 --- /dev/null +++ b/japanese_clip/cloob/loss.py @@ -0,0 +1,58 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F + + +def cloob_loss(image_features, text_features, inv_tau, scale_hopfield): + """ + Note: this loss has been rescaled from the original CLOOB loss for interpretability, + to convert to the original, divide it by inv_tau / 2. 
+ """ + p_xx, p_yy, p_xy, p_yx = hopfield_retrieval(image_features, text_features, scale_hopfield) + identity = torch.eye(p_xx.shape[1]) > 0.5 + i = identity.to(p_xx.device) + loss_img = infoLOOB_loss(p_xx.T, p_xy.T, i, inv_tau=inv_tau) + loss_txt = infoLOOB_loss(p_yy.T, p_yx.T, i, inv_tau=inv_tau) + return (loss_img + loss_txt) / 2 + + +def infoLOOB_loss(x, y, i, inv_tau): + tau = 1 / inv_tau + k = x @ y.T / tau + positives = -torch.mean(torch.sum(k * i, dim=1)) + + # For logsumexp the zero entries must be equal to a very large negative number + large_neg = -10000.0 + arg_lse = k * torch.logical_not(i) + i * large_neg + negatives = torch.mean(torch.logsumexp(arg_lse, dim=1)) + return positives + negatives + + +def hopfield_retrieval(image_features, text_features, scale_hopfield): + patterns_xx = hopfield(state_patterns=image_features, stored_patterns=image_features, scale_hopfield=scale_hopfield) + patterns_yy = hopfield(state_patterns=text_features, stored_patterns=text_features, scale_hopfield=scale_hopfield) + patterns_xy = hopfield(state_patterns=text_features, stored_patterns=image_features, scale_hopfield=scale_hopfield) + patterns_yx = hopfield(state_patterns=image_features, stored_patterns=text_features, scale_hopfield=scale_hopfield) + + return patterns_xx, patterns_yy, patterns_xy, patterns_yx + + +def hopfield(state_patterns, stored_patterns, scale_hopfield): + retrieved_patterns = stored_patterns.T @ F.softmax(scale_hopfield * stored_patterns @ state_patterns.T, dim=0) + # Row vectors -> dim=1 to normalize the row vectors + retrieved_patterns = retrieved_patterns / retrieved_patterns.norm(dim=0, keepdim=True) + return retrieved_patterns diff --git a/japanese_clip/cloob/modeling_cloob.py b/japanese_clip/cloob/modeling_cloob.py new file mode 100644 index 0000000..563f4c5 --- /dev/null +++ b/japanese_clip/cloob/modeling_cloob.py @@ -0,0 +1,783 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers import AutoModel +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel, ModelOutput +from .configuration_cloob import CLOOBConfig, CLOOBTextConfig, CLOOBVisionConfig +from .loss import cloob_loss +from ..clip.modeling_clip import _expand_mask + +logger = logging.getLogger(__name__) + + +@dataclass +class CLOOBOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + inv_tau: Union[torch.FloatTensor, float] = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class CLOOBVisionEmbeddings(nn.Module): + def __init__(self, config: CLOOBVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class CLOOBTextEmbeddings(nn.Module): + def __init__(self, config: CLOOBTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class CLOOBAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, 
config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class CLOOBMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class CLOOBEncoderLayer(nn.Module): + def __init__(self, config: CLOOBConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLOOBAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = CLOOBMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class CLOOBPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = CLOOBConfig + base_model_prefix = "cloob" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLOOBTextEmbeddings): + module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, CLOOBVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, CLOOBAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, CLOOBMLP): + factor = self.config.initializer_factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, CLOOBModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLOOBEncoder): + module.gradient_checkpointing = value + + +class CLOOBEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLOOBEncoderLayer`]. + Args: + config: CLOOBConfig + """ + + def __init__(self, config: CLOOBConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([CLOOBEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. 
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class CLOOBTextTransformer(nn.Module): + def __init__(self, config: CLOOBTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLOOBTextEmbeddings(config) + self.encoder = CLOOBEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not 
None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + bsz, seq_len = input_shape + # CLOOB's text model uses causal mask, prepare it here. + # https://github.com/openai/CLOOB/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/CLOOB/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len).to(hidden_states.device) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(bsz, seq_len, seq_len) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +class CLOOBTextModel(CLOOBPreTrainedModel): + config_class = CLOOBTextConfig + + def __init__(self, config: CLOOBTextConfig): + super().__init__(config) + self.text_model = CLOOBTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLOOBVisionTransformer(nn.Module): + def __init__(self, config: CLOOBVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLOOBVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.encoder = CLOOBEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + 
output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class CLOOBVisionModel(CLOOBPreTrainedModel): + config_class = CLOOBVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLOOBVisionConfig): + super().__init__(config) + self.vision_model = CLOOBVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLOOBModel(CLOOBPreTrainedModel): + config_class = CLOOBConfig + + def __init__(self, config: CLOOBConfig): + super().__init__(config) + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + if isinstance(text_config, CLOOBTextConfig): + text_model = CLOOBTextTransformer(text_config) + else: + text_model = AutoModel.from_config(config.text_config, add_pooling_layer=False) + + if isinstance(config.vision_config, CLOOBVisionConfig): + vision_model = CLOOBVisionModel(config.vision_config) + else: + vision_model = AutoModel.from_config(config.vision_config, add_pooling_layer=False) + + self.text_model = text_model + self.vision_model = vision_model + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + + self.inv_tau = config.init_inv_tau + self.scale_hopfield = config.scale_hopfield + + # Initialize weights and apply final processing + self.post_init() + + def encode_text(self, input_ids, **kwargs): + return self.get_text_features(input_ids=input_ids, **kwargs) + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + 
attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + # Use CLOOB model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = text_outputs.last_hidden_state[:, 0, :] + text_features = self.text_projection(pooled_output) + + return text_features + + def encode_image(self, pixel_values, **kwargs): + return self.get_image_features(pixel_values=pixel_values, **kwargs) + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + # Use CLOOB model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = vision_outputs.last_hidden_state[:, 0, :] + image_features = self.visual_projection(pooled_output) + + return image_features + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLOOBOutput]: + # Use CLOOB model's config for some fields (if specified) instead of those of vision & text components. 
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        image_embeds = vision_outputs.last_hidden_state[:, 0, :]
+        image_embeds = self.visual_projection(image_embeds)
+
+        text_embeds = text_outputs.last_hidden_state[:, 0, :]
+        text_embeds = self.text_projection(text_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
+
+        loss = None
+        if return_loss:
+            loss = cloob_loss(image_embeds, text_embeds, self.inv_tau, self.scale_hopfield)
+
+        if not return_dict:
+            output = (text_embeds, image_embeds, self.inv_tau, text_outputs, vision_outputs)
+            return ((loss,) + output) if loss is not None else output
+
+        return CLOOBOutput(
+            loss=loss,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            inv_tau=self.inv_tau,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
diff --git a/japanese_clip/tokenizer.py b/japanese_clip/tokenizer.py
new file mode 100644
index 0000000..67209d6
--- /dev/null
+++ b/japanese_clip/tokenizer.py
@@ -0,0 +1,63 @@
+# coding=utf-8
+# Copyright 2022 rinna Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union, List
+import torch
+from transformers import T5Tokenizer
+
+
+def load_tokenizer():
+    """
+    https://huggingface.co/rinna/japanese-roberta-base
+    """
+    tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-roberta-base")
+    tokenizer.do_lower_case = True  # due to a bug in tokenizer config loading
+    return tokenizer
+
+
+def tokenize(
+    texts: Union[str, List[str]],
+    tokenizer: T5Tokenizer = None,
+    max_seq_len: int = 77,
+    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
+):
+    """
+    This mirrors the tokenize function in the original CLIP code:
+ https://github.com/openai/CLIP/blob/main/clip/clip.py#L195 + """ + if isinstance(texts, str): + texts = [texts] + if tokenizer is None: + tokenizer = load_tokenizer() + inputs = tokenizer( + texts, + max_length=max_seq_len-1, + padding="max_length", + truncation=True, + add_special_tokens=False, + ) + # add cls token at first place + input_ids = [[tokenizer.cls_token_id] + ids for ids in inputs['input_ids']] + attention_mask = [[1] + am for am in inputs['attention_mask']] + position_ids = [list(range(0, len(input_ids[0])))] * len(texts) + + input_ids = torch.tensor(input_ids, dtype=torch.long) + attention_mask = torch.tensor(attention_mask, dtype=torch.long) + position_ids = torch.tensor(position_ids, dtype=torch.long) + return { + "input_ids": input_ids.to(device), + "attention_mask": attention_mask.to(device), + "position_ids": position_ids.to(device), + } diff --git a/japanese_clip/utils/__init__.py b/japanese_clip/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/japanese_clip/utils/callbacks.py b/japanese_clip/utils/callbacks.py new file mode 100644 index 0000000..06ddcf5 --- /dev/null +++ b/japanese_clip/utils/callbacks.py @@ -0,0 +1,96 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
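+# Helpers for ImageNet zero-shot evaluation: class embeddings are built by averaging
+# prompt-template text features per class, and image features are scored against them
+# to report top-k accuracy.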
+ +from tqdm.auto import tqdm +import numpy as np +import torch + + +def accuracy(output, target, topk=(1,)): + output = torch.from_numpy(np.asarray(output)) + target = torch.from_numpy(np.asarray(target)) + pred = output.topk(max(topk), dim=1, largest=True, sorted=True)[1].t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + return [ + float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) + for k in topk + ] + + +class ImagenetClassificationCallback: + def __init__( + self, + imagenet_classes, + imagenet_templates, + imagenet_dataloader, + ): + self.imagenet_classes = imagenet_classes + self.imagenet_templates = imagenet_templates + self.imagenet_dataloader = imagenet_dataloader + + def tokenize(self, tokenizer, examples, device): + encoding_inputs = tokenizer(examples, max_length=76, padding="max_length", truncation=True, add_special_tokens=False) + # add cls token at first place + input_ids = [[tokenizer.cls_token_id] + ids for ids in encoding_inputs['input_ids']] + attention_mask = [[1] + am for am in encoding_inputs['attention_mask']] + position_ids = [list(range(0, len(input_ids[0])))] * len(examples) + + input_ids = torch.tensor(input_ids, dtype=torch.long, device=device) + attention_mask = torch.tensor(attention_mask, dtype=torch.long, device=device) + position_ids = torch.tensor(position_ids, dtype=torch.long, device=device) + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + } + + def zeroshot_classifier(self, model, tokenizer, classnames, templates): + zeroshot_weights = [] + for classname in tqdm(classnames): + texts = [template.format(classname) for template in templates] + class_embeddings = model.get_text_features(**self.tokenize(tokenizer, texts, model.device)).detach().cpu().numpy() + class_embeddings = class_embeddings / np.linalg.norm( + class_embeddings, axis=-1, keepdims=True + ) + class_embedding = np.mean(class_embeddings, axis=0) + class_embedding /= np.linalg.norm(class_embedding, axis=-1) + zeroshot_weights.append(class_embedding) + zeroshot_weights = np.stack(zeroshot_weights, axis=1) + return zeroshot_weights + + def zeroshot(self, model, tokenizer) -> dict: + print("Imagenet Zeroshot Classification...") + zeroshot_weights = self.zeroshot_classifier(model, tokenizer, self.imagenet_classes, self.imagenet_templates) + top_ns = [1, 5, 10, 100] + acc_counters = [0.0 for _ in top_ns] + n = 0.0 + + for i, (images, target) in enumerate(tqdm(self.imagenet_dataloader)): + target = target.numpy() + # predict + image_features = model.get_image_features(images.to(model.device)).detach().cpu().numpy() + image_features = image_features / np.linalg.norm(image_features, axis=-1, keepdims=True) + logits = 100.0 * image_features @ zeroshot_weights + + # measure accuracy + accs = accuracy(logits, target, topk=top_ns) + for j in range(len(top_ns)): + acc_counters[j] += accs[j] + n += images.shape[0] + + tops = {f"imagenet/top{top_ns[i]}": acc_counters[i] / n * 100 for i in range(len(top_ns))} + + return tops + diff --git a/japanese_clip/utils/imagenet_zeroshot_data.py b/japanese_clip/utils/imagenet_zeroshot_data.py new file mode 100644 index 0000000..5533a1a --- /dev/null +++ b/japanese_clip/utils/imagenet_zeroshot_data.py @@ -0,0 +1,1043 @@ +imagenet_classnames = [{'en': 'tench', 'ja': 'テンチ'}, + {'en': 'goldfish', 'ja': '金魚'}, + {'en': 'great white shark', 'ja': 'ホホジロザメ'}, + {'en': 'tiger shark', 'ja': 'イタチザメ'}, + {'en': 'hammerhead shark', 'ja': 'ハンマーヘッド'}, + {'en': 'electric ray', 'ja': 'シビレエイ'}, 
+ {'en': 'stingray', 'ja': 'アカエイ'}, + {'en': 'rooster', 'ja': 'コック'}, + {'en': 'hen', 'ja': 'めんどり'}, + {'en': 'ostrich', 'ja': 'ダチョウ'}, + {'en': 'brambling', 'ja': 'アトリ'}, + {'en': 'goldfinch', 'ja': 'ゴシキヒワ'}, + {'en': 'house finch', 'ja': 'ハウスフィンチ'}, + {'en': 'junco', 'ja': 'ユキヒメドリ'}, + {'en': 'indigo bunting', 'ja': 'インディゴホオジロ'}, + {'en': 'American robin', 'ja': 'ロビン'}, + {'en': 'bulbul', 'ja': 'ブルブル'}, + {'en': 'jay', 'ja': 'カケス'}, + {'en': 'magpie', 'ja': 'カササギ'}, + {'en': 'chickadee', 'ja': '四十雀'}, + {'en': 'American dipper', 'ja': '水クロウタドリ'}, + {'en': 'kite (bird of prey)', 'ja': '凧'}, + {'en': 'bald eagle', 'ja': '白頭ワシ'}, + {'en': 'vulture', 'ja': 'ハゲワシ'}, + {'en': 'great grey owl', 'ja': 'カラフトフクロウ'}, + {'en': 'fire salamander', 'ja': '欧州ファイアサラマンダー'}, + {'en': 'smooth newt', 'ja': '共通イモリ'}, + {'en': 'newt', 'ja': 'イモリ'}, + {'en': 'spotted salamander', 'ja': 'サンショウウオを発見'}, + {'en': 'axolotl', 'ja': 'アホロートル'}, + {'en': 'American bullfrog', 'ja': 'ウシガエル'}, + {'en': 'tree frog', 'ja': 'アマガエル'}, + {'en': 'tailed frog', 'ja': 'つかれたカエル'}, + {'en': 'loggerhead sea turtle', 'ja': 'とんちき'}, + {'en': 'leatherback sea turtle', 'ja': 'オサガメ'}, + {'en': 'mud turtle', 'ja': '鼈'}, + {'en': 'terrapin', 'ja': 'テラピン'}, + {'en': 'box turtle', 'ja': 'ハコガメ'}, + {'en': 'banded gecko', 'ja': '縞模様のヤモリ'}, + {'en': 'green iguana', 'ja': '共通イグアナ'}, + {'en': 'Carolina anole', 'ja': 'アメリカンカメレオン'}, + {'en': 'desert grassland whiptail lizard', 'ja': 'ウィッペイル'}, + {'en': 'agama', 'ja': 'アガマトカゲ'}, + {'en': 'frilled-necked lizard', 'ja': 'フリルトカゲ'}, + {'en': 'alligator lizard', 'ja': 'アリゲータートカゲ'}, + {'en': 'Gila monster', 'ja': 'アメリカドクトカゲ'}, + {'en': 'European green lizard', 'ja': '緑のトカゲ'}, + {'en': 'chameleon', 'ja': 'アフリカのカメレオン'}, + {'en': 'Komodo dragon', 'ja': 'コモドドラゴン'}, + {'en': 'Nile crocodile', 'ja': 'アフリカのワニ'}, + {'en': 'American alligator', 'ja': 'アメリカワニ'}, + {'en': 'triceratops', 'ja': 'トリケラトプス'}, + {'en': 'worm snake', 'ja': '雷のヘビ'}, + {'en': 'ring-necked snake', 'ja': 'リングネックスネーク'}, + {'en': 'eastern hog-nosed snake', 'ja': 'ホーノースヘビ'}, + {'en': 'smooth green snake', 'ja': '緑のヘビ'}, + {'en': 'kingsnake', 'ja': 'キングスネーク'}, + {'en': 'garter snake', 'ja': 'ガータースネーク'}, + {'en': 'water snake', 'ja': '水蛇'}, + {'en': 'vine snake', 'ja': 'つるヘビ'}, + {'en': 'night snake', 'ja': '夜のヘビ'}, + {'en': 'boa constrictor', 'ja': 'ボア・コンストリクター'}, + {'en': 'African rock python', 'ja': 'ロックパイソン'}, + {'en': 'Indian cobra', 'ja': 'インドコブラ'}, + {'en': 'green mamba', 'ja': 'グリーンマンバ'}, + {'en': 'sea snake', 'ja': 'ウミヘビ'}, + {'en': 'Saharan horned viper', 'ja': 'ツノクサリヘビ'}, + {'en': 'eastern diamondback rattlesnake', 'ja': 'ダイヤ'}, + {'en': 'sidewinder rattlesnake', 'ja': 'サイドワインダー'}, + {'en': 'trilobite', 'ja': '三葉虫'}, + {'en': 'harvestman', 'ja': '刈り入れ作業者'}, + {'en': 'scorpion', 'ja': 'サソリ'}, + {'en': 'yellow garden spider', 'ja': '黒と金の庭クモ'}, + {'en': 'barn spider', 'ja': '納屋クモ'}, + {'en': 'European garden spider', 'ja': '庭クモ'}, + {'en': 'southern black widow', 'ja': 'クロゴケグモ'}, + {'en': 'tarantula', 'ja': 'タランチュラ'}, + {'en': 'wolf spider', 'ja': 'オオカミのクモ'}, + {'en': 'tick', 'ja': 'ダニ'}, + {'en': 'centipede', 'ja': '百足'}, + {'en': 'black grouse', 'ja': 'クロライチョウ'}, + {'en': 'ptarmigan', 'ja': '雷鳥'}, + {'en': 'ruffed grouse', 'ja': 'ひだえりの付いたライチョウ'}, + {'en': 'prairie grouse', 'ja': '草原チキン'}, + {'en': 'peafowl', 'ja': '孔雀'}, + {'en': 'quail', 'ja': 'ウズラ'}, + {'en': 'partridge', 'ja': 'ヤマウズラ'}, + {'en': 'african grey parrot', 'ja': 'アフリカの灰色'}, + {'en': 'macaw', 'ja': 'コンゴウインコ'}, + {'en': 'sulphur-crested cockatoo', 'ja': '硫黄トキオウム'}, + {'en': 
'lorikeet', 'ja': 'インコ'}, + {'en': 'coucal', 'ja': 'バンケン'}, + {'en': 'bee eater', 'ja': '蜂食べる人'}, + {'en': 'hornbill', 'ja': 'サイチョウ'}, + {'en': 'hummingbird', 'ja': 'ハチドリ'}, + {'en': 'jacamar', 'ja': '錐嘴'}, + {'en': 'toucan', 'ja': 'オオハシ'}, + {'en': 'duck', 'ja': 'ドレイク'}, + {'en': 'red-breasted merganser', 'ja': '赤ブレストアイサ属のガモ'}, + {'en': 'goose', 'ja': 'ガチョウ'}, + {'en': 'black swan', 'ja': '黒い白鳥'}, + {'en': 'tusker', 'ja': 'タスカービール'}, + {'en': 'echidna', 'ja': 'ハリモグラ'}, + {'en': 'platypus', 'ja': 'カモノハシ'}, + {'en': 'wallaby', 'ja': 'ワラビー'}, + {'en': 'koala', 'ja': 'コアラ'}, + {'en': 'wombat', 'ja': 'ウォンバット'}, + {'en': 'jellyfish', 'ja': 'クラゲ'}, + {'en': 'sea anemone', 'ja': 'イソギンチャク'}, + {'en': 'brain coral', 'ja': '脳サンゴ'}, + {'en': 'flatworm', 'ja': '扁形動物'}, + {'en': 'nematode', 'ja': '線虫'}, + {'en': 'conch', 'ja': '巻き貝'}, + {'en': 'snail', 'ja': 'カタツムリ'}, + {'en': 'slug', 'ja': 'ナメクジ'}, + {'en': 'sea slug', 'ja': 'ウミウシ'}, + {'en': 'chiton', 'ja': 'キトン'}, + {'en': 'chambered nautilus', 'ja': 'オウムガイ'}, + {'en': 'Dungeness crab', 'ja': 'アメリカイチョウガニ'}, + {'en': 'rock crab', 'ja': '岩カニ'}, + {'en': 'fiddler crab', 'ja': 'シオマネキ'}, + {'en': 'red king crab', 'ja': 'タラバガニ'}, + {'en': 'American lobster', 'ja': 'アメリカンロブスター'}, + {'en': 'spiny lobster', 'ja': '伊勢エビ'}, + {'en': 'crayfish', 'ja': 'ザリガニ'}, + {'en': 'hermit crab', 'ja': 'ヤドカリ'}, + {'en': 'isopod', 'ja': '等脚類'}, + {'en': 'white stork', 'ja': 'コウノトリ'}, + {'en': 'black stork', 'ja': 'ナベコウ'}, + {'en': 'spoonbill', 'ja': 'ヘラサギ'}, + {'en': 'flamingo', 'ja': 'フラミンゴ'}, + {'en': 'little blue heron', 'ja': '小さな青いサギ'}, + {'en': 'great egret', 'ja': 'アメリカン白鷺'}, + {'en': 'bittern bird', 'ja': 'にがり'}, + {'en': 'crane bird', 'ja': 'クレーン'}, + {'en': 'limpkin', 'ja': 'ツルモドキ科の鳥'}, + {'en': 'common gallinule', 'ja': 'ヨーロピアン水鳥'}, + {'en': 'American coot', 'ja': 'アメリカオオバン'}, + {'en': 'bustard', 'ja': 'ノガン'}, + {'en': 'ruddy turnstone', 'ja': 'キョウジョシギ'}, + {'en': 'dunlin', 'ja': '赤担保シギ'}, + {'en': 'common redshank', 'ja': 'アカアシシギ'}, + {'en': 'dowitcher', 'ja': 'オオハシシギ'}, + {'en': 'oystercatcher', 'ja': 'ミヤコドリ'}, + {'en': 'pelican', 'ja': 'ペリカン'}, + {'en': 'king penguin', 'ja': 'キングペンギン'}, + {'en': 'albatross', 'ja': 'アルバトロス'}, + {'en': 'grey whale', 'ja': 'コククジラ'}, + {'en': 'killer whale', 'ja': 'シャチ'}, + {'en': 'dugong', 'ja': 'ジュゴン'}, + {'en': 'sea lion', 'ja': 'アシカ'}, + {'en': 'Chihuahua', 'ja': 'チワワ'}, + {'en': 'Japanese Chin', 'ja': '狆'}, + {'en': 'Maltese', 'ja': 'マルチーズ犬'}, + {'en': 'Pekingese', 'ja': '狆'}, + {'en': 'Shih Tzu', 'ja': 'シーズー、シーズー'}, + {'en': 'King Charles Spaniel', 'ja': 'ブレナムスパニエル'}, + {'en': 'Papillon', 'ja': 'パピヨン'}, + {'en': 'toy terrier', 'ja': 'トイテリア'}, + {'en': 'Rhodesian Ridgeback', 'ja': 'ローデシアン・リッジバック'}, + {'en': 'Afghan Hound', 'ja': 'アフガンハウンド'}, + {'en': 'Basset Hound', 'ja': 'バセット犬'}, + {'en': 'Beagle', 'ja': 'ビーグル'}, + {'en': 'Bloodhound', 'ja': 'ブラッドハウンド'}, + {'en': 'Bluetick Coonhound', 'ja': 'ブルーティック'}, + {'en': 'Black and Tan Coonhound', 'ja': '黒と黄褐色の猟犬'}, + {'en': 'Treeing Walker Coonhound', 'ja': 'ウォーカーハウンド'}, + {'en': 'English foxhound', 'ja': 'イングリッシュフォックスハウンド'}, + {'en': 'Redbone Coonhound', 'ja': 'レッドボーン'}, + {'en': 'borzoi', 'ja': 'ボルゾイ'}, + {'en': 'Irish Wolfhound', 'ja': 'アイリッシュ・ウルフハウンド'}, + {'en': 'Italian Greyhound', 'ja': 'イタリアングレーハウンド'}, + {'en': 'Whippet', 'ja': 'ウィペット'}, + {'en': 'Ibizan Hound', 'ja': 'イビサハウンド'}, + {'en': 'Norwegian Elkhound', 'ja': 'ノルウェーエルクハウンド'}, + {'en': 'Otterhound', 'ja': 'オッターハウンド'}, + {'en': 'Saluki', 'ja': 'サルーキ'}, + {'en': 'Scottish Deerhound', 'ja': 'スコティッシュ・ディアハウンド'}, + {'en': 
'Weimaraner', 'ja': 'ワイマラナー'}, + {'en': 'Staffordshire Bull Terrier', 'ja': 'スタフォードシャーブルテリア'}, + {'en': 'American Staffordshire Terrier', 'ja': 'アメリカン・スタッフォードシャー・テリア'}, + {'en': 'Bedlington Terrier', 'ja': 'ベドリントンテリア'}, + {'en': 'Border Terrier', 'ja': 'ボーダーテリア'}, + {'en': 'Kerry Blue Terrier', 'ja': 'ケリーブルーテリア'}, + {'en': 'Irish Terrier', 'ja': 'アイリッシュテリア'}, + {'en': 'Norfolk Terrier', 'ja': 'ノーフォークテリア'}, + {'en': 'Norwich Terrier', 'ja': 'ノーリッチ・テリア'}, + {'en': 'Yorkshire Terrier', 'ja': 'ヨークシャーテリア'}, + {'en': 'Wire Fox Terrier', 'ja': 'ワイヤーヘアー・フォックステリア'}, + {'en': 'Lakeland Terrier', 'ja': 'レークランドテリア'}, + {'en': 'Sealyham Terrier', 'ja': 'シーリーハムテリア'}, + {'en': 'Airedale Terrier', 'ja': 'エアデール'}, + {'en': 'Cairn Terrier', 'ja': 'ケルン'}, + {'en': 'Australian Terrier', 'ja': 'オーストラリアテリア'}, + {'en': 'Dandie Dinmont Terrier', 'ja': 'ダンディディンモントテリア'}, + {'en': 'Boston Terrier', 'ja': 'ボストンブル'}, + {'en': 'Miniature Schnauzer', 'ja': 'ミニチュアシュナウザー'}, + {'en': 'Giant Schnauzer', 'ja': 'ジャイアントシュナウザー'}, + {'en': 'Standard Schnauzer', 'ja': 'スタンダードシュナウザー'}, + {'en': 'Scottish Terrier', 'ja': 'スコッチテリア'}, + {'en': 'Tibetan Terrier', 'ja': 'チベタンテリア'}, + {'en': 'Australian Silky Terrier', 'ja': 'シルキーテリア'}, + {'en': 'Soft-coated Wheaten Terrier', 'ja': 'ソフトコーテッド・ウィートン・テリア'}, + {'en': 'West Highland White Terrier', 'ja': 'ウェストハイランドホワイトテリア'}, + {'en': 'Lhasa Apso', 'ja': 'ラサ'}, + {'en': 'Flat-Coated Retriever', 'ja': 'フラットコーテッド・レトリーバー'}, + {'en': 'Curly-coated Retriever', 'ja': 'カーリーコーティングされたレトリーバー'}, + {'en': 'Golden Retriever', 'ja': 'ゴールデンレトリバー'}, + {'en': 'Labrador Retriever', 'ja': 'ラブラドル・レトリーバー犬'}, + {'en': 'Chesapeake Bay Retriever', 'ja': 'チェサピーク湾レトリーバー'}, + {'en': 'German Shorthaired Pointer', 'ja': 'ジャーマン・ショートヘア・ポインタ'}, + {'en': 'Vizsla', 'ja': 'ビズラ'}, + {'en': 'English Setter', 'ja': 'イングリッシュセッター'}, + {'en': 'Irish Setter', 'ja': 'アイリッシュセッター'}, + {'en': 'Gordon Setter', 'ja': 'ゴードンセッター'}, + {'en': 'Brittany dog', 'ja': 'ブリタニースパニエル'}, + {'en': 'Clumber Spaniel', 'ja': 'クランバー'}, + {'en': 'English Springer Spaniel', 'ja': 'イングリッシュスプリンガー'}, + {'en': 'Welsh Springer Spaniel', 'ja': 'ウェルシュスプリンガースパニエル'}, + {'en': 'Cocker Spaniel', 'ja': 'コッカースパニエル'}, + {'en': 'Sussex Spaniel', 'ja': 'サセックススパニエル'}, + {'en': 'Irish Water Spaniel', 'ja': 'アイルランドのウォータースパニエル'}, + {'en': 'Kuvasz', 'ja': 'クバース犬'}, + {'en': 'Schipperke', 'ja': 'スキッパーキー'}, + {'en': 'Groenendael dog', 'ja': 'ベルジアン・シェパード・ドッグ・グローネンダール'}, + {'en': 'Malinois', 'ja': 'マリノア'}, + {'en': 'Briard', 'ja': 'ブリアール'}, + {'en': 'Australian Kelpie', 'ja': 'ケルピー'}, + {'en': 'Komondor', 'ja': 'コモンドール'}, + {'en': 'Old English Sheepdog', 'ja': 'オールドイングリッシュシープドッグ'}, + {'en': 'Shetland Sheepdog', 'ja': 'シェトランドシープドッグ'}, + {'en': 'collie', 'ja': 'コリー'}, + {'en': 'Border Collie', 'ja': 'ボーダーコリー'}, + {'en': 'Bouvier des Flandres dog', 'ja': 'ブーヴィエ・デ・フランドル'}, + {'en': 'Rottweiler', 'ja': 'ロットワイラー'}, + {'en': 'German Shepherd Dog', 'ja': 'ジャーマンシェパード'}, + {'en': 'Dobermann', 'ja': 'ドーベルマン犬'}, + {'en': 'Miniature Pinscher', 'ja': 'ミニチュアピンシャー'}, + {'en': 'Greater Swiss Mountain Dog', 'ja': 'グレータースイスマウンテンドッグ'}, + {'en': 'Bernese Mountain Dog', 'ja': 'バーネーズマウンテンドッグ'}, + {'en': 'Appenzeller Sennenhund', 'ja': 'アッペンツェル'}, + {'en': 'Entlebucher Sennenhund', 'ja': 'エントレブッシャー'}, + {'en': 'Boxer', 'ja': 'ボクサー'}, + {'en': 'Bullmastiff', 'ja': 'ブルマスチフ'}, + {'en': 'Tibetan Mastiff', 'ja': 'チベットマスチフ'}, + {'en': 'French Bulldog', 'ja': 'フレンチブルドッグ'}, + {'en': 'Great Dane', 'ja': 'グレートデーン'}, + {'en': 'St. 
Bernard', 'ja': 'セントバーナード'}, + {'en': 'husky', 'ja': 'エスキモー犬'}, + {'en': 'Alaskan Malamute', 'ja': 'マラミュート'}, + {'en': 'Siberian Husky', 'ja': 'シベリアンハスキー'}, + {'en': 'Dalmatian', 'ja': 'ダルメシアン'}, + {'en': 'Affenpinscher', 'ja': 'アーフェンピンシャー'}, + {'en': 'Basenji', 'ja': 'バセンジー'}, + {'en': 'pug', 'ja': 'パグ'}, + {'en': 'Leonberger', 'ja': 'レオンバーグ'}, + {'en': 'Newfoundland dog', 'ja': 'ニューファンドランド島'}, + {'en': 'Great Pyrenees dog', 'ja': 'グレートピレニーズ'}, + {'en': 'Samoyed', 'ja': 'サモエド'}, + {'en': 'Pomeranian', 'ja': 'ポメラニアン'}, + {'en': 'Chow Chow', 'ja': 'チャウ'}, + {'en': 'Keeshond', 'ja': 'キースホンド'}, + {'en': 'brussels griffon', 'ja': 'ブラバンソングリフォン'}, + {'en': 'Pembroke Welsh Corgi', 'ja': 'ペンブローク'}, + {'en': 'Cardigan Welsh Corgi', 'ja': 'カーディガン'}, + {'en': 'Toy Poodle', 'ja': 'トイプードル'}, + {'en': 'Miniature Poodle', 'ja': 'ミニチュアプードル'}, + {'en': 'Standard Poodle', 'ja': 'スタンダードプードル'}, + {'en': 'Mexican hairless dog (xoloitzcuintli)', 'ja': 'メキシカン・ヘアーレス'}, + {'en': 'grey wolf', 'ja': 'シンリンオオカミ'}, + {'en': 'Alaskan tundra wolf', 'ja': '白いオオカミ'}, + {'en': 'red wolf or maned wolf', 'ja': 'レッドウルフ'}, + {'en': 'coyote', 'ja': 'コヨーテ'}, + {'en': 'dingo', 'ja': 'ディンゴ'}, + {'en': 'dhole', 'ja': 'ドール'}, + {'en': 'African wild dog', 'ja': 'リカオン'}, + {'en': 'hyena', 'ja': 'ハイエナ'}, + {'en': 'red fox', 'ja': 'アカギツネ'}, + {'en': 'kit fox', 'ja': 'キットキツネ'}, + {'en': 'Arctic fox', 'ja': 'ホッキョクギツネ'}, + {'en': 'grey fox', 'ja': '灰色のキツネ'}, + {'en': 'tabby cat', 'ja': 'タビー'}, + {'en': 'tiger cat', 'ja': '虎猫'}, + {'en': 'Persian cat', 'ja': 'ペルシャ猫'}, + {'en': 'Siamese cat', 'ja': 'シャム猫'}, + {'en': 'Egyptian Mau', 'ja': 'エジプトの猫'}, + {'en': 'cougar', 'ja': 'クーガー'}, + {'en': 'lynx', 'ja': 'オオヤマネコ'}, + {'en': 'leopard', 'ja': 'ヒョウ'}, + {'en': 'snow leopard', 'ja': 'ユキヒョウ'}, + {'en': 'jaguar', 'ja': 'ジャガー'}, + {'en': 'lion', 'ja': 'ライオン'}, + {'en': 'tiger', 'ja': '虎'}, + {'en': 'cheetah', 'ja': 'チーター'}, + {'en': 'brown bear', 'ja': 'ヒグマ'}, + {'en': 'American black bear', 'ja': 'アメリカクロクマ'}, + {'en': 'polar bear', 'ja': '氷のクマ'}, + {'en': 'sloth bear', 'ja': 'ナマケグマ'}, + {'en': 'mongoose', 'ja': 'マングース'}, + {'en': 'meerkat', 'ja': 'ミーアキャット'}, + {'en': 'tiger beetle', 'ja': 'ハンミョウ'}, + {'en': 'ladybug', 'ja': 'てんとう虫'}, + {'en': 'ground beetle', 'ja': 'グランドビートル'}, + {'en': 'longhorn beetle', 'ja': 'カミキリムシ'}, + {'en': 'leaf beetle', 'ja': 'ハムシ'}, + {'en': 'dung beetle', 'ja': 'フンコロガシ'}, + {'en': 'rhinoceros beetle', 'ja': 'サイハムシ'}, + {'en': 'weevil', 'ja': 'ゾウムシ'}, + {'en': 'fly', 'ja': 'ハエ'}, + {'en': 'bee', 'ja': '蜂'}, + {'en': 'ant', 'ja': '蟻'}, + {'en': 'grasshopper', 'ja': 'バッタ'}, + {'en': 'cricket insect', 'ja': 'クリケット'}, + {'en': 'stick insect', 'ja': '杖'}, + {'en': 'cockroach', 'ja': 'ゴキブリ'}, + {'en': 'praying mantis', 'ja': 'カマキリ'}, + {'en': 'cicada', 'ja': '蝉'}, + {'en': 'leafhopper', 'ja': 'ヨコバイ'}, + {'en': 'lacewing', 'ja': 'クサカゲロウ'}, + {'en': 'dragonfly', 'ja': 'トンボ'}, + {'en': 'damselfly', 'ja': 'イトトンボ'}, + {'en': 'red admiral butterfly', 'ja': '提督'}, + {'en': 'ringlet butterfly', 'ja': 'リングレット'}, + {'en': 'monarch butterfly', 'ja': '君主'}, + {'en': 'small white butterfly', 'ja': 'モンシロチョウ'}, + {'en': 'sulphur butterfly', 'ja': '硫黄蝶'}, + {'en': 'gossamer-winged butterfly', 'ja': 'シジミチョウ'}, + {'en': 'starfish', 'ja': 'ヒトデ'}, + {'en': 'sea urchin', 'ja': 'うに'}, + {'en': 'sea cucumber', 'ja': 'ナマコ'}, + {'en': 'cottontail rabbit', 'ja': '木のウサギ'}, + {'en': 'hare', 'ja': '野ウサギ'}, + {'en': 'Angora rabbit', 'ja': 'アンゴラ'}, + {'en': 'hamster', 'ja': 'ハムスター'}, + {'en': 'porcupine', 'ja': 'ヤマアラシ'}, + {'en': 'fox squirrel', 'ja': 
'キツネリス'}, + {'en': 'marmot', 'ja': 'マーモット'}, + {'en': 'beaver', 'ja': 'ビーバー'}, + {'en': 'guinea pig', 'ja': 'モルモット'}, + {'en': 'common sorrel horse', 'ja': '栗色'}, + {'en': 'zebra', 'ja': 'シマウマ'}, + {'en': 'pig', 'ja': '豚'}, + {'en': 'wild boar', 'ja': 'イノシシ'}, + {'en': 'warthog', 'ja': 'イボイノシシ'}, + {'en': 'hippopotamus', 'ja': 'カバ'}, + {'en': 'ox', 'ja': '雄牛'}, + {'en': 'water buffalo', 'ja': '水牛'}, + {'en': 'bison', 'ja': 'バイソン'}, + {'en': 'ram (adult male sheep)', 'ja': 'ラム'}, + {'en': 'bighorn sheep', 'ja': 'ビッグホーン'}, + {'en': 'Alpine ibex', 'ja': 'アイベックス'}, + {'en': 'hartebeest', 'ja': 'ハーテビースト'}, + {'en': 'impala (antelope)', 'ja': 'インパラ'}, + {'en': 'gazelle', 'ja': 'ガゼル'}, + {'en': 'arabian camel', 'ja': 'アラビアラクダ'}, + {'en': 'llama', 'ja': 'ラマ'}, + {'en': 'weasel', 'ja': 'イタチ'}, + {'en': 'mink', 'ja': 'ミンク'}, + {'en': 'European polecat', 'ja': 'ケナガイタチ'}, + {'en': 'black-footed ferret', 'ja': 'クロアシイタチ'}, + {'en': 'otter', 'ja': 'カワウソ'}, + {'en': 'skunk', 'ja': 'スカンク'}, + {'en': 'badger', 'ja': '狸'}, + {'en': 'armadillo', 'ja': 'アルマジロ'}, + {'en': 'three-toed sloth', 'ja': 'ミユビナマケモノ'}, + {'en': 'orangutan', 'ja': 'オランウータン'}, + {'en': 'gorilla', 'ja': 'ゴリラ'}, + {'en': 'chimpanzee', 'ja': 'チンパンジー'}, + {'en': 'gibbon', 'ja': 'テナガザル'}, + {'en': 'siamang', 'ja': 'フクロテナガザル'}, + {'en': 'guenon', 'ja': 'オナガザル'}, + {'en': 'patas monkey', 'ja': 'パタス'}, + {'en': 'baboon', 'ja': 'ヒヒ'}, + {'en': 'macaque', 'ja': 'マカク'}, + {'en': 'langur', 'ja': 'ヤセザル'}, + {'en': 'black-and-white colobus', 'ja': 'コロブス属'}, + {'en': 'proboscis monkey', 'ja': 'テングザル'}, + {'en': 'marmoset', 'ja': 'マーモセット'}, + {'en': 'white-headed capuchin', 'ja': 'オマキザル'}, + {'en': 'howler monkey', 'ja': 'ホエザル'}, + {'en': 'titi monkey', 'ja': 'ティティ'}, + {'en': "Geoffroy's spider monkey", 'ja': 'クモザル'}, + {'en': 'common squirrel monkey', 'ja': 'リスザル'}, + {'en': 'ring-tailed lemur', 'ja': 'マダガスカル猫'}, + {'en': 'indri', 'ja': 'インドリ'}, + {'en': 'Asian elephant', 'ja': 'インドゾウ'}, + {'en': 'African bush elephant', 'ja': 'アフリカゾウ'}, + {'en': 'red panda', 'ja': 'レッサーパンダ'}, + {'en': 'giant panda', 'ja': 'ジャイアントパンダ'}, + {'en': 'snoek fish', 'ja': 'バラクータ'}, + {'en': 'eel', 'ja': 'ウナギ'}, + {'en': 'silver salmon', 'ja': 'ギンザケ'}, + {'en': 'rock beauty fish', 'ja': '岩の美しさ'}, + {'en': 'clownfish', 'ja': 'クマノミ'}, + {'en': 'sturgeon', 'ja': 'チョウザメ'}, + {'en': 'gar fish', 'ja': 'ガー'}, + {'en': 'lionfish', 'ja': 'ミノカサゴ'}, + {'en': 'pufferfish', 'ja': 'フグ'}, + {'en': 'abacus', 'ja': 'そろばん'}, + {'en': 'abaya', 'ja': 'アバヤ'}, + {'en': 'academic gown', 'ja': 'アカデミックガウン'}, + {'en': 'accordion', 'ja': 'アコーディオン'}, + {'en': 'acoustic guitar', 'ja': 'アコースティックギター'}, + {'en': 'aircraft carrier', 'ja': '空母'}, + {'en': 'airliner', 'ja': '旅客機'}, + {'en': 'airship', 'ja': '飛行船'}, + {'en': 'altar', 'ja': '祭壇'}, + {'en': 'ambulance', 'ja': '救急車'}, + {'en': 'amphibious vehicle', 'ja': '両生類'}, + {'en': 'analog clock', 'ja': 'アナログ時計'}, + {'en': 'apiary', 'ja': '養蜂場'}, + {'en': 'apron', 'ja': 'エプロン'}, + {'en': 'trash can', 'ja': 'ごみ入れ'}, + {'en': 'assault rifle', 'ja': 'アサルトライフル'}, + {'en': 'backpack', 'ja': 'バックパック'}, + {'en': 'bakery', 'ja': 'ベーカリー'}, + {'en': 'balance beam', 'ja': '平均台'}, + {'en': 'balloon', 'ja': 'バルーン'}, + {'en': 'ballpoint pen', 'ja': 'ボールペン'}, + {'en': 'Band-Aid', 'ja': 'バンドエイド'}, + {'en': 'banjo', 'ja': 'バンジョー'}, + {'en': 'baluster / handrail', 'ja': 'バニスター'}, + {'en': 'barbell', 'ja': 'バーベル'}, + {'en': 'barber chair', 'ja': '理髪店の椅子'}, + {'en': 'barbershop', 'ja': '理髪店'}, + {'en': 'barn', 'ja': '納屋'}, + {'en': 'barometer', 'ja': 'バロメーター'}, + {'en': 
'barrel', 'ja': 'バレル'}, + {'en': 'wheelbarrow', 'ja': 'バロー'}, + {'en': 'baseball', 'ja': '野球'}, + {'en': 'basketball', 'ja': 'バスケットボール'}, + {'en': 'bassinet', 'ja': 'バシネット'}, + {'en': 'bassoon', 'ja': 'ファゴット'}, + {'en': 'swimming cap', 'ja': '水泳帽'}, + {'en': 'bath towel', 'ja': 'バスタオル'}, + {'en': 'bathtub', 'ja': 'バスタブ'}, + {'en': 'station wagon', 'ja': 'ビーチワゴン'}, + {'en': 'lighthouse', 'ja': 'ビーコン'}, + {'en': 'beaker', 'ja': 'ビーカー'}, + {'en': 'military hat (bearskin or shako)', 'ja': 'ベアスキン'}, + {'en': 'beer bottle', 'ja': 'ビール瓶'}, + {'en': 'beer glass', 'ja': 'ビールグラス'}, + {'en': 'bell tower', 'ja': 'ベルコート'}, + {'en': 'baby bib', 'ja': 'ビブ'}, + {'en': 'tandem bicycle', 'ja': '自転車'}, + {'en': 'bikini', 'ja': 'ビキニ'}, + {'en': 'ring binder', 'ja': 'バインダー'}, + {'en': 'binoculars', 'ja': '双眼鏡'}, + {'en': 'birdhouse', 'ja': '巣箱'}, + {'en': 'boathouse', 'ja': 'ボートハウス'}, + {'en': 'bobsleigh', 'ja': 'ボブスレー'}, + {'en': 'bolo tie', 'ja': 'ループタイ'}, + {'en': 'poke bonnet', 'ja': 'ボンネット'}, + {'en': 'bookcase', 'ja': '本棚'}, + {'en': 'bookstore', 'ja': '書店'}, + {'en': 'bottle cap', 'ja': '瓶のキャップ'}, + {'en': 'hunting bow', 'ja': '弓'}, + {'en': 'bow tie', 'ja': 'ちょうネクタイ'}, + {'en': 'brass memorial plaque', 'ja': '真鍮'}, + {'en': 'bra', 'ja': 'ブラジャー'}, + {'en': 'breakwater', 'ja': '防波堤'}, + {'en': 'breastplate', 'ja': '胸当て'}, + {'en': 'broom', 'ja': 'ほうき'}, + {'en': 'bucket', 'ja': 'バケツ'}, + {'en': 'buckle', 'ja': 'バックル'}, + {'en': 'bulletproof vest', 'ja': '防弾チョッキ'}, + {'en': 'high-speed train', 'ja': '新幹線'}, + {'en': 'butcher shop', 'ja': '精肉店'}, + {'en': 'taxicab', 'ja': 'タクシー'}, + {'en': 'cauldron', 'ja': '大釜'}, + {'en': 'candle', 'ja': 'キャンドル'}, + {'en': 'cannon', 'ja': '大砲'}, + {'en': 'canoe', 'ja': 'カヌー'}, + {'en': 'can opener', 'ja': '缶切り'}, + {'en': 'cardigan', 'ja': 'カーディガン'}, + {'en': 'car mirror', 'ja': '車のミラー'}, + {'en': 'carousel', 'ja': '回転木馬'}, + {'en': 'tool kit', 'ja': '大工のキット'}, + {'en': 'cardboard box / carton', 'ja': 'カートン'}, + {'en': 'car wheel', 'ja': '車のホイール'}, + {'en': 'automated teller machine', 'ja': '現金自動預け払い機'}, + {'en': 'cassette', 'ja': 'カセット'}, + {'en': 'cassette player', 'ja': 'カセット・プレーヤー'}, + {'en': 'castle', 'ja': '城'}, + {'en': 'catamaran', 'ja': 'カタマラン'}, + {'en': 'CD player', 'ja': 'CDプレーヤー'}, + {'en': 'cello', 'ja': 'チェロ'}, + {'en': 'mobile phone', 'ja': 'スマートフォン'}, + {'en': 'chain', 'ja': '鎖'}, + {'en': 'chain-link fence', 'ja': 'チェーンリンクフェンス'}, + {'en': 'chain mail', 'ja': 'チェーンメール'}, + {'en': 'chainsaw', 'ja': 'チェーンソー'}, + {'en': 'storage chest', 'ja': '胸'}, + {'en': 'chiffonier', 'ja': 'シフォニア'}, + {'en': 'bell or wind chime', 'ja': 'チャイム'}, + {'en': 'china cabinet', 'ja': '中国キャビネット'}, + {'en': 'Christmas stocking', 'ja': 'クリスマスの靴下'}, + {'en': 'church', 'ja': '教会'}, + {'en': 'movie theater', 'ja': '映画'}, + {'en': 'cleaver', 'ja': 'クリーバー'}, + {'en': 'cliff dwelling', 'ja': '崖の住居'}, + {'en': 'cloak', 'ja': 'マント'}, + {'en': 'clogs', 'ja': 'クロッグ'}, + {'en': 'cocktail shaker', 'ja': 'カクテルシェーカー'}, + {'en': 'coffee mug', 'ja': 'コーヒーマグ'}, + {'en': 'coffeemaker', 'ja': 'コーヒーポット'}, + {'en': 'spiral or coil', 'ja': 'コイル'}, + {'en': 'combination lock', 'ja': 'ダイヤル錠'}, + {'en': 'computer keyboard', 'ja': 'コンピュータのキーボード'}, + {'en': 'candy store', 'ja': '製菓'}, + {'en': 'container ship', 'ja': 'コンテナ船'}, + {'en': 'convertible', 'ja': 'コンバーチブル'}, + {'en': 'corkscrew', 'ja': 'コークスクリュー'}, + {'en': 'cornet', 'ja': 'コルネット'}, + {'en': 'cowboy boot', 'ja': 'カウボーイブーツ'}, + {'en': 'cowboy hat', 'ja': 'カウボーイハット'}, + {'en': 'cradle', 'ja': 'クレードル'}, + {'en': 'construction crane', 'ja': 'クレーン'}, + 
{'en': 'crash helmet', 'ja': 'クラッシュヘルメット'}, + {'en': 'crate', 'ja': '木箱'}, + {'en': 'infant bed', 'ja': 'ベビーベッド'}, + {'en': 'Crock Pot', 'ja': 'クロークポット'}, + {'en': 'croquet ball', 'ja': 'クロケットボール'}, + {'en': 'crutch', 'ja': '松葉杖'}, + {'en': 'cuirass', 'ja': '胸当て'}, + {'en': 'dam', 'ja': 'ダム'}, + {'en': 'desk', 'ja': '机'}, + {'en': 'desktop computer', 'ja': 'デスクトップコンピューター'}, + {'en': 'rotary dial telephone', 'ja': 'ダイヤル電話'}, + {'en': 'diaper', 'ja': 'おむつ'}, + {'en': 'digital clock', 'ja': 'デジタル時計'}, + {'en': 'digital watch', 'ja': 'デジタル腕時計'}, + {'en': 'dining table', 'ja': 'ダイニングテーブル'}, + {'en': 'dishcloth', 'ja': '意気地なし'}, + {'en': 'dishwasher', 'ja': '食器洗い機'}, + {'en': 'disc brake', 'ja': 'ディスクブレーキ'}, + {'en': 'dock', 'ja': 'ドック'}, + {'en': 'dog sled', 'ja': '犬ぞり'}, + {'en': 'dome', 'ja': 'ドーム'}, + {'en': 'doormat', 'ja': '玄関マット'}, + {'en': 'drilling rig', 'ja': '掘削基地'}, + {'en': 'drum', 'ja': 'ドラム'}, + {'en': 'drumstick', 'ja': 'ドラムスティック'}, + {'en': 'dumbbell', 'ja': 'ダンベル'}, + {'en': 'Dutch oven', 'ja': 'ダッチオーブン'}, + {'en': 'electric fan', 'ja': '扇風機'}, + {'en': 'electric guitar', 'ja': 'エレキギター'}, + {'en': 'electric locomotive', 'ja': '電気機関車'}, + {'en': 'entertainment center', 'ja': '娯楽施設'}, + {'en': 'envelope', 'ja': '封筒'}, + {'en': 'espresso machine', 'ja': 'エスプレッソマシーン'}, + {'en': 'face powder', 'ja': 'フェースパウダー'}, + {'en': 'feather boa', 'ja': 'フェザーボア'}, + {'en': 'filing cabinet', 'ja': 'ファイル'}, + {'en': 'fireboat', 'ja': '消防艇'}, + {'en': 'fire truck', 'ja': '消防車'}, + {'en': 'fire screen', 'ja': 'ファイアースクリーン'}, + {'en': 'flagpole', 'ja': '旗竿'}, + {'en': 'flute', 'ja': 'フルート'}, + {'en': 'folding chair', 'ja': '折り畳み式椅子'}, + {'en': 'football helmet', 'ja': 'フットボールヘルメット'}, + {'en': 'forklift', 'ja': 'フォークリフト'}, + {'en': 'fountain', 'ja': '噴水'}, + {'en': 'fountain pen', 'ja': '万年筆'}, + {'en': 'four-poster bed', 'ja': '四柱'}, + {'en': 'freight car', 'ja': '貨車'}, + {'en': 'French horn', 'ja': 'フレンチホルン'}, + {'en': 'frying pan', 'ja': 'フライパン'}, + {'en': 'fur coat', 'ja': '毛皮のコート'}, + {'en': 'garbage truck', 'ja': 'ごみ収集車'}, + {'en': 'gas mask or respirator', 'ja': 'ガスマスク'}, + {'en': 'gas pump', 'ja': 'ガソリンポンプ'}, + {'en': 'goblet', 'ja': 'ゴブレット'}, + {'en': 'go-kart', 'ja': 'ゴーカート'}, + {'en': 'golf ball', 'ja': 'ゴルフボール'}, + {'en': 'golf cart', 'ja': 'ゴルフカート'}, + {'en': 'gondola', 'ja': 'ゴンドラ'}, + {'en': 'gong', 'ja': 'ゴング'}, + {'en': 'gown', 'ja': 'ガウン'}, + {'en': 'grand piano', 'ja': 'グランドピアノ'}, + {'en': 'greenhouse', 'ja': '温室'}, + {'en': 'radiator grille', 'ja': 'グリル'}, + {'en': 'grocery store', 'ja': '食料品店'}, + {'en': 'guillotine', 'ja': 'ギロチン'}, + {'en': 'hair clip', 'ja': 'ヘアスライド'}, + {'en': 'hair spray', 'ja': 'ヘアスプレー'}, + {'en': 'half-track', 'ja': '半トラック'}, + {'en': 'hammer', 'ja': 'ハンマー'}, + {'en': 'hamper', 'ja': '妨げます'}, + {'en': 'hair dryer', 'ja': 'ハンドブロワー'}, + {'en': 'hand-held computer', 'ja': 'タブレット'}, + {'en': 'handkerchief', 'ja': 'ハンカチ'}, + {'en': 'hard disk drive', 'ja': 'ハードディスク'}, + {'en': 'harmonica', 'ja': 'ハーモニカ'}, + {'en': 'harp', 'ja': 'ハープ'}, + {'en': 'combine harvester', 'ja': 'ハーベスタ'}, + {'en': 'hatchet', 'ja': '斧'}, + {'en': 'holster', 'ja': 'ホルスター'}, + {'en': 'home theater', 'ja': 'ホームシアター'}, + {'en': 'honeycomb', 'ja': 'ハニカム'}, + {'en': 'hook', 'ja': 'フック'}, + {'en': 'hoop skirt', 'ja': 'フープスカート'}, + {'en': 'gymnastic horizontal bar', 'ja': '水平バー'}, + {'en': 'horse-drawn vehicle', 'ja': '馬車'}, + {'en': 'hourglass', 'ja': '砂時計'}, + {'en': 'iPod', 'ja': 'アイフォーン'}, + {'en': 'clothes iron', 'ja': '鉄'}, + {'en': 'carved pumpkin', 'ja': 'ジャックオーランタン'}, + {'en': 'jeans', 
'ja': 'ジーンズ'}, + {'en': 'jeep', 'ja': 'ジープ'}, + {'en': 'T-shirt', 'ja': 'ジャージー'}, + {'en': 'jigsaw puzzle', 'ja': 'ジグソーパズル'}, + {'en': 'rickshaw', 'ja': '人力車'}, + {'en': 'joystick', 'ja': 'ジョイスティック'}, + {'en': 'kimono', 'ja': '着物'}, + {'en': 'knee pad', 'ja': '膝パッド'}, + {'en': 'knot', 'ja': '結び目'}, + {'en': 'lab coat', 'ja': '白衣'}, + {'en': 'ladle', 'ja': 'ひしゃく'}, + {'en': 'lampshade', 'ja': 'ランプのかさ'}, + {'en': 'laptop computer', 'ja': 'ノートパソコン'}, + {'en': 'lawn mower', 'ja': '芝刈り機'}, + {'en': 'lens cap', 'ja': 'レンズキャップ'}, + {'en': 'letter opener', 'ja': 'レターオープナー'}, + {'en': 'library', 'ja': 'ライブラリ'}, + {'en': 'lifeboat', 'ja': '救命ボート'}, + {'en': 'lighter', 'ja': 'ライター'}, + {'en': 'limousine', 'ja': 'リムジン'}, + {'en': 'ocean liner', 'ja': 'ライナー'}, + {'en': 'lipstick', 'ja': '口紅'}, + {'en': 'slip-on shoe', 'ja': 'ローファー'}, + {'en': 'lotion', 'ja': 'ローション'}, + {'en': 'music speaker', 'ja': 'スピーカー'}, + {'en': 'loupe magnifying glass', 'ja': 'ルーペ'}, + {'en': 'sawmill', 'ja': '製材所'}, + {'en': 'magnetic compass', 'ja': '磁気コンパス'}, + {'en': 'messenger bag', 'ja': '郵袋'}, + {'en': 'mailbox', 'ja': 'メールボックス'}, + {'en': 'tights', 'ja': 'マイヨ'}, + {'en': 'one-piece bathing suit', 'ja': 'マイヨ'}, + {'en': 'manhole cover', 'ja': 'マンホールの蓋'}, + {'en': 'maraca', 'ja': 'マラカス'}, + {'en': 'marimba', 'ja': 'マリンバ'}, + {'en': 'mask', 'ja': 'マスク'}, + {'en': 'matchstick', 'ja': 'マッチ棒'}, + {'en': 'maypole', 'ja': 'メイポール'}, + {'en': 'maze', 'ja': '迷路'}, + {'en': 'measuring cup', 'ja': '計量カップ'}, + {'en': 'medicine cabinet', 'ja': '薬箱'}, + {'en': 'megalith', 'ja': '巨石'}, + {'en': 'microphone', 'ja': 'マイク'}, + {'en': 'microwave oven', 'ja': 'マイクロ波'}, + {'en': 'military uniform', 'ja': '軍服'}, + {'en': 'milk can', 'ja': 'ミルク缶'}, + {'en': 'minibus', 'ja': 'ミニバス'}, + {'en': 'miniskirt', 'ja': 'ミニスカート'}, + {'en': 'minivan', 'ja': 'ミニバン'}, + {'en': 'missile', 'ja': 'ミサイル'}, + {'en': 'mitten', 'ja': 'ミトン'}, + {'en': 'mixing bowl', 'ja': 'ミキシングボウル'}, + {'en': 'mobile home', 'ja': '移動住宅'}, + {'en': 'ford model t', 'ja': 'モデルT'}, + {'en': 'modem', 'ja': 'モデム'}, + {'en': 'monastery', 'ja': '修道院'}, + {'en': 'monitor', 'ja': 'モニター'}, + {'en': 'moped', 'ja': 'モペット'}, + {'en': 'mortar and pestle', 'ja': 'モルタル'}, + {'en': 'graduation cap', 'ja': 'モルタルボード'}, + {'en': 'mosque', 'ja': 'モスク'}, + {'en': 'mosquito net', 'ja': '蚊帳'}, + {'en': 'vespa', 'ja': 'スクーター'}, + {'en': 'mountain bike', 'ja': 'マウンテンバイク'}, + {'en': 'tent', 'ja': '山のテント'}, + {'en': 'computer mouse', 'ja': 'マウス'}, + {'en': 'mousetrap', 'ja': 'ネズミ捕り'}, + {'en': 'moving van', 'ja': '引っ越しトラック'}, + {'en': 'muzzle', 'ja': '銃口'}, + {'en': 'metal nail', 'ja': 'ネイル'}, + {'en': 'neck brace', 'ja': 'ネックブレース'}, + {'en': 'necklace', 'ja': 'ネックレス'}, + {'en': 'baby pacifier', 'ja': '乳首'}, + {'en': 'notebook computer', 'ja': 'ノート'}, + {'en': 'obelisk', 'ja': 'オベリスク'}, + {'en': 'oboe', 'ja': 'オーボエ'}, + {'en': 'ocarina', 'ja': 'オカリナ'}, + {'en': 'odometer', 'ja': 'オドメーター'}, + {'en': 'oil filter', 'ja': 'オイルフィルター'}, + {'en': 'pipe organ', 'ja': '器官'}, + {'en': 'oscilloscope', 'ja': 'オシロスコープ'}, + {'en': 'overskirt', 'ja': 'オーバースカート'}, + {'en': 'bullock cart', 'ja': '牛車'}, + {'en': 'oxygen mask', 'ja': '酸素マスク'}, + {'en': 'product packet / packaging', 'ja': 'パケット'}, + {'en': 'paddle', 'ja': 'パドル'}, + {'en': 'paddle wheel', 'ja': 'パドルホイール'}, + {'en': 'padlock', 'ja': '南京錠'}, + {'en': 'paintbrush', 'ja': '絵筆'}, + {'en': 'pajamas', 'ja': 'パジャマ'}, + {'en': 'palace', 'ja': '宮殿'}, + {'en': 'pan flute', 'ja': 'パンパイプ'}, + {'en': 'paper towel', 'ja': 'ペーパータオル'}, + {'en': 'parachute', 'ja': 'パラシュート'}, + 
{'en': 'parallel bars', 'ja': '平行棒'}, + {'en': 'park bench', 'ja': '公園のベンチ'}, + {'en': 'parking meter', 'ja': 'パーキングメーター'}, + {'en': 'railroad car', 'ja': '乗用車'}, + {'en': 'patio', 'ja': 'パティオ'}, + {'en': 'payphone', 'ja': '有料電話'}, + {'en': 'pedestal', 'ja': '台座'}, + {'en': 'pencil case', 'ja': '筆箱'}, + {'en': 'pencil sharpener', 'ja': '鉛筆削り'}, + {'en': 'perfume', 'ja': '香水'}, + {'en': 'Petri dish', 'ja': 'ペトリ皿'}, + {'en': 'photocopier', 'ja': 'コピー機'}, + {'en': 'plectrum', 'ja': '選ぶ'}, + {'en': 'Pickelhaube', 'ja': 'スパイク付き鉄かぶと'}, + {'en': 'picket fence', 'ja': '杭柵'}, + {'en': 'pickup truck', 'ja': '拾う'}, + {'en': 'pier', 'ja': '桟橋'}, + {'en': 'piggy bank', 'ja': '貯金箱'}, + {'en': 'pill bottle', 'ja': '錠剤瓶'}, + {'en': 'pillow', 'ja': '枕'}, + {'en': 'ping-pong ball', 'ja': 'ピンポン球'}, + {'en': 'pinwheel', 'ja': '風車'}, + {'en': 'pirate ship', 'ja': '海賊'}, + {'en': 'drink pitcher', 'ja': 'ピッチャー'}, + {'en': 'block plane', 'ja': '飛行機'}, + {'en': 'planetarium', 'ja': 'プラネタリウム'}, + {'en': 'plastic bag', 'ja': 'ビニール袋'}, + {'en': 'plate rack', 'ja': '皿立て'}, + {'en': 'farm plow', 'ja': 'プラウ'}, + {'en': 'plunger', 'ja': 'プランジャー'}, + {'en': 'Polaroid camera', 'ja': 'ポラロイドカメラ'}, + {'en': 'pole', 'ja': 'ポール'}, + {'en': 'police van', 'ja': '警察車'}, + {'en': 'poncho', 'ja': 'ポンチョ'}, + {'en': 'pool table', 'ja': 'ビリヤード台'}, + {'en': 'soda bottle', 'ja': 'ポップ・ボトル'}, + {'en': 'plant pot', 'ja': 'ポット'}, + {'en': "potter's wheel", 'ja': 'ろくろ'}, + {'en': 'power drill', 'ja': 'パワードリル'}, + {'en': 'prayer rug', 'ja': '礼拝用敷物'}, + {'en': 'printer', 'ja': 'プリンタ'}, + {'en': 'prison', 'ja': '刑務所'}, + {'en': 'missile', 'ja': '発射体'}, + {'en': 'projector', 'ja': 'プロジェクター'}, + {'en': 'hockey puck', 'ja': 'パック'}, + {'en': 'punching bag', 'ja': 'サンドバッグ'}, + {'en': 'purse', 'ja': '財布'}, + {'en': 'quill', 'ja': 'クイル'}, + {'en': 'quilt', 'ja': 'キルト'}, + {'en': 'race car', 'ja': 'レーサー'}, + {'en': 'racket', 'ja': 'ラケット'}, + {'en': 'radiator', 'ja': 'ラジエーター'}, + {'en': 'radio', 'ja': '無線'}, + {'en': 'radio telescope', 'ja': '電波望遠鏡'}, + {'en': 'rain barrel', 'ja': '天水桶'}, + {'en': 'recreational vehicle', 'ja': 'RV車'}, + {'en': 'fishing casting reel', 'ja': 'リール'}, + {'en': 'reflex camera', 'ja': 'レフレックスカメラ'}, + {'en': 'refrigerator', 'ja': '冷蔵庫'}, + {'en': 'remote control', 'ja': 'リモコン'}, + {'en': 'restaurant', 'ja': 'レストラン'}, + {'en': 'revolver', 'ja': 'リボルバー'}, + {'en': 'rifle', 'ja': 'ライフル'}, + {'en': 'rocking chair', 'ja': 'ロッキングチェア'}, + {'en': 'rotisserie', 'ja': '焼肉料理店'}, + {'en': 'eraser', 'ja': '消しゴム'}, + {'en': 'rugby ball', 'ja': 'ラグビーボール'}, + {'en': 'ruler measuring stick', 'ja': 'ルール'}, + {'en': 'sneaker', 'ja': 'ランニングシューズ'}, + {'en': 'safe', 'ja': '安全'}, + {'en': 'safety pin', 'ja': '安全ピン'}, + {'en': 'salt shaker', 'ja': '塩の入れ物'}, + {'en': 'sandal', 'ja': 'サンダル'}, + {'en': 'sarong', 'ja': 'サロン'}, + {'en': 'saxophone', 'ja': 'サックス'}, + {'en': 'scabbard', 'ja': '鞘'}, + {'en': 'weighing scale', 'ja': '規模'}, + {'en': 'school bus', 'ja': 'スクールバス'}, + {'en': 'schooner', 'ja': 'スクーナー'}, + {'en': 'scoreboard', 'ja': 'スコアボード'}, + {'en': 'CRT monitor', 'ja': '画面'}, + {'en': 'screw', 'ja': 'スクリュー'}, + {'en': 'screwdriver', 'ja': 'ドライバー'}, + {'en': 'seat belt', 'ja': 'シートベルト'}, + {'en': 'sewing machine', 'ja': 'ミシン'}, + {'en': 'shield', 'ja': 'シールド'}, + {'en': 'shoe store', 'ja': '靴屋'}, + {'en': 'shoji screen / room divider', 'ja': '障子'}, + {'en': 'shopping basket', 'ja': '買い物かご'}, + {'en': 'shopping cart', 'ja': 'ショッピングカート'}, + {'en': 'shovel', 'ja': 'シャベル'}, + {'en': 'shower cap', 'ja': 'シャワーキャップ'}, + {'en': 'shower curtain', 'ja': 
'シャワーカーテン'}, + {'en': 'ski', 'ja': 'スキー'}, + {'en': 'balaclava ski mask', 'ja': 'スキーマスク'}, + {'en': 'sleeping bag', 'ja': '寝袋'}, + {'en': 'slide rule', 'ja': '計算尺'}, + {'en': 'sliding door', 'ja': '引き戸'}, + {'en': 'slot machine', 'ja': 'スロット'}, + {'en': 'snorkel', 'ja': 'スノーケル'}, + {'en': 'snowmobile', 'ja': 'スノーモービル'}, + {'en': 'snowplow', 'ja': '除雪機'}, + {'en': 'soap dispenser', 'ja': 'ソープディスペンサー'}, + {'en': 'soccer ball', 'ja': 'サッカーボール'}, + {'en': 'sock', 'ja': '靴下'}, + {'en': 'solar thermal collector', 'ja': '太陽の皿'}, + {'en': 'sombrero', 'ja': 'ソンブレロ'}, + {'en': 'soup bowl', 'ja': 'スープ皿'}, + {'en': 'keyboard space bar', 'ja': 'スペースキー'}, + {'en': 'space heater', 'ja': 'スペースヒーター'}, + {'en': 'space shuttle', 'ja': 'スペースシャトル'}, + {'en': 'spatula', 'ja': 'へら'}, + {'en': 'motorboat', 'ja': 'スピードボート'}, + {'en': 'spider web', 'ja': 'クモの巣'}, + {'en': 'spindle', 'ja': 'スピンドル'}, + {'en': 'sports car', 'ja': 'スポーツカー'}, + {'en': 'spotlight', 'ja': 'スポットライト'}, + {'en': 'stage', 'ja': 'ステージ'}, + {'en': 'steam locomotive', 'ja': '蒸気機関車'}, + {'en': 'through arch bridge', 'ja': '鋼アーチ橋'}, + {'en': 'steel drum', 'ja': 'スチールドラム'}, + {'en': 'stethoscope', 'ja': '聴診器'}, + {'en': 'scarf', 'ja': 'ストール'}, + {'en': 'stone wall', 'ja': '石垣'}, + {'en': 'stopwatch', 'ja': 'ストップウォッチ'}, + {'en': 'stove', 'ja': 'レンジ'}, + {'en': 'strainer', 'ja': 'ストレーナー'}, + {'en': 'tram', 'ja': '路面電車'}, + {'en': 'stretcher', 'ja': 'ストレッチャー'}, + {'en': 'couch', 'ja': 'スタジオソファ'}, + {'en': 'stupa', 'ja': '仏舎利塔'}, + {'en': 'submarine', 'ja': '潜水艦'}, + {'en': 'suit', 'ja': 'スーツ'}, + {'en': 'sundial', 'ja': '日時計'}, + {'en': 'sunglasses', 'ja': 'サングラス'}, + {'en': 'sunglasses', 'ja': 'サングラス'}, + {'en': 'sunscreen', 'ja': '日焼け止め剤'}, + {'en': 'suspension bridge', 'ja': 'つり橋'}, + {'en': 'mop', 'ja': '綿棒'}, + {'en': 'sweatshirt', 'ja': 'トレーナー'}, + {'en': 'swim trunks / shorts', 'ja': '海パン'}, + {'en': 'swing', 'ja': 'スイング'}, + {'en': 'electrical switch', 'ja': 'スイッチ'}, + {'en': 'syringe', 'ja': '注射器'}, + {'en': 'table lamp', 'ja': '電気スタンド'}, + {'en': 'tank', 'ja': 'タンク'}, + {'en': 'tape player', 'ja': 'テーププレーヤー'}, + {'en': 'teapot', 'ja': 'ティーポット'}, + {'en': 'teddy bear', 'ja': 'テディ'}, + {'en': 'television', 'ja': 'テレビ'}, + {'en': 'tennis ball', 'ja': 'テニスボール'}, + {'en': 'thatched roof', 'ja': 'サッチ'}, + {'en': 'front curtain', 'ja': '劇場のカーテン'}, + {'en': 'thimble', 'ja': '指ぬき'}, + {'en': 'threshing machine', 'ja': '脱穀機'}, + {'en': 'throne', 'ja': '王位'}, + {'en': 'tile roof', 'ja': '瓦屋根'}, + {'en': 'toaster', 'ja': 'トースター'}, + {'en': 'tobacco shop', 'ja': 'タバコ屋'}, + {'en': 'toilet seat', 'ja': '便座'}, + {'en': 'torch', 'ja': 'トーチ'}, + {'en': 'totem pole', 'ja': 'トーテムポール'}, + {'en': 'tow truck', 'ja': 'レッカー車'}, + {'en': 'toy store', 'ja': '玩具屋'}, + {'en': 'tractor', 'ja': 'トラクター'}, + {'en': 'semi-trailer truck', 'ja': 'トレーラートラック'}, + {'en': 'tray', 'ja': 'トレイ'}, + {'en': 'trench coat', 'ja': 'トレンチコート'}, + {'en': 'tricycle', 'ja': '三輪車'}, + {'en': 'trimaran', 'ja': '三胴船'}, + {'en': 'tripod', 'ja': '三脚'}, + {'en': 'triumphal arch', 'ja': '凱旋門'}, + {'en': 'trolleybus', 'ja': 'トロリーバス'}, + {'en': 'trombone', 'ja': 'トロンボーン'}, + {'en': 'hot tub', 'ja': 'バスタブ'}, + {'en': 'turnstile', 'ja': '回転ドア'}, + {'en': 'typewriter keyboard', 'ja': 'タイプライターのキーボード'}, + {'en': 'umbrella', 'ja': '傘'}, + {'en': 'unicycle', 'ja': '一輪車'}, + {'en': 'upright piano', 'ja': '直立'}, + {'en': 'vacuum cleaner', 'ja': '真空'}, + {'en': 'vase', 'ja': '花瓶'}, + {'en': 'vaulted or arched ceiling', 'ja': 'ボールト'}, + {'en': 'velvet fabric', 'ja': 'ベルベット'}, + {'en': 'vending machine', 'ja': 
'自動販売機'}, + {'en': 'vestment', 'ja': '祭服'}, + {'en': 'viaduct', 'ja': '高架橋'}, + {'en': 'violin', 'ja': 'バイオリン'}, + {'en': 'volleyball', 'ja': 'バレーボール'}, + {'en': 'waffle iron', 'ja': 'ワッフル焼き型'}, + {'en': 'wall clock', 'ja': '壁時計'}, + {'en': 'wallet', 'ja': '財布'}, + {'en': 'wardrobe', 'ja': 'ワードローブ'}, + {'en': 'military aircraft', 'ja': '戦闘機'}, + {'en': 'sink', 'ja': '洗面器'}, + {'en': 'washing machine', 'ja': 'ワッシャー'}, + {'en': 'water bottle', 'ja': '水筒'}, + {'en': 'water jug', 'ja': '水差し'}, + {'en': 'water tower', 'ja': '給水塔'}, + {'en': 'whiskey jug', 'ja': 'ウイスキージャグ'}, + {'en': 'whistle', 'ja': 'ホイッスル'}, + {'en': 'hair wig', 'ja': 'かつら'}, + {'en': 'window screen', 'ja': '窓網戸'}, + {'en': 'window shade', 'ja': 'ブラインド'}, + {'en': 'Windsor tie', 'ja': 'ウィンザーネクタイ'}, + {'en': 'wine bottle', 'ja': 'ワインボトル'}, + {'en': 'airplane wing', 'ja': '翼'}, + {'en': 'wok', 'ja': '中華鍋'}, + {'en': 'wooden spoon', 'ja': '木製スプーン'}, + {'en': 'wool', 'ja': 'ウール'}, + {'en': 'split-rail fence', 'ja': 'ワームフェンス'}, + {'en': 'shipwreck', 'ja': '難破船'}, + {'en': 'sailboat', 'ja': 'ヨール'}, + {'en': 'yurt', 'ja': 'パオ'}, + {'en': 'website', 'ja': 'サイト'}, + {'en': 'comic book', 'ja': 'コミックブック'}, + {'en': 'crossword', 'ja': 'クロスワードパズル'}, + {'en': 'traffic or street sign', 'ja': '道路標識'}, + {'en': 'traffic light', 'ja': '交通信号灯'}, + {'en': 'dust jacket', 'ja': 'ブックカバー'}, + {'en': 'menu', 'ja': 'メニュー'}, + {'en': 'plate', 'ja': 'プレート'}, + {'en': 'guacamole', 'ja': 'グアカモーレ'}, + {'en': 'consomme', 'ja': 'コンソメ'}, + {'en': 'hot pot', 'ja': 'ホットポット'}, + {'en': 'trifle', 'ja': 'パフェ'}, + {'en': 'ice cream', 'ja': 'アイスクリーム'}, + {'en': 'popsicle', 'ja': 'アイスキャンディー'}, + {'en': 'baguette', 'ja': 'フランスパン'}, + {'en': 'bagel', 'ja': 'ベーグル'}, + {'en': 'pretzel', 'ja': 'プレッツェル'}, + {'en': 'cheeseburger', 'ja': 'チーズバーガー'}, + {'en': 'hot dog', 'ja': 'ホットドッグ'}, + {'en': 'mashed potatoes', 'ja': 'マッシュポテト'}, + {'en': 'cabbage', 'ja': 'キャベツ'}, + {'en': 'broccoli', 'ja': 'ブロッコリー'}, + {'en': 'cauliflower', 'ja': 'カリフラワー'}, + {'en': 'zucchini', 'ja': 'ズッキーニ'}, + {'en': 'spaghetti squash', 'ja': 'そうめんかぼちゃ'}, + {'en': 'acorn squash', 'ja': 'ドングリかぼちゃ'}, + {'en': 'butternut squash', 'ja': 'カボチャ'}, + {'en': 'cucumber', 'ja': 'キュウリ'}, + {'en': 'artichoke', 'ja': 'アーティチョーク'}, + {'en': 'bell pepper', 'ja': 'ピーマン'}, + {'en': 'cardoon', 'ja': 'カルドン'}, + {'en': 'mushroom', 'ja': 'キノコ'}, + {'en': 'Granny Smith apple', 'ja': 'リンゴ'}, + {'en': 'strawberry', 'ja': 'イチゴ'}, + {'en': 'orange', 'ja': 'オレンジ'}, + {'en': 'lemon', 'ja': 'レモン'}, + {'en': 'fig', 'ja': 'イチジク'}, + {'en': 'pineapple', 'ja': 'パイナップル'}, + {'en': 'banana', 'ja': 'バナナ'}, + {'en': 'jackfruit', 'ja': 'パラミツ'}, + {'en': 'cherimoya (custard apple)', 'ja': 'カスタードアップル'}, + {'en': 'pomegranate', 'ja': 'ザクロ'}, + {'en': 'hay', 'ja': '干し草'}, + {'en': 'carbonara', 'ja': 'カルボナーラ'}, + {'en': 'chocolate syrup', 'ja': 'チョコレートソース'}, + {'en': 'dough', 'ja': 'パン生地'}, + {'en': 'meatloaf', 'ja': 'ミートローフ'}, + {'en': 'pizza', 'ja': 'ピザ'}, + {'en': 'pot pie', 'ja': 'ポットパイ'}, + {'en': 'burrito', 'ja': 'ブリトー'}, + {'en': 'red wine', 'ja': '赤ワイン'}, + {'en': 'espresso', 'ja': 'エスプレッソ'}, + {'en': 'tea cup', 'ja': 'カップ'}, + {'en': 'eggnog', 'ja': 'エッグノッグ'}, + {'en': 'mountain', 'ja': 'アルプス'}, + {'en': 'bubble', 'ja': 'バブル'}, + {'en': 'cliff', 'ja': '崖'}, + {'en': 'coral reef', 'ja': 'サンゴ礁'}, + {'en': 'geyser', 'ja': '間欠泉'}, + {'en': 'lakeshore', 'ja': '湖畔'}, + {'en': 'promontory', 'ja': '岬'}, + {'en': 'sandbar', 'ja': '砂州'}, + {'en': 'beach', 'ja': '海岸'}, + {'en': 'valley', 'ja': '谷'}, + {'en': 'volcano', 'ja': '火山'}, + {'en': 'baseball 
player', 'ja': '野球選手'}, + {'en': 'bridegroom', 'ja': '新郎'}, + {'en': 'scuba diver', 'ja': 'スキューバダイバー'}, + {'en': 'rapeseed', 'ja': '菜種'}, + {'en': 'daisy', 'ja': 'デイジー'}, + {'en': "yellow lady's slipper", 'ja': '蘭'}, + {'en': 'corn', 'ja': 'トウモロコシ'}, + {'en': 'acorn', 'ja': 'ドングリ'}, + {'en': 'rose hip', 'ja': 'ヒップ'}, + {'en': 'horse chestnut seed', 'ja': 'トチノキ'}, + {'en': 'coral fungus', 'ja': 'サンゴ菌'}, + {'en': 'agaric', 'ja': 'ハラタケ'}, + {'en': 'gyromitra', 'ja': 'シャグマアミガサタケ'}, + {'en': 'stinkhorn mushroom', 'ja': 'スッポンタケ'}, + {'en': 'earth star fungus', 'ja': 'ハラタケ'}, + {'en': 'hen of the woods mushroom', 'ja': '舞茸'}, + {'en': 'bolete', 'ja': 'きのこ'}, + {'en': 'corn cob', 'ja': '耳'}, + {'en': 'toilet paper', 'ja': 'トイレットペーパー'}] + + +imagenet_templates = [{'en': 'a bad photo of a {}.', 'ja': '{}の悪い写真'}, + {'en': 'a photo of many {}.', 'ja': '多くの{}の写真'}, + {'en': 'a sculpture of a {}.', 'ja': '{}の彫刻'}, + {'en': 'a photo of the hard to see {}.', 'ja': '見づらい{}の写真'}, + {'en': 'a low resolution photo of the {}.', 'ja': '{}の低解像度写真'}, + {'en': 'a rendering of a {}.', 'ja': '{}のレンダリング'}, + {'en': 'graffiti of a {}.', 'ja': '{}の落書き'}, + {'en': 'a cropped photo of the {}.', 'ja': '{}のトリミング写真'}, + {'en': 'a tattoo of a {}.', 'ja': '{}のタトゥー'}, + {'en': 'the embroidered {}.', 'ja': '刺繍された{}'}, + {'en': 'a bright photo of a {}.', 'ja': '{}の明るい写真'}, + {'en': 'a photo of a clean {}.', 'ja': 'きれいな{}の写真'}, + {'en': 'a photo of a dirty {}.', 'ja': '汚れた{}の写真'}, + {'en': 'a dark photo of the {}.', 'ja': '{}の暗い写真'}, + {'en': 'a drawing of a {}.', 'ja': '{}の絵'}, + {'en': 'a photo of my {}.', 'ja': '私の{}の写真'}, + {'en': 'the plastic {}.', 'ja': 'プラスチック製の{}'}, + {'en': 'a photo of the cool {}.', 'ja': 'かっこいい{}の写真'}, + {'en': 'a close-up photo of a {}.', 'ja': '{}のクローズアップ写真'}, + {'en': 'a black and white photo of the {}.', 'ja': '{}の白黒写真'}, + {'en': 'a pixelated photo of the {}.', 'ja': '{}のピクセル写真'}, + {'en': 'a jpeg corrupted photo of a {}.', 'ja': 'jpegで加工した{}の写真'}, + {'en': 'a blurry photo of the {}.', 'ja': '{}のぼやけた写真'}, + {'en': 'a photo of the {}.', 'ja': '{}の写真'}, + {'en': 'a good photo of the {}.', 'ja': '{}の良い写真'}, + {'en': 'a {} in a video game.', 'ja': 'ゲームに登場する{}'}, + {'en': 'the origami {}.', 'ja': '折り紙で作った{}'}, + {'en': 'a sketch of a {}.', 'ja': '{}のスケッチ'}, + {'en': 'the toy {}.', 'ja': 'おもちゃの{}'}, + {'en': 'a rendition of the {}.', 'ja': '{}の演出'}, + {'en': 'a photo of a large {}.', 'ja': '大きな{}の写真'}, + {'en': 'a photo of a nice {}.', 'ja': '素敵な{}の写真'}, + {'en': 'a photo of a weird {}.', 'ja': '奇妙な{}の写真'}, + {'en': 'a cartoon {}.', 'ja': '漫画の{}'}, + {'en': 'art of a {}.', 'ja': '{}の芸術'}, + {'en': 'a plushie {}.', 'ja': '{}のぬいぐるみ'}, + {'en': 'a photo of the small {}.', 'ja': '小さな{}の写真'},] + + + + diff --git a/japanese_clip/utils/imagenet_zeroshot_data_en.py b/japanese_clip/utils/imagenet_zeroshot_data_en.py new file mode 100644 index 0000000..dc23140 --- /dev/null +++ b/japanese_clip/utils/imagenet_zeroshot_data_en.py @@ -0,0 +1,248 @@ +imagenet_classnames = ["tench", "goldfish", "great white shark", "tiger shark", "hammerhead shark", "electric ray", + "stingray", "rooster", "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", + "indigo bunting", "American robin", "bulbul", "jay", "magpie", "chickadee", "American dipper", + "kite (bird of prey)", "bald eagle", "vulture", "great grey owl", "fire salamander", + "smooth newt", "newt", "spotted salamander", "axolotl", "American bullfrog", "tree frog", + "tailed frog", "loggerhead sea turtle", "leatherback sea turtle", "mud turtle", 
"terrapin", + "box turtle", "banded gecko", "green iguana", "Carolina anole", + "desert grassland whiptail lizard", "agama", "frilled-necked lizard", "alligator lizard", + "Gila monster", "European green lizard", "chameleon", "Komodo dragon", "Nile crocodile", + "American alligator", "triceratops", "worm snake", "ring-necked snake", + "eastern hog-nosed snake", "smooth green snake", "kingsnake", "garter snake", "water snake", + "vine snake", "night snake", "boa constrictor", "African rock python", "Indian cobra", + "green mamba", "sea snake", "Saharan horned viper", "eastern diamondback rattlesnake", + "sidewinder rattlesnake", "trilobite", "harvestman", "scorpion", "yellow garden spider", + "barn spider", "European garden spider", "southern black widow", "tarantula", "wolf spider", + "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse", "prairie grouse", "peafowl", + "quail", "partridge", "african grey parrot", "macaw", "sulphur-crested cockatoo", "lorikeet", + "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "duck", + "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", + "koala", "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", + "snail", "slug", "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", + "fiddler crab", "red king crab", "American lobster", "spiny lobster", "crayfish", "hermit crab", + "isopod", "white stork", "black stork", "spoonbill", "flamingo", "little blue heron", + "great egret", "bittern bird", "crane bird", "limpkin", "common gallinule", "American coot", + "bustard", "ruddy turnstone", "dunlin", "common redshank", "dowitcher", "oystercatcher", + "pelican", "king penguin", "albatross", "grey whale", "killer whale", "dugong", "sea lion", + "Chihuahua", "Japanese Chin", "Maltese", "Pekingese", "Shih Tzu", "King Charles Spaniel", + "Papillon", "toy terrier", "Rhodesian Ridgeback", "Afghan Hound", "Basset Hound", "Beagle", + "Bloodhound", "Bluetick Coonhound", "Black and Tan Coonhound", "Treeing Walker Coonhound", + "English foxhound", "Redbone Coonhound", "borzoi", "Irish Wolfhound", "Italian Greyhound", + "Whippet", "Ibizan Hound", "Norwegian Elkhound", "Otterhound", "Saluki", "Scottish Deerhound", + "Weimaraner", "Staffordshire Bull Terrier", "American Staffordshire Terrier", + "Bedlington Terrier", "Border Terrier", "Kerry Blue Terrier", "Irish Terrier", + "Norfolk Terrier", "Norwich Terrier", "Yorkshire Terrier", "Wire Fox Terrier", + "Lakeland Terrier", "Sealyham Terrier", "Airedale Terrier", "Cairn Terrier", + "Australian Terrier", "Dandie Dinmont Terrier", "Boston Terrier", "Miniature Schnauzer", + "Giant Schnauzer", "Standard Schnauzer", "Scottish Terrier", "Tibetan Terrier", + "Australian Silky Terrier", "Soft-coated Wheaten Terrier", "West Highland White Terrier", + "Lhasa Apso", "Flat-Coated Retriever", "Curly-coated Retriever", "Golden Retriever", + "Labrador Retriever", "Chesapeake Bay Retriever", "German Shorthaired Pointer", "Vizsla", + "English Setter", "Irish Setter", "Gordon Setter", "Brittany dog", "Clumber Spaniel", + "English Springer Spaniel", "Welsh Springer Spaniel", "Cocker Spaniel", "Sussex Spaniel", + "Irish Water Spaniel", "Kuvasz", "Schipperke", "Groenendael dog", "Malinois", "Briard", + "Australian Kelpie", "Komondor", "Old English Sheepdog", "Shetland Sheepdog", "collie", + "Border Collie", "Bouvier des Flandres dog", "Rottweiler", "German Shepherd Dog", "Dobermann", + "Miniature Pinscher", "Greater Swiss 
Mountain Dog", "Bernese Mountain Dog", + "Appenzeller Sennenhund", "Entlebucher Sennenhund", "Boxer", "Bullmastiff", "Tibetan Mastiff", + "French Bulldog", "Great Dane", "St. Bernard", "husky", "Alaskan Malamute", "Siberian Husky", + "Dalmatian", "Affenpinscher", "Basenji", "pug", "Leonberger", "Newfoundland dog", + "Great Pyrenees dog", "Samoyed", "Pomeranian", "Chow Chow", "Keeshond", "brussels griffon", + "Pembroke Welsh Corgi", "Cardigan Welsh Corgi", "Toy Poodle", "Miniature Poodle", + "Standard Poodle", "Mexican hairless dog (xoloitzcuintli)", "grey wolf", "Alaskan tundra wolf", + "red wolf or maned wolf", "coyote", "dingo", "dhole", "African wild dog", "hyena", "red fox", + "kit fox", "Arctic fox", "grey fox", "tabby cat", "tiger cat", "Persian cat", "Siamese cat", + "Egyptian Mau", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", + "cheetah", "brown bear", "American black bear", "polar bear", "sloth bear", "mongoose", + "meerkat", "tiger beetle", "ladybug", "ground beetle", "longhorn beetle", "leaf beetle", + "dung beetle", "rhinoceros beetle", "weevil", "fly", "bee", "ant", "grasshopper", + "cricket insect", "stick insect", "cockroach", "praying mantis", "cicada", "leafhopper", + "lacewing", "dragonfly", "damselfly", "red admiral butterfly", "ringlet butterfly", + "monarch butterfly", "small white butterfly", "sulphur butterfly", "gossamer-winged butterfly", + "starfish", "sea urchin", "sea cucumber", "cottontail rabbit", "hare", "Angora rabbit", + "hamster", "porcupine", "fox squirrel", "marmot", "beaver", "guinea pig", "common sorrel horse", + "zebra", "pig", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo", "bison", + "ram (adult male sheep)", "bighorn sheep", "Alpine ibex", "hartebeest", "impala (antelope)", + "gazelle", "arabian camel", "llama", "weasel", "mink", "European polecat", + "black-footed ferret", "otter", "skunk", "badger", "armadillo", "three-toed sloth", "orangutan", + "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas monkey", "baboon", "macaque", + "langur", "black-and-white colobus", "proboscis monkey", "marmoset", "white-headed capuchin", + "howler monkey", "titi monkey", "Geoffroy's spider monkey", "common squirrel monkey", + "ring-tailed lemur", "indri", "Asian elephant", "African bush elephant", "red panda", + "giant panda", "snoek fish", "eel", "silver salmon", "rock beauty fish", "clownfish", + "sturgeon", "gar fish", "lionfish", "pufferfish", "abacus", "abaya", "academic gown", + "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance", + "amphibious vehicle", "analog clock", "apiary", "apron", "trash can", "assault rifle", + "backpack", "bakery", "balance beam", "balloon", "ballpoint pen", "Band-Aid", "banjo", + "baluster / handrail", "barbell", "barber chair", "barbershop", "barn", "barometer", "barrel", + "wheelbarrow", "baseball", "basketball", "bassinet", "bassoon", "swimming cap", "bath towel", + "bathtub", "station wagon", "lighthouse", "beaker", "military hat (bearskin or shako)", + "beer bottle", "beer glass", "bell tower", "baby bib", "tandem bicycle", "bikini", + "ring binder", "binoculars", "birdhouse", "boathouse", "bobsleigh", "bolo tie", "poke bonnet", + "bookcase", "bookstore", "bottle cap", "hunting bow", "bow tie", "brass memorial plaque", "bra", + "breakwater", "breastplate", "broom", "bucket", "buckle", "bulletproof vest", + "high-speed train", "butcher shop", "taxicab", "cauldron", "candle", "cannon", "canoe", + "can opener", "cardigan", "car 
mirror", "carousel", "tool kit", "cardboard box / carton", + "car wheel", "automated teller machine", "cassette", "cassette player", "castle", "catamaran", + "CD player", "cello", "mobile phone", "chain", "chain-link fence", "chain mail", "chainsaw", + "storage chest", "chiffonier", "bell or wind chime", "china cabinet", "Christmas stocking", + "church", "movie theater", "cleaver", "cliff dwelling", "cloak", "clogs", "cocktail shaker", + "coffee mug", "coffeemaker", "spiral or coil", "combination lock", "computer keyboard", + "candy store", "container ship", "convertible", "corkscrew", "cornet", "cowboy boot", + "cowboy hat", "cradle", "construction crane", "crash helmet", "crate", "infant bed", + "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", + "rotary dial telephone", "diaper", "digital clock", "digital watch", "dining table", + "dishcloth", "dishwasher", "disc brake", "dock", "dog sled", "dome", "doormat", "drilling rig", + "drum", "drumstick", "dumbbell", "Dutch oven", "electric fan", "electric guitar", + "electric locomotive", "entertainment center", "envelope", "espresso machine", "face powder", + "feather boa", "filing cabinet", "fireboat", "fire truck", "fire screen", "flagpole", "flute", + "folding chair", "football helmet", "forklift", "fountain", "fountain pen", "four-poster bed", + "freight car", "French horn", "frying pan", "fur coat", "garbage truck", + "gas mask or respirator", "gas pump", "goblet", "go-kart", "golf ball", "golf cart", "gondola", + "gong", "gown", "grand piano", "greenhouse", "radiator grille", "grocery store", "guillotine", + "hair clip", "hair spray", "half-track", "hammer", "hamper", "hair dryer", "hand-held computer", + "handkerchief", "hard disk drive", "harmonica", "harp", "combine harvester", "hatchet", + "holster", "home theater", "honeycomb", "hook", "hoop skirt", "gymnastic horizontal bar", + "horse-drawn vehicle", "hourglass", "iPod", "clothes iron", "carved pumpkin", "jeans", "jeep", + "T-shirt", "jigsaw puzzle", "rickshaw", "joystick", "kimono", "knee pad", "knot", "lab coat", + "ladle", "lampshade", "laptop computer", "lawn mower", "lens cap", "letter opener", "library", + "lifeboat", "lighter", "limousine", "ocean liner", "lipstick", "slip-on shoe", "lotion", + "music speaker", "loupe magnifying glass", "sawmill", "magnetic compass", "messenger bag", + "mailbox", "tights", "one-piece bathing suit", "manhole cover", "maraca", "marimba", "mask", + "matchstick", "maypole", "maze", "measuring cup", "medicine cabinet", "megalith", "microphone", + "microwave oven", "military uniform", "milk can", "minibus", "miniskirt", "minivan", "missile", + "mitten", "mixing bowl", "mobile home", "ford model t", "modem", "monastery", "monitor", + "moped", "mortar and pestle", "graduation cap", "mosque", "mosquito net", "vespa", + "mountain bike", "tent", "computer mouse", "mousetrap", "moving van", "muzzle", "metal nail", + "neck brace", "necklace", "baby pacifier", "notebook computer", "obelisk", "oboe", "ocarina", + "odometer", "oil filter", "pipe organ", "oscilloscope", "overskirt", "bullock cart", + "oxygen mask", "product packet / packaging", "paddle", "paddle wheel", "padlock", "paintbrush", + "pajamas", "palace", "pan flute", "paper towel", "parachute", "parallel bars", "park bench", + "parking meter", "railroad car", "patio", "payphone", "pedestal", "pencil case", + "pencil sharpener", "perfume", "Petri dish", "photocopier", "plectrum", "Pickelhaube", + "picket fence", "pickup truck", "pier", "piggy bank", "pill 
bottle", "pillow", "ping-pong ball", + "pinwheel", "pirate ship", "drink pitcher", "block plane", "planetarium", "plastic bag", + "plate rack", "farm plow", "plunger", "Polaroid camera", "pole", "police van", "poncho", + "pool table", "soda bottle", "plant pot", "potter's wheel", "power drill", "prayer rug", + "printer", "prison", "missile", "projector", "hockey puck", "punching bag", "purse", "quill", + "quilt", "race car", "racket", "radiator", "radio", "radio telescope", "rain barrel", + "recreational vehicle", "fishing casting reel", "reflex camera", "refrigerator", + "remote control", "restaurant", "revolver", "rifle", "rocking chair", "rotisserie", "eraser", + "rugby ball", "ruler measuring stick", "sneaker", "safe", "safety pin", "salt shaker", "sandal", + "sarong", "saxophone", "scabbard", "weighing scale", "school bus", "schooner", "scoreboard", + "CRT monitor", "screw", "screwdriver", "seat belt", "sewing machine", "shield", "shoe store", + "shoji screen / room divider", "shopping basket", "shopping cart", "shovel", "shower cap", + "shower curtain", "ski", "balaclava ski mask", "sleeping bag", "slide rule", "sliding door", + "slot machine", "snorkel", "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", + "solar thermal collector", "sombrero", "soup bowl", "keyboard space bar", "space heater", + "space shuttle", "spatula", "motorboat", "spider web", "spindle", "sports car", "spotlight", + "stage", "steam locomotive", "through arch bridge", "steel drum", "stethoscope", "scarf", + "stone wall", "stopwatch", "stove", "strainer", "tram", "stretcher", "couch", "stupa", + "submarine", "suit", "sundial", "sunglasses", "sunglasses", "sunscreen", "suspension bridge", + "mop", "sweatshirt", "swim trunks / shorts", "swing", "electrical switch", "syringe", + "table lamp", "tank", "tape player", "teapot", "teddy bear", "television", "tennis ball", + "thatched roof", "front curtain", "thimble", "threshing machine", "throne", "tile roof", + "toaster", "tobacco shop", "toilet seat", "torch", "totem pole", "tow truck", "toy store", + "tractor", "semi-trailer truck", "tray", "trench coat", "tricycle", "trimaran", "tripod", + "triumphal arch", "trolleybus", "trombone", "hot tub", "turnstile", "typewriter keyboard", + "umbrella", "unicycle", "upright piano", "vacuum cleaner", "vase", "vaulted or arched ceiling", + "velvet fabric", "vending machine", "vestment", "viaduct", "violin", "volleyball", + "waffle iron", "wall clock", "wallet", "wardrobe", "military aircraft", "sink", + "washing machine", "water bottle", "water jug", "water tower", "whiskey jug", "whistle", + "hair wig", "window screen", "window shade", "Windsor tie", "wine bottle", "airplane wing", + "wok", "wooden spoon", "wool", "split-rail fence", "shipwreck", "sailboat", "yurt", "website", + "comic book", "crossword", "traffic or street sign", "traffic light", "dust jacket", "menu", + "plate", "guacamole", "consomme", "hot pot", "trifle", "ice cream", "popsicle", "baguette", + "bagel", "pretzel", "cheeseburger", "hot dog", "mashed potatoes", "cabbage", "broccoli", + "cauliflower", "zucchini", "spaghetti squash", "acorn squash", "butternut squash", "cucumber", + "artichoke", "bell pepper", "cardoon", "mushroom", "Granny Smith apple", "strawberry", "orange", + "lemon", "fig", "pineapple", "banana", "jackfruit", "cherimoya (custard apple)", "pomegranate", + "hay", "carbonara", "chocolate syrup", "dough", "meatloaf", "pizza", "pot pie", "burrito", + "red wine", "espresso", "tea cup", "eggnog", "mountain", "bubble", "cliff", 
"coral reef", + "geyser", "lakeshore", "promontory", "sandbar", "beach", "valley", "volcano", "baseball player", + "bridegroom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn", + "rose hip", "horse chestnut seed", "coral fungus", "agaric", "gyromitra", "stinkhorn mushroom", + "earth star fungus", "hen of the woods mushroom", "bolete", "corn cob", "toilet paper"] + +imagenet_templates = [ + 'a bad photo of a {}.', + 'a photo of many {}.', + 'a sculpture of a {}.', + 'a photo of the hard to see {}.', + 'a low resolution photo of the {}.', + 'a rendering of a {}.', + 'graffiti of a {}.', + 'a bad photo of the {}.', + 'a cropped photo of the {}.', + 'a tattoo of a {}.', + 'the embroidered {}.', + 'a photo of a hard to see {}.', + 'a bright photo of a {}.', + 'a photo of a clean {}.', + 'a photo of a dirty {}.', + 'a dark photo of the {}.', + 'a drawing of a {}.', + 'a photo of my {}.', + 'the plastic {}.', + 'a photo of the cool {}.', + 'a close-up photo of a {}.', + 'a black and white photo of the {}.', + 'a painting of the {}.', + 'a painting of a {}.', + 'a pixelated photo of the {}.', + 'a sculpture of the {}.', + 'a bright photo of the {}.', + 'a cropped photo of a {}.', + 'a plastic {}.', + 'a photo of the dirty {}.', + 'a jpeg corrupted photo of a {}.', + 'a blurry photo of the {}.', + 'a photo of the {}.', + 'a good photo of the {}.', + 'a rendering of the {}.', + 'a {} in a video game.', + 'a photo of one {}.', + 'a doodle of a {}.', + 'a close-up photo of the {}.', + 'a photo of a {}.', + 'the origami {}.', + 'the {} in a video game.', + 'a sketch of a {}.', + 'a doodle of the {}.', + 'a origami {}.', + 'a low resolution photo of a {}.', + 'the toy {}.', + 'a rendition of the {}.', + 'a photo of the clean {}.', + 'a photo of a large {}.', + 'a rendition of a {}.', + 'a photo of a nice {}.', + 'a photo of a weird {}.', + 'a blurry photo of a {}.', + 'a cartoon {}.', + 'art of a {}.', + 'a sketch of the {}.', + 'a embroidered {}.', + 'a pixelated photo of a {}.', + 'itap of the {}.', + 'a jpeg corrupted photo of the {}.', + 'a good photo of a {}.', + 'a plushie {}.', + 'a photo of the nice {}.', + 'a photo of the small {}.', + 'a photo of the weird {}.', + 'the cartoon {}.', + 'art of the {}.', + 'a drawing of the {}.', + 'a photo of the large {}.', + 'a black and white photo of a {}.', + 'the plushie {}.', + 'a dark photo of a {}.', + 'itap of a {}.', + 'graffiti of the {}.', + 'a toy {}.', + 'itap of my {}.', + 'a photo of a cool {}.', + 'a photo of a small {}.', + 'a tattoo of the {}.', +] \ No newline at end of file diff --git a/japanese_clip/version.py b/japanese_clip/version.py new file mode 100644 index 0000000..feacd5a --- /dev/null +++ b/japanese_clip/version.py @@ -0,0 +1,16 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = '0.2.0' diff --git a/jclip.py b/jclip.py new file mode 100644 index 0000000..aa3d4dc --- /dev/null +++ b/jclip.py @@ -0,0 +1,79 @@ +# Copyright 2021 Zilliz. 
All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from pathlib import Path
+
+import torch
+
+from towhee import register
+from towhee.operator.base import NNOperator, OperatorFlag
+from towhee.types.arg import arg, to_image_color
+from towhee.types.image_utils import from_pil, to_pil
+
+@register(output_schema=['vec'])
+class Jaclip(NNOperator):
+    """
+    Japanese CLIP multi-modal embedding operator
+    """
+    def __init__(self, model_name: str, modality: str):
+        super().__init__()
+        self._modality = modality
+        path = str(Path(__file__).parent)
+        sys.path.append(path)  # make the vendored japanese_clip package importable
+        import japanese_clip as ja_clip
+        sys.path.pop()
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        model, preprocess = ja_clip.load(model_name, cache_dir="{}/weights/japanese_clip".format(path), device=self.device)
+        self.model = model
+        self.tfms = preprocess
+        self.tokenizer = ja_clip.load_tokenizer()
+        self.ja_clip = ja_clip
+
+
+    def __call__(self, data):
+        if self._modality == 'image':
+            vec = self._inference_from_image(data)
+        elif self._modality == 'text':
+            vec = self._inference_from_text(data)
+        else:
+            raise ValueError("modality[{}] not implemented.".format(self._modality))
+        return vec.detach().cpu().numpy().flatten()
+
+    def _inference_from_text(self, text):
+        encodings = self.ja_clip.tokenize(
+            texts=[text],
+            max_seq_len=77,
+            device=self.device,
+            tokenizer=self.tokenizer,  # optional; if omitted, the tokenizer is reloaded on every call
+        )
+        text_feature = self.model.get_text_features(**encodings)
+        return text_feature
+
+    @arg(1, to_image_color('RGB'))
+    def _inference_from_image(self, img):
+        img = self._preprocess(img)
+        image_feature = self.model.get_image_features(img)
+        return image_feature
+
+    def _preprocess(self, img):
+        img = to_pil(img)
+        processed_img = self.tfms(img).unsqueeze(0).to(self.device)
+        return processed_img
+
+    def _configs(self):
+        config = {}
+        config['blip_base'] = {}
+        config['blip_base']['weights'] = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth'
+        return config
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e69de29
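
A minimal usage sketch for the Jaclip operator added above, assuming torch, torchvision, towhee, transformers and huggingface_hub are installed and this operator directory is importable; the model name, modality and input string are illustrative only:

    from jclip import Jaclip

    # modality is 'text' or 'image'; the model name should be one that
    # japanese_clip.load() recognizes, e.g. rinna/japanese-clip-vit-b-16.
    op = Jaclip(model_name='rinna/japanese-clip-vit-b-16', modality='text')
    vec = op('犬の写真')  # Japanese for "a photo of a dog"
    print(vec.shape)      # flattened numpy embedding vector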