From fe3c1620cc4fb9ed0ee8f2d17bb38e8450fcc17a Mon Sep 17 00:00:00 2001
From: wxywb
Date: Thu, 13 Oct 2022 17:28:19 +0800
Subject: [PATCH] init the operator.

Signed-off-by: wxywb
---
 .gitignore                                    |    1 +
 __init__.py                                   |   18 +
 japanese_clip/__init__.py                     |   19 +
 japanese_clip/auto_model.py                   |   95 ++
 japanese_clip/clip/__init__.py                |   16 +
 japanese_clip/clip/configuration_clip.py      |  219 ++++
 japanese_clip/clip/modeling_clip.py           |  815 +++++++++++++
 japanese_clip/cloob/__init__.py               |   16 +
 japanese_clip/cloob/configuration_cloob.py    |  203 ++++
 japanese_clip/cloob/loss.py                   |   58 +
 japanese_clip/cloob/modeling_cloob.py         |  783 +++++++++++++
 japanese_clip/tokenizer.py                    |   63 +
 japanese_clip/utils/__init__.py               |    0
 japanese_clip/utils/callbacks.py              |   96 ++
 japanese_clip/utils/imagenet_zeroshot_data.py | 1043 +++++++++++++++++
 .../utils/imagenet_zeroshot_data_en.py        |  248 ++++
 japanese_clip/version.py                      |   16 +
 jclip.py                                      |   79 ++
 requirements.txt                              |    0
 19 files changed, 3788 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 __init__.py
 create mode 100644 japanese_clip/__init__.py
 create mode 100644 japanese_clip/auto_model.py
 create mode 100644 japanese_clip/clip/__init__.py
 create mode 100644 japanese_clip/clip/configuration_clip.py
 create mode 100644 japanese_clip/clip/modeling_clip.py
 create mode 100644 japanese_clip/cloob/__init__.py
 create mode 100644 japanese_clip/cloob/configuration_cloob.py
 create mode 100644 japanese_clip/cloob/loss.py
 create mode 100644 japanese_clip/cloob/modeling_cloob.py
 create mode 100644 japanese_clip/tokenizer.py
 create mode 100644 japanese_clip/utils/__init__.py
 create mode 100644 japanese_clip/utils/callbacks.py
 create mode 100644 japanese_clip/utils/imagenet_zeroshot_data.py
 create mode 100644 japanese_clip/utils/imagenet_zeroshot_data_en.py
 create mode 100644 japanese_clip/version.py
 create mode 100644 jclip.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..9d2895f
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .jclip import Jaclip
+
+def jclip(model_name: str, modality: str):
+    return Jaclip(model_name, modality)
diff --git a/japanese_clip/__init__.py b/japanese_clip/__init__.py
new file mode 100644
index 0000000..5b60b75
--- /dev/null
+++ b/japanese_clip/__init__.py
@@ -0,0 +1,19 @@
+# coding=utf-8
+# Copyright 2022 rinna Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .clip import CLIPModel, CLIPConfig
+from .cloob import CLOOBModel, CLOOBConfig
+from .auto_model import load, available_models
+from .tokenizer import load_tokenizer, tokenize
diff --git a/japanese_clip/auto_model.py b/japanese_clip/auto_model.py
new file mode 100644
index 0000000..eddfd44
--- /dev/null
+++ b/japanese_clip/auto_model.py
@@ -0,0 +1,95 @@
+# coding=utf-8
+# Copyright 2022 rinna Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union
+import json
+import torch
+from torchvision import transforms as T
+from huggingface_hub import hf_hub_url, cached_download
+import os
+
+from .clip import CLIPModel
+from .cloob import CLOOBModel
+
+# TODO: Fill in repo_ids
+MODELS = {
+    'rinna/japanese-clip-vit-b-16': {
+        'repo_id': 'rinna/japanese-clip-vit-b-16',
+        'model_class': CLIPModel,
+    },
+    'rinna/japanese-cloob-vit-b-16': {
+        'repo_id': 'rinna/japanese-cloob-vit-b-16',
+        'model_class': CLOOBModel,
+    }
+}
+MODEL_CLASSES = {
+    "cloob": CLOOBModel,
+    "clip": CLIPModel,
+}
+MODEL_FILE = "pytorch_model.bin"
+CONFIG_FILE = "config.json"
+
+
+def available_models():
+    return list(MODELS.keys())
+
+
+def _convert_to_rgb(image):
+    return image.convert('RGB')
+
+
+def _transform(image_size):
+    return T.Compose([
+        T.Resize(image_size, interpolation=T.InterpolationMode.BILINEAR),
+        T.CenterCrop(image_size),
+        _convert_to_rgb,
+        T.ToTensor(),
+        T.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711),)
+    ])
+
+
+def _download(repo_id: str, cache_dir: str):
+    config_file_url = hf_hub_url(repo_id=repo_id, filename=CONFIG_FILE)
+    cached_download(config_file_url, cache_dir=cache_dir)
+    model_file_url = hf_hub_url(repo_id=repo_id, filename=MODEL_FILE)
+    cached_download(model_file_url, cache_dir=cache_dir)
+
+
+def load(
+    model_name: str,
+    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
+    **kwargs
+):
+    """
+    Args:
+        model_name: model unique name or path to pre-downloaded model
+        device: device to put the loaded model
+        kwargs: kwargs for huggingface pretrained model class
+    Return:
+        (torch.nn.Module, A torchvision transform)
+    """
+    if model_name in MODELS.keys():
+        ModelClass = CLIPModel if 'clip' in model_name else CLOOBModel
+    elif os.path.exists(model_name):
+        assert os.path.exists(os.path.join(model_name, CONFIG_FILE))
+        with open(os.path.join(model_name, CONFIG_FILE), "r", encoding="utf-8") as f:
+            j = json.load(f)
+        ModelClass = MODEL_CLASSES[j["model_type"]]
+    else:
+        raise RuntimeError(f"Model {model_name} not found; available models = {available_models()}")
+
+    model = ModelClass.from_pretrained(model_name, **kwargs)
+    model = model.eval().requires_grad_(False).to(device)
+    return model, _transform(model.config.vision_config.image_size)
diff --git a/japanese_clip/clip/__init__.py b/japanese_clip/clip/__init__.py
new file mode 100644
index 0000000..c377d55
--- /dev/null
+++
b/japanese_clip/clip/__init__.py @@ -0,0 +1,16 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .modeling_clip import * +from .configuration_clip import * diff --git a/japanese_clip/clip/configuration_clip.py b/japanese_clip/clip/configuration_clip.py new file mode 100644 index 0000000..3e5f071 --- /dev/null +++ b/japanese_clip/clip/configuration_clip.py @@ -0,0 +1,219 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CLIP model configuration""" +import logging +import copy +import os +from typing import Union + +import numpy as np +from transformers import AutoConfig, PretrainedConfig + + +logger = logging.getLogger(__name__) + + +class CLIPTextConfig(PretrainedConfig): + model_type = "clip_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPVisionConfig(PretrainedConfig): + model_type = "clip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPConfig(PretrainedConfig): + r""" + [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate + CLIP model according to the specified arguments, defining the text model and vision model configs. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config_dict (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPTextConfig`]. + vision_config_dict (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation. + kwargs (*optional*): + Dictionary of keyword arguments. 
+ """ + + model_type = "clip" + is_composition = True + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + logit_scale_init_value=None, + **kwargs + ): + super().__init__(text_config=text_config, vision_config=vision_config, **kwargs) + + if vision_config is None: + raise ValueError("`vision_config` can not be `None`.") + + if text_config is None: + raise ValueError("`text_config` can not be `None`.") + + vision_model_type = vision_config.pop("model_type") + text_model_type = text_config.pop("model_type") + + if vision_model_type == "clip_vision_model": + self.vision_config = CLIPVisionConfig(**vision_config) + else: + self.vision_config = AutoConfig.for_model( + vision_model_type, **vision_config + ) + + if text_model_type == "clip_text_model": + self.text_config = CLIPTextConfig(**text_config) + else: + self.text_config = AutoConfig.for_model( + text_model_type, **text_config + ) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value if logit_scale_init_value is not None else np.log(1 / 0.07) + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model + configuration. + + Returns: + [`CLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/japanese_clip/clip/modeling_clip.py b/japanese_clip/clip/modeling_clip.py new file mode 100644 index 0000000..16a713f --- /dev/null +++ b/japanese_clip/clip/modeling_clip.py @@ -0,0 +1,815 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers import AutoModel +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel, ModelOutput +from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig + + +logger = logging.getLogger(__name__) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +def clip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.T) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class CLIPOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class CLIPVisionEmbeddings(nn.Module): + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class CLIPTextEmbeddings(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = 
nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class CLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if 
attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class CLIPMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class CLIPEncoderLayer(nn.Module): + def __init__(self, config: CLIPConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = CLIPMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class CLIPPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CLIPConfig + base_model_prefix = "clip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLIPTextEmbeddings): + module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, CLIPVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, CLIPAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, CLIPMLP): + factor = self.config.initializer_factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, CLIPModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLIPEncoder): + module.gradient_checkpointing = value + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLIPEncoderLayer`]. 
+ Args: + config: CLIPConfig + """ + + def __init__(self, config: CLIPConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class CLIPTextTransformer(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLIPTextEmbeddings(config) + self.encoder = CLIPEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + bsz, seq_len = input_shape + # CLIP's text model uses causal mask, prepare it here. 
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len).to(hidden_states.device) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(bsz, seq_len, seq_len) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +class CLIPTextModel(CLIPPreTrainedModel): + config_class = CLIPTextConfig + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + self.text_model = CLIPTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPVisionTransformer(nn.Module): + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.encoder = CLIPEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None 
else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class CLIPVisionModel(CLIPPreTrainedModel): + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPVisionConfig): + super().__init__(config) + self.vision_model = CLIPVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPModel(CLIPPreTrainedModel): + config_class = CLIPConfig + + def __init__(self, config: CLIPConfig): + super().__init__(config) + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + if isinstance(text_config, CLIPTextConfig): + text_model = CLIPTextTransformer(text_config) + else: + text_model = AutoModel.from_config(config.text_config, add_pooling_layer=False) + + if isinstance(config.vision_config, CLIPVisionConfig): + vision_model = CLIPVisionModel(config.vision_config) + else: + vision_model = AutoModel.from_config(config.vision_config, add_pooling_layer=False) + + self.text_model = text_model + self.vision_model = vision_model + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value) + + # Initialize weights and apply final processing + self.post_init() + + def encode_text(self, input_ids, **kwargs): + return self.get_text_features(input_ids=input_ids, **kwargs) + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = text_outputs.last_hidden_state[:, 0, :] + text_features = self.text_projection(pooled_output) + + return text_features + + def encode_image(self, pixel_values, **kwargs): + return self.get_image_features(pixel_values=pixel_values, **kwargs) + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = vision_outputs.last_hidden_state[:, 0, :] + image_features = self.visual_projection(pooled_output) + + return image_features + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPOutput]: + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs.last_hidden_state[:, 0, :] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs.last_hidden_state[:, 0, :] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.T + + loss = None + if return_loss: + loss = clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return CLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + diff --git a/japanese_clip/cloob/__init__.py b/japanese_clip/cloob/__init__.py new file mode 100644 index 0000000..5266914 --- /dev/null +++ b/japanese_clip/cloob/__init__.py @@ -0,0 +1,16 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .configuration_cloob import * +from .modeling_cloob import * diff --git a/japanese_clip/cloob/configuration_cloob.py b/japanese_clip/cloob/configuration_cloob.py new file mode 100644 index 0000000..215015c --- /dev/null +++ b/japanese_clip/cloob/configuration_cloob.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" CLOOB model configuration""" +import logging +import copy +import os +from typing import Union + +from transformers import AutoConfig, PretrainedConfig + + +logger = logging.getLogger(__name__) + + +class CLOOBTextConfig(PretrainedConfig): + model_type = "cloob_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLOOBVisionConfig(PretrainedConfig): + model_type = "cloob_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLOOBConfig(PretrainedConfig): + model_type = "cloob" + is_composition = True + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + init_inv_tau=30.0, + scale_hopfield=15.0, + **kwargs + ): + super().__init__(text_config=text_config, vision_config=vision_config, **kwargs) + + if vision_config is None: + raise ValueError("`vision_config` can not be `None`.") + + if text_config is None: + raise ValueError("`text_config` can not be `None`.") + + vision_model_type = vision_config.pop("model_type") + text_model_type = text_config.pop("model_type") + + if vision_model_type == "cloob_vision_model": + self.vision_config = CLOOBVisionConfig(**vision_config) + else: + self.vision_config = AutoConfig.for_model( + vision_model_type, **vision_config + ) + + if text_model_type == "cloob_text_model": + self.text_config = CLOOBTextConfig(**text_config) + else: + self.text_config = AutoConfig.for_model( + text_model_type, **text_config + ) + + self.projection_dim = projection_dim + self.initializer_factor = 1.0 + self.init_inv_tau = init_inv_tau + self.scale_hopfield = scale_hopfield + + + @classmethod + def from_text_vision_configs(cls, text_config: CLOOBTextConfig, vision_config: CLOOBVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model + configuration. + + Returns: + [`CLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output + + diff --git a/japanese_clip/cloob/loss.py b/japanese_clip/cloob/loss.py new file mode 100644 index 0000000..5050309 --- /dev/null +++ b/japanese_clip/cloob/loss.py @@ -0,0 +1,58 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F + + +def cloob_loss(image_features, text_features, inv_tau, scale_hopfield): + """ + Note: this loss has been rescaled from the original CLOOB loss for interpretability, + to convert to the original, divide it by inv_tau / 2. 
+ """ + p_xx, p_yy, p_xy, p_yx = hopfield_retrieval(image_features, text_features, scale_hopfield) + identity = torch.eye(p_xx.shape[1]) > 0.5 + i = identity.to(p_xx.device) + loss_img = infoLOOB_loss(p_xx.T, p_xy.T, i, inv_tau=inv_tau) + loss_txt = infoLOOB_loss(p_yy.T, p_yx.T, i, inv_tau=inv_tau) + return (loss_img + loss_txt) / 2 + + +def infoLOOB_loss(x, y, i, inv_tau): + tau = 1 / inv_tau + k = x @ y.T / tau + positives = -torch.mean(torch.sum(k * i, dim=1)) + + # For logsumexp the zero entries must be equal to a very large negative number + large_neg = -10000.0 + arg_lse = k * torch.logical_not(i) + i * large_neg + negatives = torch.mean(torch.logsumexp(arg_lse, dim=1)) + return positives + negatives + + +def hopfield_retrieval(image_features, text_features, scale_hopfield): + patterns_xx = hopfield(state_patterns=image_features, stored_patterns=image_features, scale_hopfield=scale_hopfield) + patterns_yy = hopfield(state_patterns=text_features, stored_patterns=text_features, scale_hopfield=scale_hopfield) + patterns_xy = hopfield(state_patterns=text_features, stored_patterns=image_features, scale_hopfield=scale_hopfield) + patterns_yx = hopfield(state_patterns=image_features, stored_patterns=text_features, scale_hopfield=scale_hopfield) + + return patterns_xx, patterns_yy, patterns_xy, patterns_yx + + +def hopfield(state_patterns, stored_patterns, scale_hopfield): + retrieved_patterns = stored_patterns.T @ F.softmax(scale_hopfield * stored_patterns @ state_patterns.T, dim=0) + # Row vectors -> dim=1 to normalize the row vectors + retrieved_patterns = retrieved_patterns / retrieved_patterns.norm(dim=0, keepdim=True) + return retrieved_patterns diff --git a/japanese_clip/cloob/modeling_cloob.py b/japanese_clip/cloob/modeling_cloob.py new file mode 100644 index 0000000..563f4c5 --- /dev/null +++ b/japanese_clip/cloob/modeling_cloob.py @@ -0,0 +1,783 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers import AutoModel +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel, ModelOutput +from .configuration_cloob import CLOOBConfig, CLOOBTextConfig, CLOOBVisionConfig +from .loss import cloob_loss +from ..clip.modeling_clip import _expand_mask + +logger = logging.getLogger(__name__) + + +@dataclass +class CLOOBOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + inv_tau: Union[torch.FloatTensor, float] = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class CLOOBVisionEmbeddings(nn.Module): + def __init__(self, config: CLOOBVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class CLOOBTextEmbeddings(nn.Module): + def __init__(self, config: CLOOBTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class CLOOBAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, 
config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class CLOOBMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class CLOOBEncoderLayer(nn.Module): + def __init__(self, config: CLOOBConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLOOBAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = CLOOBMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class CLOOBPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = CLOOBConfig + base_model_prefix = "cloob" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLOOBTextEmbeddings): + module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, CLOOBVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, CLOOBAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, CLOOBMLP): + factor = self.config.initializer_factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, CLOOBModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLOOBEncoder): + module.gradient_checkpointing = value + + +class CLOOBEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLOOBEncoderLayer`]. + Args: + config: CLOOBConfig + """ + + def __init__(self, config: CLOOBConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([CLOOBEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. 
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class CLOOBTextTransformer(nn.Module): + def __init__(self, config: CLOOBTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLOOBTextEmbeddings(config) + self.encoder = CLOOBEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not 
None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + bsz, seq_len = input_shape + # CLOOB's text model uses causal mask, prepare it here. + # https://github.com/openai/CLOOB/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/CLOOB/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len).to(hidden_states.device) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(bsz, seq_len, seq_len) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +class CLOOBTextModel(CLOOBPreTrainedModel): + config_class = CLOOBTextConfig + + def __init__(self, config: CLOOBTextConfig): + super().__init__(config) + self.text_model = CLOOBTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLOOBVisionTransformer(nn.Module): + def __init__(self, config: CLOOBVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLOOBVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.encoder = CLOOBEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + 
output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class CLOOBVisionModel(CLOOBPreTrainedModel): + config_class = CLOOBVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLOOBVisionConfig): + super().__init__(config) + self.vision_model = CLOOBVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLOOBModel(CLOOBPreTrainedModel): + config_class = CLOOBConfig + + def __init__(self, config: CLOOBConfig): + super().__init__(config) + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + if isinstance(text_config, CLOOBTextConfig): + text_model = CLOOBTextTransformer(text_config) + else: + text_model = AutoModel.from_config(config.text_config, add_pooling_layer=False) + + if isinstance(config.vision_config, CLOOBVisionConfig): + vision_model = CLOOBVisionModel(config.vision_config) + else: + vision_model = AutoModel.from_config(config.vision_config, add_pooling_layer=False) + + self.text_model = text_model + self.vision_model = vision_model + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + + self.inv_tau = config.init_inv_tau + self.scale_hopfield = config.scale_hopfield + + # Initialize weights and apply final processing + self.post_init() + + def encode_text(self, input_ids, **kwargs): + return self.get_text_features(input_ids=input_ids, **kwargs) + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + 
attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + # Use CLOOB model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = text_outputs.last_hidden_state[:, 0, :] + text_features = self.text_projection(pooled_output) + + return text_features + + def encode_image(self, pixel_values, **kwargs): + return self.get_image_features(pixel_values=pixel_values, **kwargs) + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + # Use CLOOB model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = vision_outputs.last_hidden_state[:, 0, :] + image_features = self.visual_projection(pooled_output) + + return image_features + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLOOBOutput]: + # Use CLOOB model's config for some fields (if specified) instead of those of vision & text components. 
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        image_embeds = vision_outputs.last_hidden_state[:, 0, :]
+        image_embeds = self.visual_projection(image_embeds)
+
+        text_embeds = text_outputs.last_hidden_state[:, 0, :]
+        text_embeds = self.text_projection(text_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
+
+        loss = None
+        if return_loss:
+            loss = cloob_loss(image_embeds, text_embeds, self.inv_tau, self.scale_hopfield)
+
+        if not return_dict:
+            output = (text_embeds, image_embeds, self.inv_tau, text_outputs, vision_outputs)
+            return ((loss,) + output) if loss is not None else output
+
+        return CLOOBOutput(
+            loss=loss,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            inv_tau=self.inv_tau,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
diff --git a/japanese_clip/tokenizer.py b/japanese_clip/tokenizer.py
new file mode 100644
index 0000000..67209d6
--- /dev/null
+++ b/japanese_clip/tokenizer.py
@@ -0,0 +1,63 @@
+# coding=utf-8
+# Copyright 2022 rinna Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union, List
+import torch
+from transformers import T5Tokenizer
+
+
+def load_tokenizer():
+    """
+    https://huggingface.co/rinna/japanese-roberta-base
+    """
+    tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-roberta-base")
+    tokenizer.do_lower_case = True  # due to a bug in tokenizer config loading
+    return tokenizer
+
+
+def tokenize(
+    texts: Union[str, List[str]],
+    tokenizer: T5Tokenizer = None,
+    max_seq_len: int = 77,
+    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
+):
+    """
+    This mirrors the tokenize function in the original CLIP code:
+ https://github.com/openai/CLIP/blob/main/clip/clip.py#L195 + """ + if isinstance(texts, str): + texts = [texts] + if tokenizer is None: + tokenizer = load_tokenizer() + inputs = tokenizer( + texts, + max_length=max_seq_len-1, + padding="max_length", + truncation=True, + add_special_tokens=False, + ) + # add cls token at first place + input_ids = [[tokenizer.cls_token_id] + ids for ids in inputs['input_ids']] + attention_mask = [[1] + am for am in inputs['attention_mask']] + position_ids = [list(range(0, len(input_ids[0])))] * len(texts) + + input_ids = torch.tensor(input_ids, dtype=torch.long) + attention_mask = torch.tensor(attention_mask, dtype=torch.long) + position_ids = torch.tensor(position_ids, dtype=torch.long) + return { + "input_ids": input_ids.to(device), + "attention_mask": attention_mask.to(device), + "position_ids": position_ids.to(device), + } diff --git a/japanese_clip/utils/__init__.py b/japanese_clip/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/japanese_clip/utils/callbacks.py b/japanese_clip/utils/callbacks.py new file mode 100644 index 0000000..06ddcf5 --- /dev/null +++ b/japanese_clip/utils/callbacks.py @@ -0,0 +1,96 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
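+# Helpers for ImageNet zero-shot evaluation: class embeddings are built by averaging
+# prompt-template text features per class, and image features are scored against them
+# to report top-k accuracy.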
+ +from tqdm.auto import tqdm +import numpy as np +import torch + + +def accuracy(output, target, topk=(1,)): + output = torch.from_numpy(np.asarray(output)) + target = torch.from_numpy(np.asarray(target)) + pred = output.topk(max(topk), dim=1, largest=True, sorted=True)[1].t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + return [ + float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) + for k in topk + ] + + +class ImagenetClassificationCallback: + def __init__( + self, + imagenet_classes, + imagenet_templates, + imagenet_dataloader, + ): + self.imagenet_classes = imagenet_classes + self.imagenet_templates = imagenet_templates + self.imagenet_dataloader = imagenet_dataloader + + def tokenize(self, tokenizer, examples, device): + encoding_inputs = tokenizer(examples, max_length=76, padding="max_length", truncation=True, add_special_tokens=False) + # add cls token at first place + input_ids = [[tokenizer.cls_token_id] + ids for ids in encoding_inputs['input_ids']] + attention_mask = [[1] + am for am in encoding_inputs['attention_mask']] + position_ids = [list(range(0, len(input_ids[0])))] * len(examples) + + input_ids = torch.tensor(input_ids, dtype=torch.long, device=device) + attention_mask = torch.tensor(attention_mask, dtype=torch.long, device=device) + position_ids = torch.tensor(position_ids, dtype=torch.long, device=device) + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + } + + def zeroshot_classifier(self, model, tokenizer, classnames, templates): + zeroshot_weights = [] + for classname in tqdm(classnames): + texts = [template.format(classname) for template in templates] + class_embeddings = model.get_text_features(**self.tokenize(tokenizer, texts, model.device)).detach().cpu().numpy() + class_embeddings = class_embeddings / np.linalg.norm( + class_embeddings, axis=-1, keepdims=True + ) + class_embedding = np.mean(class_embeddings, axis=0) + class_embedding /= np.linalg.norm(class_embedding, axis=-1) + zeroshot_weights.append(class_embedding) + zeroshot_weights = np.stack(zeroshot_weights, axis=1) + return zeroshot_weights + + def zeroshot(self, model, tokenizer) -> dict: + print("Imagenet Zeroshot Classification...") + zeroshot_weights = self.zeroshot_classifier(model, tokenizer, self.imagenet_classes, self.imagenet_templates) + top_ns = [1, 5, 10, 100] + acc_counters = [0.0 for _ in top_ns] + n = 0.0 + + for i, (images, target) in enumerate(tqdm(self.imagenet_dataloader)): + target = target.numpy() + # predict + image_features = model.get_image_features(images.to(model.device)).detach().cpu().numpy() + image_features = image_features / np.linalg.norm(image_features, axis=-1, keepdims=True) + logits = 100.0 * image_features @ zeroshot_weights + + # measure accuracy + accs = accuracy(logits, target, topk=top_ns) + for j in range(len(top_ns)): + acc_counters[j] += accs[j] + n += images.shape[0] + + tops = {f"imagenet/top{top_ns[i]}": acc_counters[i] / n * 100 for i in range(len(top_ns))} + + return tops + diff --git a/japanese_clip/utils/imagenet_zeroshot_data.py b/japanese_clip/utils/imagenet_zeroshot_data.py new file mode 100644 index 0000000..5533a1a --- /dev/null +++ b/japanese_clip/utils/imagenet_zeroshot_data.py @@ -0,0 +1,1043 @@ +imagenet_classnames = [{'en': 'tench', 'ja': 'テンチ'}, + {'en': 'goldfish', 'ja': '金魚'}, + {'en': 'great white shark', 'ja': 'ホホジロザメ'}, + {'en': 'tiger shark', 'ja': 'イタチザメ'}, + {'en': 'hammerhead shark', 'ja': 'ハンマーヘッド'}, + {'en': 'electric ray', 'ja': 'シビレエイ'}, 
+ {'en': 'stingray', 'ja': 'アカエイ'}, + {'en': 'rooster', 'ja': 'コック'}, + {'en': 'hen', 'ja': 'めんどり'}, + {'en': 'ostrich', 'ja': 'ダチョウ'}, + {'en': 'brambling', 'ja': 'アトリ'}, + {'en': 'goldfinch', 'ja': 'ゴシキヒワ'}, + {'en': 'house finch', 'ja': 'ハウスフィンチ'}, + {'en': 'junco', 'ja': 'ユキヒメドリ'}, + {'en': 'indigo bunting', 'ja': 'インディゴホオジロ'}, + {'en': 'American robin', 'ja': 'ロビン'}, + {'en': 'bulbul', 'ja': 'ブルブル'}, + {'en': 'jay', 'ja': 'カケス'}, + {'en': 'magpie', 'ja': 'カササギ'}, + {'en': 'chickadee', 'ja': '四十雀'}, + {'en': 'American dipper', 'ja': '水クロウタドリ'}, + {'en': 'kite (bird of prey)', 'ja': '凧'}, + {'en': 'bald eagle', 'ja': '白頭ワシ'}, + {'en': 'vulture', 'ja': 'ハゲワシ'}, + {'en': 'great grey owl', 'ja': 'カラフトフクロウ'}, + {'en': 'fire salamander', 'ja': '欧州ファイアサラマンダー'}, + {'en': 'smooth newt', 'ja': '共通イモリ'}, + {'en': 'newt', 'ja': 'イモリ'}, + {'en': 'spotted salamander', 'ja': 'サンショウウオを発見'}, + {'en': 'axolotl', 'ja': 'アホロートル'}, + {'en': 'American bullfrog', 'ja': 'ウシガエル'}, + {'en': 'tree frog', 'ja': 'アマガエル'}, + {'en': 'tailed frog', 'ja': 'つかれたカエル'}, + {'en': 'loggerhead sea turtle', 'ja': 'とんちき'}, + {'en': 'leatherback sea turtle', 'ja': 'オサガメ'}, + {'en': 'mud turtle', 'ja': '鼈'}, + {'en': 'terrapin', 'ja': 'テラピン'}, + {'en': 'box turtle', 'ja': 'ハコガメ'}, + {'en': 'banded gecko', 'ja': '縞模様のヤモリ'}, + {'en': 'green iguana', 'ja': '共通イグアナ'}, + {'en': 'Carolina anole', 'ja': 'アメリカンカメレオン'}, + {'en': 'desert grassland whiptail lizard', 'ja': 'ウィッペイル'}, + {'en': 'agama', 'ja': 'アガマトカゲ'}, + {'en': 'frilled-necked lizard', 'ja': 'フリルトカゲ'}, + {'en': 'alligator lizard', 'ja': 'アリゲータートカゲ'}, + {'en': 'Gila monster', 'ja': 'アメリカドクトカゲ'}, + {'en': 'European green lizard', 'ja': '緑のトカゲ'}, + {'en': 'chameleon', 'ja': 'アフリカのカメレオン'}, + {'en': 'Komodo dragon', 'ja': 'コモドドラゴン'}, + {'en': 'Nile crocodile', 'ja': 'アフリカのワニ'}, + {'en': 'American alligator', 'ja': 'アメリカワニ'}, + {'en': 'triceratops', 'ja': 'トリケラトプス'}, + {'en': 'worm snake', 'ja': '雷のヘビ'}, + {'en': 'ring-necked snake', 'ja': 'リングネックスネーク'}, + {'en': 'eastern hog-nosed snake', 'ja': 'ホーノースヘビ'}, + {'en': 'smooth green snake', 'ja': '緑のヘビ'}, + {'en': 'kingsnake', 'ja': 'キングスネーク'}, + {'en': 'garter snake', 'ja': 'ガータースネーク'}, + {'en': 'water snake', 'ja': '水蛇'}, + {'en': 'vine snake', 'ja': 'つるヘビ'}, + {'en': 'night snake', 'ja': '夜のヘビ'}, + {'en': 'boa constrictor', 'ja': 'ボア・コンストリクター'}, + {'en': 'African rock python', 'ja': 'ロックパイソン'}, + {'en': 'Indian cobra', 'ja': 'インドコブラ'}, + {'en': 'green mamba', 'ja': 'グリーンマンバ'}, + {'en': 'sea snake', 'ja': 'ウミヘビ'}, + {'en': 'Saharan horned viper', 'ja': 'ツノクサリヘビ'}, + {'en': 'eastern diamondback rattlesnake', 'ja': 'ダイヤ'}, + {'en': 'sidewinder rattlesnake', 'ja': 'サイドワインダー'}, + {'en': 'trilobite', 'ja': '三葉虫'}, + {'en': 'harvestman', 'ja': '刈り入れ作業者'}, + {'en': 'scorpion', 'ja': 'サソリ'}, + {'en': 'yellow garden spider', 'ja': '黒と金の庭クモ'}, + {'en': 'barn spider', 'ja': '納屋クモ'}, + {'en': 'European garden spider', 'ja': '庭クモ'}, + {'en': 'southern black widow', 'ja': 'クロゴケグモ'}, + {'en': 'tarantula', 'ja': 'タランチュラ'}, + {'en': 'wolf spider', 'ja': 'オオカミのクモ'}, + {'en': 'tick', 'ja': 'ダニ'}, + {'en': 'centipede', 'ja': '百足'}, + {'en': 'black grouse', 'ja': 'クロライチョウ'}, + {'en': 'ptarmigan', 'ja': '雷鳥'}, + {'en': 'ruffed grouse', 'ja': 'ひだえりの付いたライチョウ'}, + {'en': 'prairie grouse', 'ja': '草原チキン'}, + {'en': 'peafowl', 'ja': '孔雀'}, + {'en': 'quail', 'ja': 'ウズラ'}, + {'en': 'partridge', 'ja': 'ヤマウズラ'}, + {'en': 'african grey parrot', 'ja': 'アフリカの灰色'}, + {'en': 'macaw', 'ja': 'コンゴウインコ'}, + {'en': 'sulphur-crested cockatoo', 'ja': '硫黄トキオウム'}, + {'en': 
'lorikeet', 'ja': 'インコ'}, + {'en': 'coucal', 'ja': 'バンケン'}, + {'en': 'bee eater', 'ja': '蜂食べる人'}, + {'en': 'hornbill', 'ja': 'サイチョウ'}, + {'en': 'hummingbird', 'ja': 'ハチドリ'}, + {'en': 'jacamar', 'ja': '錐嘴'}, + {'en': 'toucan', 'ja': 'オオハシ'}, + {'en': 'duck', 'ja': 'ドレイク'}, + {'en': 'red-breasted merganser', 'ja': '赤ブレストアイサ属のガモ'}, + {'en': 'goose', 'ja': 'ガチョウ'}, + {'en': 'black swan', 'ja': '黒い白鳥'}, + {'en': 'tusker', 'ja': 'タスカービール'}, + {'en': 'echidna', 'ja': 'ハリモグラ'}, + {'en': 'platypus', 'ja': 'カモノハシ'}, + {'en': 'wallaby', 'ja': 'ワラビー'}, + {'en': 'koala', 'ja': 'コアラ'}, + {'en': 'wombat', 'ja': 'ウォンバット'}, + {'en': 'jellyfish', 'ja': 'クラゲ'}, + {'en': 'sea anemone', 'ja': 'イソギンチャク'}, + {'en': 'brain coral', 'ja': '脳サンゴ'}, + {'en': 'flatworm', 'ja': '扁形動物'}, + {'en': 'nematode', 'ja': '線虫'}, + {'en': 'conch', 'ja': '巻き貝'}, + {'en': 'snail', 'ja': 'カタツムリ'}, + {'en': 'slug', 'ja': 'ナメクジ'}, + {'en': 'sea slug', 'ja': 'ウミウシ'}, + {'en': 'chiton', 'ja': 'キトン'}, + {'en': 'chambered nautilus', 'ja': 'オウムガイ'}, + {'en': 'Dungeness crab', 'ja': 'アメリカイチョウガニ'}, + {'en': 'rock crab', 'ja': '岩カニ'}, + {'en': 'fiddler crab', 'ja': 'シオマネキ'}, + {'en': 'red king crab', 'ja': 'タラバガニ'}, + {'en': 'American lobster', 'ja': 'アメリカンロブスター'}, + {'en': 'spiny lobster', 'ja': '伊勢エビ'}, + {'en': 'crayfish', 'ja': 'ザリガニ'}, + {'en': 'hermit crab', 'ja': 'ヤドカリ'}, + {'en': 'isopod', 'ja': '等脚類'}, + {'en': 'white stork', 'ja': 'コウノトリ'}, + {'en': 'black stork', 'ja': 'ナベコウ'}, + {'en': 'spoonbill', 'ja': 'ヘラサギ'}, + {'en': 'flamingo', 'ja': 'フラミンゴ'}, + {'en': 'little blue heron', 'ja': '小さな青いサギ'}, + {'en': 'great egret', 'ja': 'アメリカン白鷺'}, + {'en': 'bittern bird', 'ja': 'にがり'}, + {'en': 'crane bird', 'ja': 'クレーン'}, + {'en': 'limpkin', 'ja': 'ツルモドキ科の鳥'}, + {'en': 'common gallinule', 'ja': 'ヨーロピアン水鳥'}, + {'en': 'American coot', 'ja': 'アメリカオオバン'}, + {'en': 'bustard', 'ja': 'ノガン'}, + {'en': 'ruddy turnstone', 'ja': 'キョウジョシギ'}, + {'en': 'dunlin', 'ja': '赤担保シギ'}, + {'en': 'common redshank', 'ja': 'アカアシシギ'}, + {'en': 'dowitcher', 'ja': 'オオハシシギ'}, + {'en': 'oystercatcher', 'ja': 'ミヤコドリ'}, + {'en': 'pelican', 'ja': 'ペリカン'}, + {'en': 'king penguin', 'ja': 'キングペンギン'}, + {'en': 'albatross', 'ja': 'アルバトロス'}, + {'en': 'grey whale', 'ja': 'コククジラ'}, + {'en': 'killer whale', 'ja': 'シャチ'}, + {'en': 'dugong', 'ja': 'ジュゴン'}, + {'en': 'sea lion', 'ja': 'アシカ'}, + {'en': 'Chihuahua', 'ja': 'チワワ'}, + {'en': 'Japanese Chin', 'ja': '狆'}, + {'en': 'Maltese', 'ja': 'マルチーズ犬'}, + {'en': 'Pekingese', 'ja': '狆'}, + {'en': 'Shih Tzu', 'ja': 'シーズー、シーズー'}, + {'en': 'King Charles Spaniel', 'ja': 'ブレナムスパニエル'}, + {'en': 'Papillon', 'ja': 'パピヨン'}, + {'en': 'toy terrier', 'ja': 'トイテリア'}, + {'en': 'Rhodesian Ridgeback', 'ja': 'ローデシアン・リッジバック'}, + {'en': 'Afghan Hound', 'ja': 'アフガンハウンド'}, + {'en': 'Basset Hound', 'ja': 'バセット犬'}, + {'en': 'Beagle', 'ja': 'ビーグル'}, + {'en': 'Bloodhound', 'ja': 'ブラッドハウンド'}, + {'en': 'Bluetick Coonhound', 'ja': 'ブルーティック'}, + {'en': 'Black and Tan Coonhound', 'ja': '黒と黄褐色の猟犬'}, + {'en': 'Treeing Walker Coonhound', 'ja': 'ウォーカーハウンド'}, + {'en': 'English foxhound', 'ja': 'イングリッシュフォックスハウンド'}, + {'en': 'Redbone Coonhound', 'ja': 'レッドボーン'}, + {'en': 'borzoi', 'ja': 'ボルゾイ'}, + {'en': 'Irish Wolfhound', 'ja': 'アイリッシュ・ウルフハウンド'}, + {'en': 'Italian Greyhound', 'ja': 'イタリアングレーハウンド'}, + {'en': 'Whippet', 'ja': 'ウィペット'}, + {'en': 'Ibizan Hound', 'ja': 'イビサハウンド'}, + {'en': 'Norwegian Elkhound', 'ja': 'ノルウェーエルクハウンド'}, + {'en': 'Otterhound', 'ja': 'オッターハウンド'}, + {'en': 'Saluki', 'ja': 'サルーキ'}, + {'en': 'Scottish Deerhound', 'ja': 'スコティッシュ・ディアハウンド'}, + {'en': 
'Weimaraner', 'ja': 'ワイマラナー'}, + {'en': 'Staffordshire Bull Terrier', 'ja': 'スタフォードシャーブルテリア'}, + {'en': 'American Staffordshire Terrier', 'ja': 'アメリカン・スタッフォードシャー・テリア'}, + {'en': 'Bedlington Terrier', 'ja': 'ベドリントンテリア'}, + {'en': 'Border Terrier', 'ja': 'ボーダーテリア'}, + {'en': 'Kerry Blue Terrier', 'ja': 'ケリーブルーテリア'}, + {'en': 'Irish Terrier', 'ja': 'アイリッシュテリア'}, + {'en': 'Norfolk Terrier', 'ja': 'ノーフォークテリア'}, + {'en': 'Norwich Terrier', 'ja': 'ノーリッチ・テリア'}, + {'en': 'Yorkshire Terrier', 'ja': 'ヨークシャーテリア'}, + {'en': 'Wire Fox Terrier', 'ja': 'ワイヤーヘアー・フォックステリア'}, + {'en': 'Lakeland Terrier', 'ja': 'レークランドテリア'}, + {'en': 'Sealyham Terrier', 'ja': 'シーリーハムテリア'}, + {'en': 'Airedale Terrier', 'ja': 'エアデール'}, + {'en': 'Cairn Terrier', 'ja': 'ケルン'}, + {'en': 'Australian Terrier', 'ja': 'オーストラリアテリア'}, + {'en': 'Dandie Dinmont Terrier', 'ja': 'ダンディディンモントテリア'}, + {'en': 'Boston Terrier', 'ja': 'ボストンブル'}, + {'en': 'Miniature Schnauzer', 'ja': 'ミニチュアシュナウザー'}, + {'en': 'Giant Schnauzer', 'ja': 'ジャイアントシュナウザー'}, + {'en': 'Standard Schnauzer', 'ja': 'スタンダードシュナウザー'}, + {'en': 'Scottish Terrier', 'ja': 'スコッチテリア'}, + {'en': 'Tibetan Terrier', 'ja': 'チベタンテリア'}, + {'en': 'Australian Silky Terrier', 'ja': 'シルキーテリア'}, + {'en': 'Soft-coated Wheaten Terrier', 'ja': 'ソフトコーテッド・ウィートン・テリア'}, + {'en': 'West Highland White Terrier', 'ja': 'ウェストハイランドホワイトテリア'}, + {'en': 'Lhasa Apso', 'ja': 'ラサ'}, + {'en': 'Flat-Coated Retriever', 'ja': 'フラットコーテッド・レトリーバー'}, + {'en': 'Curly-coated Retriever', 'ja': 'カーリーコーティングされたレトリーバー'}, + {'en': 'Golden Retriever', 'ja': 'ゴールデンレトリバー'}, + {'en': 'Labrador Retriever', 'ja': 'ラブラドル・レトリーバー犬'}, + {'en': 'Chesapeake Bay Retriever', 'ja': 'チェサピーク湾レトリーバー'}, + {'en': 'German Shorthaired Pointer', 'ja': 'ジャーマン・ショートヘア・ポインタ'}, + {'en': 'Vizsla', 'ja': 'ビズラ'}, + {'en': 'English Setter', 'ja': 'イングリッシュセッター'}, + {'en': 'Irish Setter', 'ja': 'アイリッシュセッター'}, + {'en': 'Gordon Setter', 'ja': 'ゴードンセッター'}, + {'en': 'Brittany dog', 'ja': 'ブリタニースパニエル'}, + {'en': 'Clumber Spaniel', 'ja': 'クランバー'}, + {'en': 'English Springer Spaniel', 'ja': 'イングリッシュスプリンガー'}, + {'en': 'Welsh Springer Spaniel', 'ja': 'ウェルシュスプリンガースパニエル'}, + {'en': 'Cocker Spaniel', 'ja': 'コッカースパニエル'}, + {'en': 'Sussex Spaniel', 'ja': 'サセックススパニエル'}, + {'en': 'Irish Water Spaniel', 'ja': 'アイルランドのウォータースパニエル'}, + {'en': 'Kuvasz', 'ja': 'クバース犬'}, + {'en': 'Schipperke', 'ja': 'スキッパーキー'}, + {'en': 'Groenendael dog', 'ja': 'ベルジアン・シェパード・ドッグ・グローネンダール'}, + {'en': 'Malinois', 'ja': 'マリノア'}, + {'en': 'Briard', 'ja': 'ブリアール'}, + {'en': 'Australian Kelpie', 'ja': 'ケルピー'}, + {'en': 'Komondor', 'ja': 'コモンドール'}, + {'en': 'Old English Sheepdog', 'ja': 'オールドイングリッシュシープドッグ'}, + {'en': 'Shetland Sheepdog', 'ja': 'シェトランドシープドッグ'}, + {'en': 'collie', 'ja': 'コリー'}, + {'en': 'Border Collie', 'ja': 'ボーダーコリー'}, + {'en': 'Bouvier des Flandres dog', 'ja': 'ブーヴィエ・デ・フランドル'}, + {'en': 'Rottweiler', 'ja': 'ロットワイラー'}, + {'en': 'German Shepherd Dog', 'ja': 'ジャーマンシェパード'}, + {'en': 'Dobermann', 'ja': 'ドーベルマン犬'}, + {'en': 'Miniature Pinscher', 'ja': 'ミニチュアピンシャー'}, + {'en': 'Greater Swiss Mountain Dog', 'ja': 'グレータースイスマウンテンドッグ'}, + {'en': 'Bernese Mountain Dog', 'ja': 'バーネーズマウンテンドッグ'}, + {'en': 'Appenzeller Sennenhund', 'ja': 'アッペンツェル'}, + {'en': 'Entlebucher Sennenhund', 'ja': 'エントレブッシャー'}, + {'en': 'Boxer', 'ja': 'ボクサー'}, + {'en': 'Bullmastiff', 'ja': 'ブルマスチフ'}, + {'en': 'Tibetan Mastiff', 'ja': 'チベットマスチフ'}, + {'en': 'French Bulldog', 'ja': 'フレンチブルドッグ'}, + {'en': 'Great Dane', 'ja': 'グレートデーン'}, + {'en': 'St. 
Bernard', 'ja': 'セントバーナード'}, + {'en': 'husky', 'ja': 'エスキモー犬'}, + {'en': 'Alaskan Malamute', 'ja': 'マラミュート'}, + {'en': 'Siberian Husky', 'ja': 'シベリアンハスキー'}, + {'en': 'Dalmatian', 'ja': 'ダルメシアン'}, + {'en': 'Affenpinscher', 'ja': 'アーフェンピンシャー'}, + {'en': 'Basenji', 'ja': 'バセンジー'}, + {'en': 'pug', 'ja': 'パグ'}, + {'en': 'Leonberger', 'ja': 'レオンバーグ'}, + {'en': 'Newfoundland dog', 'ja': 'ニューファンドランド島'}, + {'en': 'Great Pyrenees dog', 'ja': 'グレートピレニーズ'}, + {'en': 'Samoyed', 'ja': 'サモエド'}, + {'en': 'Pomeranian', 'ja': 'ポメラニアン'}, + {'en': 'Chow Chow', 'ja': 'チャウ'}, + {'en': 'Keeshond', 'ja': 'キースホンド'}, + {'en': 'brussels griffon', 'ja': 'ブラバンソングリフォン'}, + {'en': 'Pembroke Welsh Corgi', 'ja': 'ペンブローク'}, + {'en': 'Cardigan Welsh Corgi', 'ja': 'カーディガン'}, + {'en': 'Toy Poodle', 'ja': 'トイプードル'}, + {'en': 'Miniature Poodle', 'ja': 'ミニチュアプードル'}, + {'en': 'Standard Poodle', 'ja': 'スタンダードプードル'}, + {'en': 'Mexican hairless dog (xoloitzcuintli)', 'ja': 'メキシカン・ヘアーレス'}, + {'en': 'grey wolf', 'ja': 'シンリンオオカミ'}, + {'en': 'Alaskan tundra wolf', 'ja': '白いオオカミ'}, + {'en': 'red wolf or maned wolf', 'ja': 'レッドウルフ'}, + {'en': 'coyote', 'ja': 'コヨーテ'}, + {'en': 'dingo', 'ja': 'ディンゴ'}, + {'en': 'dhole', 'ja': 'ドール'}, + {'en': 'African wild dog', 'ja': 'リカオン'}, + {'en': 'hyena', 'ja': 'ハイエナ'}, + {'en': 'red fox', 'ja': 'アカギツネ'}, + {'en': 'kit fox', 'ja': 'キットキツネ'}, + {'en': 'Arctic fox', 'ja': 'ホッキョクギツネ'}, + {'en': 'grey fox', 'ja': '灰色のキツネ'}, + {'en': 'tabby cat', 'ja': 'タビー'}, + {'en': 'tiger cat', 'ja': '虎猫'}, + {'en': 'Persian cat', 'ja': 'ペルシャ猫'}, + {'en': 'Siamese cat', 'ja': 'シャム猫'}, + {'en': 'Egyptian Mau', 'ja': 'エジプトの猫'}, + {'en': 'cougar', 'ja': 'クーガー'}, + {'en': 'lynx', 'ja': 'オオヤマネコ'}, + {'en': 'leopard', 'ja': 'ヒョウ'}, + {'en': 'snow leopard', 'ja': 'ユキヒョウ'}, + {'en': 'jaguar', 'ja': 'ジャガー'}, + {'en': 'lion', 'ja': 'ライオン'}, + {'en': 'tiger', 'ja': '虎'}, + {'en': 'cheetah', 'ja': 'チーター'}, + {'en': 'brown bear', 'ja': 'ヒグマ'}, + {'en': 'American black bear', 'ja': 'アメリカクロクマ'}, + {'en': 'polar bear', 'ja': '氷のクマ'}, + {'en': 'sloth bear', 'ja': 'ナマケグマ'}, + {'en': 'mongoose', 'ja': 'マングース'}, + {'en': 'meerkat', 'ja': 'ミーアキャット'}, + {'en': 'tiger beetle', 'ja': 'ハンミョウ'}, + {'en': 'ladybug', 'ja': 'てんとう虫'}, + {'en': 'ground beetle', 'ja': 'グランドビートル'}, + {'en': 'longhorn beetle', 'ja': 'カミキリムシ'}, + {'en': 'leaf beetle', 'ja': 'ハムシ'}, + {'en': 'dung beetle', 'ja': 'フンコロガシ'}, + {'en': 'rhinoceros beetle', 'ja': 'サイハムシ'}, + {'en': 'weevil', 'ja': 'ゾウムシ'}, + {'en': 'fly', 'ja': 'ハエ'}, + {'en': 'bee', 'ja': '蜂'}, + {'en': 'ant', 'ja': '蟻'}, + {'en': 'grasshopper', 'ja': 'バッタ'}, + {'en': 'cricket insect', 'ja': 'クリケット'}, + {'en': 'stick insect', 'ja': '杖'}, + {'en': 'cockroach', 'ja': 'ゴキブリ'}, + {'en': 'praying mantis', 'ja': 'カマキリ'}, + {'en': 'cicada', 'ja': '蝉'}, + {'en': 'leafhopper', 'ja': 'ヨコバイ'}, + {'en': 'lacewing', 'ja': 'クサカゲロウ'}, + {'en': 'dragonfly', 'ja': 'トンボ'}, + {'en': 'damselfly', 'ja': 'イトトンボ'}, + {'en': 'red admiral butterfly', 'ja': '提督'}, + {'en': 'ringlet butterfly', 'ja': 'リングレット'}, + {'en': 'monarch butterfly', 'ja': '君主'}, + {'en': 'small white butterfly', 'ja': 'モンシロチョウ'}, + {'en': 'sulphur butterfly', 'ja': '硫黄蝶'}, + {'en': 'gossamer-winged butterfly', 'ja': 'シジミチョウ'}, + {'en': 'starfish', 'ja': 'ヒトデ'}, + {'en': 'sea urchin', 'ja': 'うに'}, + {'en': 'sea cucumber', 'ja': 'ナマコ'}, + {'en': 'cottontail rabbit', 'ja': '木のウサギ'}, + {'en': 'hare', 'ja': '野ウサギ'}, + {'en': 'Angora rabbit', 'ja': 'アンゴラ'}, + {'en': 'hamster', 'ja': 'ハムスター'}, + {'en': 'porcupine', 'ja': 'ヤマアラシ'}, + {'en': 'fox squirrel', 'ja': 
'キツネリス'}, + {'en': 'marmot', 'ja': 'マーモット'}, + {'en': 'beaver', 'ja': 'ビーバー'}, + {'en': 'guinea pig', 'ja': 'モルモット'}, + {'en': 'common sorrel horse', 'ja': '栗色'}, + {'en': 'zebra', 'ja': 'シマウマ'}, + {'en': 'pig', 'ja': '豚'}, + {'en': 'wild boar', 'ja': 'イノシシ'}, + {'en': 'warthog', 'ja': 'イボイノシシ'}, + {'en': 'hippopotamus', 'ja': 'カバ'}, + {'en': 'ox', 'ja': '雄牛'}, + {'en': 'water buffalo', 'ja': '水牛'}, + {'en': 'bison', 'ja': 'バイソン'}, + {'en': 'ram (adult male sheep)', 'ja': 'ラム'}, + {'en': 'bighorn sheep', 'ja': 'ビッグホーン'}, + {'en': 'Alpine ibex', 'ja': 'アイベックス'}, + {'en': 'hartebeest', 'ja': 'ハーテビースト'}, + {'en': 'impala (antelope)', 'ja': 'インパラ'}, + {'en': 'gazelle', 'ja': 'ガゼル'}, + {'en': 'arabian camel', 'ja': 'アラビアラクダ'}, + {'en': 'llama', 'ja': 'ラマ'}, + {'en': 'weasel', 'ja': 'イタチ'}, + {'en': 'mink', 'ja': 'ミンク'}, + {'en': 'European polecat', 'ja': 'ケナガイタチ'}, + {'en': 'black-footed ferret', 'ja': 'クロアシイタチ'}, + {'en': 'otter', 'ja': 'カワウソ'}, + {'en': 'skunk', 'ja': 'スカンク'}, + {'en': 'badger', 'ja': '狸'}, + {'en': 'armadillo', 'ja': 'アルマジロ'}, + {'en': 'three-toed sloth', 'ja': 'ミユビナマケモノ'}, + {'en': 'orangutan', 'ja': 'オランウータン'}, + {'en': 'gorilla', 'ja': 'ゴリラ'}, + {'en': 'chimpanzee', 'ja': 'チンパンジー'}, + {'en': 'gibbon', 'ja': 'テナガザル'}, + {'en': 'siamang', 'ja': 'フクロテナガザル'}, + {'en': 'guenon', 'ja': 'オナガザル'}, + {'en': 'patas monkey', 'ja': 'パタス'}, + {'en': 'baboon', 'ja': 'ヒヒ'}, + {'en': 'macaque', 'ja': 'マカク'}, + {'en': 'langur', 'ja': 'ヤセザル'}, + {'en': 'black-and-white colobus', 'ja': 'コロブス属'}, + {'en': 'proboscis monkey', 'ja': 'テングザル'}, + {'en': 'marmoset', 'ja': 'マーモセット'}, + {'en': 'white-headed capuchin', 'ja': 'オマキザル'}, + {'en': 'howler monkey', 'ja': 'ホエザル'}, + {'en': 'titi monkey', 'ja': 'ティティ'}, + {'en': "Geoffroy's spider monkey", 'ja': 'クモザル'}, + {'en': 'common squirrel monkey', 'ja': 'リスザル'}, + {'en': 'ring-tailed lemur', 'ja': 'マダガスカル猫'}, + {'en': 'indri', 'ja': 'インドリ'}, + {'en': 'Asian elephant', 'ja': 'インドゾウ'}, + {'en': 'African bush elephant', 'ja': 'アフリカゾウ'}, + {'en': 'red panda', 'ja': 'レッサーパンダ'}, + {'en': 'giant panda', 'ja': 'ジャイアントパンダ'}, + {'en': 'snoek fish', 'ja': 'バラクータ'}, + {'en': 'eel', 'ja': 'ウナギ'}, + {'en': 'silver salmon', 'ja': 'ギンザケ'}, + {'en': 'rock beauty fish', 'ja': '岩の美しさ'}, + {'en': 'clownfish', 'ja': 'クマノミ'}, + {'en': 'sturgeon', 'ja': 'チョウザメ'}, + {'en': 'gar fish', 'ja': 'ガー'}, + {'en': 'lionfish', 'ja': 'ミノカサゴ'}, + {'en': 'pufferfish', 'ja': 'フグ'}, + {'en': 'abacus', 'ja': 'そろばん'}, + {'en': 'abaya', 'ja': 'アバヤ'}, + {'en': 'academic gown', 'ja': 'アカデミックガウン'}, + {'en': 'accordion', 'ja': 'アコーディオン'}, + {'en': 'acoustic guitar', 'ja': 'アコースティックギター'}, + {'en': 'aircraft carrier', 'ja': '空母'}, + {'en': 'airliner', 'ja': '旅客機'}, + {'en': 'airship', 'ja': '飛行船'}, + {'en': 'altar', 'ja': '祭壇'}, + {'en': 'ambulance', 'ja': '救急車'}, + {'en': 'amphibious vehicle', 'ja': '両生類'}, + {'en': 'analog clock', 'ja': 'アナログ時計'}, + {'en': 'apiary', 'ja': '養蜂場'}, + {'en': 'apron', 'ja': 'エプロン'}, + {'en': 'trash can', 'ja': 'ごみ入れ'}, + {'en': 'assault rifle', 'ja': 'アサルトライフル'}, + {'en': 'backpack', 'ja': 'バックパック'}, + {'en': 'bakery', 'ja': 'ベーカリー'}, + {'en': 'balance beam', 'ja': '平均台'}, + {'en': 'balloon', 'ja': 'バルーン'}, + {'en': 'ballpoint pen', 'ja': 'ボールペン'}, + {'en': 'Band-Aid', 'ja': 'バンドエイド'}, + {'en': 'banjo', 'ja': 'バンジョー'}, + {'en': 'baluster / handrail', 'ja': 'バニスター'}, + {'en': 'barbell', 'ja': 'バーベル'}, + {'en': 'barber chair', 'ja': '理髪店の椅子'}, + {'en': 'barbershop', 'ja': '理髪店'}, + {'en': 'barn', 'ja': '納屋'}, + {'en': 'barometer', 'ja': 'バロメーター'}, + {'en': 
'barrel', 'ja': 'バレル'}, + {'en': 'wheelbarrow', 'ja': 'バロー'}, + {'en': 'baseball', 'ja': '野球'}, + {'en': 'basketball', 'ja': 'バスケットボール'}, + {'en': 'bassinet', 'ja': 'バシネット'}, + {'en': 'bassoon', 'ja': 'ファゴット'}, + {'en': 'swimming cap', 'ja': '水泳帽'}, + {'en': 'bath towel', 'ja': 'バスタオル'}, + {'en': 'bathtub', 'ja': 'バスタブ'}, + {'en': 'station wagon', 'ja': 'ビーチワゴン'}, + {'en': 'lighthouse', 'ja': 'ビーコン'}, + {'en': 'beaker', 'ja': 'ビーカー'}, + {'en': 'military hat (bearskin or shako)', 'ja': 'ベアスキン'}, + {'en': 'beer bottle', 'ja': 'ビール瓶'}, + {'en': 'beer glass', 'ja': 'ビールグラス'}, + {'en': 'bell tower', 'ja': 'ベルコート'}, + {'en': 'baby bib', 'ja': 'ビブ'}, + {'en': 'tandem bicycle', 'ja': '自転車'}, + {'en': 'bikini', 'ja': 'ビキニ'}, + {'en': 'ring binder', 'ja': 'バインダー'}, + {'en': 'binoculars', 'ja': '双眼鏡'}, + {'en': 'birdhouse', 'ja': '巣箱'}, + {'en': 'boathouse', 'ja': 'ボートハウス'}, + {'en': 'bobsleigh', 'ja': 'ボブスレー'}, + {'en': 'bolo tie', 'ja': 'ループタイ'}, + {'en': 'poke bonnet', 'ja': 'ボンネット'}, + {'en': 'bookcase', 'ja': '本棚'}, + {'en': 'bookstore', 'ja': '書店'}, + {'en': 'bottle cap', 'ja': '瓶のキャップ'}, + {'en': 'hunting bow', 'ja': '弓'}, + {'en': 'bow tie', 'ja': 'ちょうネクタイ'}, + {'en': 'brass memorial plaque', 'ja': '真鍮'}, + {'en': 'bra', 'ja': 'ブラジャー'}, + {'en': 'breakwater', 'ja': '防波堤'}, + {'en': 'breastplate', 'ja': '胸当て'}, + {'en': 'broom', 'ja': 'ほうき'}, + {'en': 'bucket', 'ja': 'バケツ'}, + {'en': 'buckle', 'ja': 'バックル'}, + {'en': 'bulletproof vest', 'ja': '防弾チョッキ'}, + {'en': 'high-speed train', 'ja': '新幹線'}, + {'en': 'butcher shop', 'ja': '精肉店'}, + {'en': 'taxicab', 'ja': 'タクシー'}, + {'en': 'cauldron', 'ja': '大釜'}, + {'en': 'candle', 'ja': 'キャンドル'}, + {'en': 'cannon', 'ja': '大砲'}, + {'en': 'canoe', 'ja': 'カヌー'}, + {'en': 'can opener', 'ja': '缶切り'}, + {'en': 'cardigan', 'ja': 'カーディガン'}, + {'en': 'car mirror', 'ja': '車のミラー'}, + {'en': 'carousel', 'ja': '回転木馬'}, + {'en': 'tool kit', 'ja': '大工のキット'}, + {'en': 'cardboard box / carton', 'ja': 'カートン'}, + {'en': 'car wheel', 'ja': '車のホイール'}, + {'en': 'automated teller machine', 'ja': '現金自動預け払い機'}, + {'en': 'cassette', 'ja': 'カセット'}, + {'en': 'cassette player', 'ja': 'カセット・プレーヤー'}, + {'en': 'castle', 'ja': '城'}, + {'en': 'catamaran', 'ja': 'カタマラン'}, + {'en': 'CD player', 'ja': 'CDプレーヤー'}, + {'en': 'cello', 'ja': 'チェロ'}, + {'en': 'mobile phone', 'ja': 'スマートフォン'}, + {'en': 'chain', 'ja': '鎖'}, + {'en': 'chain-link fence', 'ja': 'チェーンリンクフェンス'}, + {'en': 'chain mail', 'ja': 'チェーンメール'}, + {'en': 'chainsaw', 'ja': 'チェーンソー'}, + {'en': 'storage chest', 'ja': '胸'}, + {'en': 'chiffonier', 'ja': 'シフォニア'}, + {'en': 'bell or wind chime', 'ja': 'チャイム'}, + {'en': 'china cabinet', 'ja': '中国キャビネット'}, + {'en': 'Christmas stocking', 'ja': 'クリスマスの靴下'}, + {'en': 'church', 'ja': '教会'}, + {'en': 'movie theater', 'ja': '映画'}, + {'en': 'cleaver', 'ja': 'クリーバー'}, + {'en': 'cliff dwelling', 'ja': '崖の住居'}, + {'en': 'cloak', 'ja': 'マント'}, + {'en': 'clogs', 'ja': 'クロッグ'}, + {'en': 'cocktail shaker', 'ja': 'カクテルシェーカー'}, + {'en': 'coffee mug', 'ja': 'コーヒーマグ'}, + {'en': 'coffeemaker', 'ja': 'コーヒーポット'}, + {'en': 'spiral or coil', 'ja': 'コイル'}, + {'en': 'combination lock', 'ja': 'ダイヤル錠'}, + {'en': 'computer keyboard', 'ja': 'コンピュータのキーボード'}, + {'en': 'candy store', 'ja': '製菓'}, + {'en': 'container ship', 'ja': 'コンテナ船'}, + {'en': 'convertible', 'ja': 'コンバーチブル'}, + {'en': 'corkscrew', 'ja': 'コークスクリュー'}, + {'en': 'cornet', 'ja': 'コルネット'}, + {'en': 'cowboy boot', 'ja': 'カウボーイブーツ'}, + {'en': 'cowboy hat', 'ja': 'カウボーイハット'}, + {'en': 'cradle', 'ja': 'クレードル'}, + {'en': 'construction crane', 'ja': 'クレーン'}, + 
{'en': 'crash helmet', 'ja': 'クラッシュヘルメット'}, + {'en': 'crate', 'ja': '木箱'}, + {'en': 'infant bed', 'ja': 'ベビーベッド'}, + {'en': 'Crock Pot', 'ja': 'クロークポット'}, + {'en': 'croquet ball', 'ja': 'クロケットボール'}, + {'en': 'crutch', 'ja': '松葉杖'}, + {'en': 'cuirass', 'ja': '胸当て'}, + {'en': 'dam', 'ja': 'ダム'}, + {'en': 'desk', 'ja': '机'}, + {'en': 'desktop computer', 'ja': 'デスクトップコンピューター'}, + {'en': 'rotary dial telephone', 'ja': 'ダイヤル電話'}, + {'en': 'diaper', 'ja': 'おむつ'}, + {'en': 'digital clock', 'ja': 'デジタル時計'}, + {'en': 'digital watch', 'ja': 'デジタル腕時計'}, + {'en': 'dining table', 'ja': 'ダイニングテーブル'}, + {'en': 'dishcloth', 'ja': '意気地なし'}, + {'en': 'dishwasher', 'ja': '食器洗い機'}, + {'en': 'disc brake', 'ja': 'ディスクブレーキ'}, + {'en': 'dock', 'ja': 'ドック'}, + {'en': 'dog sled', 'ja': '犬ぞり'}, + {'en': 'dome', 'ja': 'ドーム'}, + {'en': 'doormat', 'ja': '玄関マット'}, + {'en': 'drilling rig', 'ja': '掘削基地'}, + {'en': 'drum', 'ja': 'ドラム'}, + {'en': 'drumstick', 'ja': 'ドラムスティック'}, + {'en': 'dumbbell', 'ja': 'ダンベル'}, + {'en': 'Dutch oven', 'ja': 'ダッチオーブン'}, + {'en': 'electric fan', 'ja': '扇風機'}, + {'en': 'electric guitar', 'ja': 'エレキギター'}, + {'en': 'electric locomotive', 'ja': '電気機関車'}, + {'en': 'entertainment center', 'ja': '娯楽施設'}, + {'en': 'envelope', 'ja': '封筒'}, + {'en': 'espresso machine', 'ja': 'エスプレッソマシーン'}, + {'en': 'face powder', 'ja': 'フェースパウダー'}, + {'en': 'feather boa', 'ja': 'フェザーボア'}, + {'en': 'filing cabinet', 'ja': 'ファイル'}, + {'en': 'fireboat', 'ja': '消防艇'}, + {'en': 'fire truck', 'ja': '消防車'}, + {'en': 'fire screen', 'ja': 'ファイアースクリーン'}, + {'en': 'flagpole', 'ja': '旗竿'}, + {'en': 'flute', 'ja': 'フルート'}, + {'en': 'folding chair', 'ja': '折り畳み式椅子'}, + {'en': 'football helmet', 'ja': 'フットボールヘルメット'}, + {'en': 'forklift', 'ja': 'フォークリフト'}, + {'en': 'fountain', 'ja': '噴水'}, + {'en': 'fountain pen', 'ja': '万年筆'}, + {'en': 'four-poster bed', 'ja': '四柱'}, + {'en': 'freight car', 'ja': '貨車'}, + {'en': 'French horn', 'ja': 'フレンチホルン'}, + {'en': 'frying pan', 'ja': 'フライパン'}, + {'en': 'fur coat', 'ja': '毛皮のコート'}, + {'en': 'garbage truck', 'ja': 'ごみ収集車'}, + {'en': 'gas mask or respirator', 'ja': 'ガスマスク'}, + {'en': 'gas pump', 'ja': 'ガソリンポンプ'}, + {'en': 'goblet', 'ja': 'ゴブレット'}, + {'en': 'go-kart', 'ja': 'ゴーカート'}, + {'en': 'golf ball', 'ja': 'ゴルフボール'}, + {'en': 'golf cart', 'ja': 'ゴルフカート'}, + {'en': 'gondola', 'ja': 'ゴンドラ'}, + {'en': 'gong', 'ja': 'ゴング'}, + {'en': 'gown', 'ja': 'ガウン'}, + {'en': 'grand piano', 'ja': 'グランドピアノ'}, + {'en': 'greenhouse', 'ja': '温室'}, + {'en': 'radiator grille', 'ja': 'グリル'}, + {'en': 'grocery store', 'ja': '食料品店'}, + {'en': 'guillotine', 'ja': 'ギロチン'}, + {'en': 'hair clip', 'ja': 'ヘアスライド'}, + {'en': 'hair spray', 'ja': 'ヘアスプレー'}, + {'en': 'half-track', 'ja': '半トラック'}, + {'en': 'hammer', 'ja': 'ハンマー'}, + {'en': 'hamper', 'ja': '妨げます'}, + {'en': 'hair dryer', 'ja': 'ハンドブロワー'}, + {'en': 'hand-held computer', 'ja': 'タブレット'}, + {'en': 'handkerchief', 'ja': 'ハンカチ'}, + {'en': 'hard disk drive', 'ja': 'ハードディスク'}, + {'en': 'harmonica', 'ja': 'ハーモニカ'}, + {'en': 'harp', 'ja': 'ハープ'}, + {'en': 'combine harvester', 'ja': 'ハーベスタ'}, + {'en': 'hatchet', 'ja': '斧'}, + {'en': 'holster', 'ja': 'ホルスター'}, + {'en': 'home theater', 'ja': 'ホームシアター'}, + {'en': 'honeycomb', 'ja': 'ハニカム'}, + {'en': 'hook', 'ja': 'フック'}, + {'en': 'hoop skirt', 'ja': 'フープスカート'}, + {'en': 'gymnastic horizontal bar', 'ja': '水平バー'}, + {'en': 'horse-drawn vehicle', 'ja': '馬車'}, + {'en': 'hourglass', 'ja': '砂時計'}, + {'en': 'iPod', 'ja': 'アイフォーン'}, + {'en': 'clothes iron', 'ja': '鉄'}, + {'en': 'carved pumpkin', 'ja': 'ジャックオーランタン'}, + {'en': 'jeans', 
'ja': 'ジーンズ'}, + {'en': 'jeep', 'ja': 'ジープ'}, + {'en': 'T-shirt', 'ja': 'ジャージー'}, + {'en': 'jigsaw puzzle', 'ja': 'ジグソーパズル'}, + {'en': 'rickshaw', 'ja': '人力車'}, + {'en': 'joystick', 'ja': 'ジョイスティック'}, + {'en': 'kimono', 'ja': '着物'}, + {'en': 'knee pad', 'ja': '膝パッド'}, + {'en': 'knot', 'ja': '結び目'}, + {'en': 'lab coat', 'ja': '白衣'}, + {'en': 'ladle', 'ja': 'ひしゃく'}, + {'en': 'lampshade', 'ja': 'ランプのかさ'}, + {'en': 'laptop computer', 'ja': 'ノートパソコン'}, + {'en': 'lawn mower', 'ja': '芝刈り機'}, + {'en': 'lens cap', 'ja': 'レンズキャップ'}, + {'en': 'letter opener', 'ja': 'レターオープナー'}, + {'en': 'library', 'ja': 'ライブラリ'}, + {'en': 'lifeboat', 'ja': '救命ボート'}, + {'en': 'lighter', 'ja': 'ライター'}, + {'en': 'limousine', 'ja': 'リムジン'}, + {'en': 'ocean liner', 'ja': 'ライナー'}, + {'en': 'lipstick', 'ja': '口紅'}, + {'en': 'slip-on shoe', 'ja': 'ローファー'}, + {'en': 'lotion', 'ja': 'ローション'}, + {'en': 'music speaker', 'ja': 'スピーカー'}, + {'en': 'loupe magnifying glass', 'ja': 'ルーペ'}, + {'en': 'sawmill', 'ja': '製材所'}, + {'en': 'magnetic compass', 'ja': '磁気コンパス'}, + {'en': 'messenger bag', 'ja': '郵袋'}, + {'en': 'mailbox', 'ja': 'メールボックス'}, + {'en': 'tights', 'ja': 'マイヨ'}, + {'en': 'one-piece bathing suit', 'ja': 'マイヨ'}, + {'en': 'manhole cover', 'ja': 'マンホールの蓋'}, + {'en': 'maraca', 'ja': 'マラカス'}, + {'en': 'marimba', 'ja': 'マリンバ'}, + {'en': 'mask', 'ja': 'マスク'}, + {'en': 'matchstick', 'ja': 'マッチ棒'}, + {'en': 'maypole', 'ja': 'メイポール'}, + {'en': 'maze', 'ja': '迷路'}, + {'en': 'measuring cup', 'ja': '計量カップ'}, + {'en': 'medicine cabinet', 'ja': '薬箱'}, + {'en': 'megalith', 'ja': '巨石'}, + {'en': 'microphone', 'ja': 'マイク'}, + {'en': 'microwave oven', 'ja': 'マイクロ波'}, + {'en': 'military uniform', 'ja': '軍服'}, + {'en': 'milk can', 'ja': 'ミルク缶'}, + {'en': 'minibus', 'ja': 'ミニバス'}, + {'en': 'miniskirt', 'ja': 'ミニスカート'}, + {'en': 'minivan', 'ja': 'ミニバン'}, + {'en': 'missile', 'ja': 'ミサイル'}, + {'en': 'mitten', 'ja': 'ミトン'}, + {'en': 'mixing bowl', 'ja': 'ミキシングボウル'}, + {'en': 'mobile home', 'ja': '移動住宅'}, + {'en': 'ford model t', 'ja': 'モデルT'}, + {'en': 'modem', 'ja': 'モデム'}, + {'en': 'monastery', 'ja': '修道院'}, + {'en': 'monitor', 'ja': 'モニター'}, + {'en': 'moped', 'ja': 'モペット'}, + {'en': 'mortar and pestle', 'ja': 'モルタル'}, + {'en': 'graduation cap', 'ja': 'モルタルボード'}, + {'en': 'mosque', 'ja': 'モスク'}, + {'en': 'mosquito net', 'ja': '蚊帳'}, + {'en': 'vespa', 'ja': 'スクーター'}, + {'en': 'mountain bike', 'ja': 'マウンテンバイク'}, + {'en': 'tent', 'ja': '山のテント'}, + {'en': 'computer mouse', 'ja': 'マウス'}, + {'en': 'mousetrap', 'ja': 'ネズミ捕り'}, + {'en': 'moving van', 'ja': '引っ越しトラック'}, + {'en': 'muzzle', 'ja': '銃口'}, + {'en': 'metal nail', 'ja': 'ネイル'}, + {'en': 'neck brace', 'ja': 'ネックブレース'}, + {'en': 'necklace', 'ja': 'ネックレス'}, + {'en': 'baby pacifier', 'ja': '乳首'}, + {'en': 'notebook computer', 'ja': 'ノート'}, + {'en': 'obelisk', 'ja': 'オベリスク'}, + {'en': 'oboe', 'ja': 'オーボエ'}, + {'en': 'ocarina', 'ja': 'オカリナ'}, + {'en': 'odometer', 'ja': 'オドメーター'}, + {'en': 'oil filter', 'ja': 'オイルフィルター'}, + {'en': 'pipe organ', 'ja': '器官'}, + {'en': 'oscilloscope', 'ja': 'オシロスコープ'}, + {'en': 'overskirt', 'ja': 'オーバースカート'}, + {'en': 'bullock cart', 'ja': '牛車'}, + {'en': 'oxygen mask', 'ja': '酸素マスク'}, + {'en': 'product packet / packaging', 'ja': 'パケット'}, + {'en': 'paddle', 'ja': 'パドル'}, + {'en': 'paddle wheel', 'ja': 'パドルホイール'}, + {'en': 'padlock', 'ja': '南京錠'}, + {'en': 'paintbrush', 'ja': '絵筆'}, + {'en': 'pajamas', 'ja': 'パジャマ'}, + {'en': 'palace', 'ja': '宮殿'}, + {'en': 'pan flute', 'ja': 'パンパイプ'}, + {'en': 'paper towel', 'ja': 'ペーパータオル'}, + {'en': 'parachute', 'ja': 'パラシュート'}, + 
{'en': 'parallel bars', 'ja': '平行棒'}, + {'en': 'park bench', 'ja': '公園のベンチ'}, + {'en': 'parking meter', 'ja': 'パーキングメーター'}, + {'en': 'railroad car', 'ja': '乗用車'}, + {'en': 'patio', 'ja': 'パティオ'}, + {'en': 'payphone', 'ja': '有料電話'}, + {'en': 'pedestal', 'ja': '台座'}, + {'en': 'pencil case', 'ja': '筆箱'}, + {'en': 'pencil sharpener', 'ja': '鉛筆削り'}, + {'en': 'perfume', 'ja': '香水'}, + {'en': 'Petri dish', 'ja': 'ペトリ皿'}, + {'en': 'photocopier', 'ja': 'コピー機'}, + {'en': 'plectrum', 'ja': '選ぶ'}, + {'en': 'Pickelhaube', 'ja': 'スパイク付き鉄かぶと'}, + {'en': 'picket fence', 'ja': '杭柵'}, + {'en': 'pickup truck', 'ja': '拾う'}, + {'en': 'pier', 'ja': '桟橋'}, + {'en': 'piggy bank', 'ja': '貯金箱'}, + {'en': 'pill bottle', 'ja': '錠剤瓶'}, + {'en': 'pillow', 'ja': '枕'}, + {'en': 'ping-pong ball', 'ja': 'ピンポン球'}, + {'en': 'pinwheel', 'ja': '風車'}, + {'en': 'pirate ship', 'ja': '海賊'}, + {'en': 'drink pitcher', 'ja': 'ピッチャー'}, + {'en': 'block plane', 'ja': '飛行機'}, + {'en': 'planetarium', 'ja': 'プラネタリウム'}, + {'en': 'plastic bag', 'ja': 'ビニール袋'}, + {'en': 'plate rack', 'ja': '皿立て'}, + {'en': 'farm plow', 'ja': 'プラウ'}, + {'en': 'plunger', 'ja': 'プランジャー'}, + {'en': 'Polaroid camera', 'ja': 'ポラロイドカメラ'}, + {'en': 'pole', 'ja': 'ポール'}, + {'en': 'police van', 'ja': '警察車'}, + {'en': 'poncho', 'ja': 'ポンチョ'}, + {'en': 'pool table', 'ja': 'ビリヤード台'}, + {'en': 'soda bottle', 'ja': 'ポップ・ボトル'}, + {'en': 'plant pot', 'ja': 'ポット'}, + {'en': "potter's wheel", 'ja': 'ろくろ'}, + {'en': 'power drill', 'ja': 'パワードリル'}, + {'en': 'prayer rug', 'ja': '礼拝用敷物'}, + {'en': 'printer', 'ja': 'プリンタ'}, + {'en': 'prison', 'ja': '刑務所'}, + {'en': 'missile', 'ja': '発射体'}, + {'en': 'projector', 'ja': 'プロジェクター'}, + {'en': 'hockey puck', 'ja': 'パック'}, + {'en': 'punching bag', 'ja': 'サンドバッグ'}, + {'en': 'purse', 'ja': '財布'}, + {'en': 'quill', 'ja': 'クイル'}, + {'en': 'quilt', 'ja': 'キルト'}, + {'en': 'race car', 'ja': 'レーサー'}, + {'en': 'racket', 'ja': 'ラケット'}, + {'en': 'radiator', 'ja': 'ラジエーター'}, + {'en': 'radio', 'ja': '無線'}, + {'en': 'radio telescope', 'ja': '電波望遠鏡'}, + {'en': 'rain barrel', 'ja': '天水桶'}, + {'en': 'recreational vehicle', 'ja': 'RV車'}, + {'en': 'fishing casting reel', 'ja': 'リール'}, + {'en': 'reflex camera', 'ja': 'レフレックスカメラ'}, + {'en': 'refrigerator', 'ja': '冷蔵庫'}, + {'en': 'remote control', 'ja': 'リモコン'}, + {'en': 'restaurant', 'ja': 'レストラン'}, + {'en': 'revolver', 'ja': 'リボルバー'}, + {'en': 'rifle', 'ja': 'ライフル'}, + {'en': 'rocking chair', 'ja': 'ロッキングチェア'}, + {'en': 'rotisserie', 'ja': '焼肉料理店'}, + {'en': 'eraser', 'ja': '消しゴム'}, + {'en': 'rugby ball', 'ja': 'ラグビーボール'}, + {'en': 'ruler measuring stick', 'ja': 'ルール'}, + {'en': 'sneaker', 'ja': 'ランニングシューズ'}, + {'en': 'safe', 'ja': '安全'}, + {'en': 'safety pin', 'ja': '安全ピン'}, + {'en': 'salt shaker', 'ja': '塩の入れ物'}, + {'en': 'sandal', 'ja': 'サンダル'}, + {'en': 'sarong', 'ja': 'サロン'}, + {'en': 'saxophone', 'ja': 'サックス'}, + {'en': 'scabbard', 'ja': '鞘'}, + {'en': 'weighing scale', 'ja': '規模'}, + {'en': 'school bus', 'ja': 'スクールバス'}, + {'en': 'schooner', 'ja': 'スクーナー'}, + {'en': 'scoreboard', 'ja': 'スコアボード'}, + {'en': 'CRT monitor', 'ja': '画面'}, + {'en': 'screw', 'ja': 'スクリュー'}, + {'en': 'screwdriver', 'ja': 'ドライバー'}, + {'en': 'seat belt', 'ja': 'シートベルト'}, + {'en': 'sewing machine', 'ja': 'ミシン'}, + {'en': 'shield', 'ja': 'シールド'}, + {'en': 'shoe store', 'ja': '靴屋'}, + {'en': 'shoji screen / room divider', 'ja': '障子'}, + {'en': 'shopping basket', 'ja': '買い物かご'}, + {'en': 'shopping cart', 'ja': 'ショッピングカート'}, + {'en': 'shovel', 'ja': 'シャベル'}, + {'en': 'shower cap', 'ja': 'シャワーキャップ'}, + {'en': 'shower curtain', 'ja': 
'シャワーカーテン'}, + {'en': 'ski', 'ja': 'スキー'}, + {'en': 'balaclava ski mask', 'ja': 'スキーマスク'}, + {'en': 'sleeping bag', 'ja': '寝袋'}, + {'en': 'slide rule', 'ja': '計算尺'}, + {'en': 'sliding door', 'ja': '引き戸'}, + {'en': 'slot machine', 'ja': 'スロット'}, + {'en': 'snorkel', 'ja': 'スノーケル'}, + {'en': 'snowmobile', 'ja': 'スノーモービル'}, + {'en': 'snowplow', 'ja': '除雪機'}, + {'en': 'soap dispenser', 'ja': 'ソープディスペンサー'}, + {'en': 'soccer ball', 'ja': 'サッカーボール'}, + {'en': 'sock', 'ja': '靴下'}, + {'en': 'solar thermal collector', 'ja': '太陽の皿'}, + {'en': 'sombrero', 'ja': 'ソンブレロ'}, + {'en': 'soup bowl', 'ja': 'スープ皿'}, + {'en': 'keyboard space bar', 'ja': 'スペースキー'}, + {'en': 'space heater', 'ja': 'スペースヒーター'}, + {'en': 'space shuttle', 'ja': 'スペースシャトル'}, + {'en': 'spatula', 'ja': 'へら'}, + {'en': 'motorboat', 'ja': 'スピードボート'}, + {'en': 'spider web', 'ja': 'クモの巣'}, + {'en': 'spindle', 'ja': 'スピンドル'}, + {'en': 'sports car', 'ja': 'スポーツカー'}, + {'en': 'spotlight', 'ja': 'スポットライト'}, + {'en': 'stage', 'ja': 'ステージ'}, + {'en': 'steam locomotive', 'ja': '蒸気機関車'}, + {'en': 'through arch bridge', 'ja': '鋼アーチ橋'}, + {'en': 'steel drum', 'ja': 'スチールドラム'}, + {'en': 'stethoscope', 'ja': '聴診器'}, + {'en': 'scarf', 'ja': 'ストール'}, + {'en': 'stone wall', 'ja': '石垣'}, + {'en': 'stopwatch', 'ja': 'ストップウォッチ'}, + {'en': 'stove', 'ja': 'レンジ'}, + {'en': 'strainer', 'ja': 'ストレーナー'}, + {'en': 'tram', 'ja': '路面電車'}, + {'en': 'stretcher', 'ja': 'ストレッチャー'}, + {'en': 'couch', 'ja': 'スタジオソファ'}, + {'en': 'stupa', 'ja': '仏舎利塔'}, + {'en': 'submarine', 'ja': '潜水艦'}, + {'en': 'suit', 'ja': 'スーツ'}, + {'en': 'sundial', 'ja': '日時計'}, + {'en': 'sunglasses', 'ja': 'サングラス'}, + {'en': 'sunglasses', 'ja': 'サングラス'}, + {'en': 'sunscreen', 'ja': '日焼け止め剤'}, + {'en': 'suspension bridge', 'ja': 'つり橋'}, + {'en': 'mop', 'ja': '綿棒'}, + {'en': 'sweatshirt', 'ja': 'トレーナー'}, + {'en': 'swim trunks / shorts', 'ja': '海パン'}, + {'en': 'swing', 'ja': 'スイング'}, + {'en': 'electrical switch', 'ja': 'スイッチ'}, + {'en': 'syringe', 'ja': '注射器'}, + {'en': 'table lamp', 'ja': '電気スタンド'}, + {'en': 'tank', 'ja': 'タンク'}, + {'en': 'tape player', 'ja': 'テーププレーヤー'}, + {'en': 'teapot', 'ja': 'ティーポット'}, + {'en': 'teddy bear', 'ja': 'テディ'}, + {'en': 'television', 'ja': 'テレビ'}, + {'en': 'tennis ball', 'ja': 'テニスボール'}, + {'en': 'thatched roof', 'ja': 'サッチ'}, + {'en': 'front curtain', 'ja': '劇場のカーテン'}, + {'en': 'thimble', 'ja': '指ぬき'}, + {'en': 'threshing machine', 'ja': '脱穀機'}, + {'en': 'throne', 'ja': '王位'}, + {'en': 'tile roof', 'ja': '瓦屋根'}, + {'en': 'toaster', 'ja': 'トースター'}, + {'en': 'tobacco shop', 'ja': 'タバコ屋'}, + {'en': 'toilet seat', 'ja': '便座'}, + {'en': 'torch', 'ja': 'トーチ'}, + {'en': 'totem pole', 'ja': 'トーテムポール'}, + {'en': 'tow truck', 'ja': 'レッカー車'}, + {'en': 'toy store', 'ja': '玩具屋'}, + {'en': 'tractor', 'ja': 'トラクター'}, + {'en': 'semi-trailer truck', 'ja': 'トレーラートラック'}, + {'en': 'tray', 'ja': 'トレイ'}, + {'en': 'trench coat', 'ja': 'トレンチコート'}, + {'en': 'tricycle', 'ja': '三輪車'}, + {'en': 'trimaran', 'ja': '三胴船'}, + {'en': 'tripod', 'ja': '三脚'}, + {'en': 'triumphal arch', 'ja': '凱旋門'}, + {'en': 'trolleybus', 'ja': 'トロリーバス'}, + {'en': 'trombone', 'ja': 'トロンボーン'}, + {'en': 'hot tub', 'ja': 'バスタブ'}, + {'en': 'turnstile', 'ja': '回転ドア'}, + {'en': 'typewriter keyboard', 'ja': 'タイプライターのキーボード'}, + {'en': 'umbrella', 'ja': '傘'}, + {'en': 'unicycle', 'ja': '一輪車'}, + {'en': 'upright piano', 'ja': '直立'}, + {'en': 'vacuum cleaner', 'ja': '真空'}, + {'en': 'vase', 'ja': '花瓶'}, + {'en': 'vaulted or arched ceiling', 'ja': 'ボールト'}, + {'en': 'velvet fabric', 'ja': 'ベルベット'}, + {'en': 'vending machine', 'ja': 
'自動販売機'}, + {'en': 'vestment', 'ja': '祭服'}, + {'en': 'viaduct', 'ja': '高架橋'}, + {'en': 'violin', 'ja': 'バイオリン'}, + {'en': 'volleyball', 'ja': 'バレーボール'}, + {'en': 'waffle iron', 'ja': 'ワッフル焼き型'}, + {'en': 'wall clock', 'ja': '壁時計'}, + {'en': 'wallet', 'ja': '財布'}, + {'en': 'wardrobe', 'ja': 'ワードローブ'}, + {'en': 'military aircraft', 'ja': '戦闘機'}, + {'en': 'sink', 'ja': '洗面器'}, + {'en': 'washing machine', 'ja': 'ワッシャー'}, + {'en': 'water bottle', 'ja': '水筒'}, + {'en': 'water jug', 'ja': '水差し'}, + {'en': 'water tower', 'ja': '給水塔'}, + {'en': 'whiskey jug', 'ja': 'ウイスキージャグ'}, + {'en': 'whistle', 'ja': 'ホイッスル'}, + {'en': 'hair wig', 'ja': 'かつら'}, + {'en': 'window screen', 'ja': '窓網戸'}, + {'en': 'window shade', 'ja': 'ブラインド'}, + {'en': 'Windsor tie', 'ja': 'ウィンザーネクタイ'}, + {'en': 'wine bottle', 'ja': 'ワインボトル'}, + {'en': 'airplane wing', 'ja': '翼'}, + {'en': 'wok', 'ja': '中華鍋'}, + {'en': 'wooden spoon', 'ja': '木製スプーン'}, + {'en': 'wool', 'ja': 'ウール'}, + {'en': 'split-rail fence', 'ja': 'ワームフェンス'}, + {'en': 'shipwreck', 'ja': '難破船'}, + {'en': 'sailboat', 'ja': 'ヨール'}, + {'en': 'yurt', 'ja': 'パオ'}, + {'en': 'website', 'ja': 'サイト'}, + {'en': 'comic book', 'ja': 'コミックブック'}, + {'en': 'crossword', 'ja': 'クロスワードパズル'}, + {'en': 'traffic or street sign', 'ja': '道路標識'}, + {'en': 'traffic light', 'ja': '交通信号灯'}, + {'en': 'dust jacket', 'ja': 'ブックカバー'}, + {'en': 'menu', 'ja': 'メニュー'}, + {'en': 'plate', 'ja': 'プレート'}, + {'en': 'guacamole', 'ja': 'グアカモーレ'}, + {'en': 'consomme', 'ja': 'コンソメ'}, + {'en': 'hot pot', 'ja': 'ホットポット'}, + {'en': 'trifle', 'ja': 'パフェ'}, + {'en': 'ice cream', 'ja': 'アイスクリーム'}, + {'en': 'popsicle', 'ja': 'アイスキャンディー'}, + {'en': 'baguette', 'ja': 'フランスパン'}, + {'en': 'bagel', 'ja': 'ベーグル'}, + {'en': 'pretzel', 'ja': 'プレッツェル'}, + {'en': 'cheeseburger', 'ja': 'チーズバーガー'}, + {'en': 'hot dog', 'ja': 'ホットドッグ'}, + {'en': 'mashed potatoes', 'ja': 'マッシュポテト'}, + {'en': 'cabbage', 'ja': 'キャベツ'}, + {'en': 'broccoli', 'ja': 'ブロッコリー'}, + {'en': 'cauliflower', 'ja': 'カリフラワー'}, + {'en': 'zucchini', 'ja': 'ズッキーニ'}, + {'en': 'spaghetti squash', 'ja': 'そうめんかぼちゃ'}, + {'en': 'acorn squash', 'ja': 'ドングリかぼちゃ'}, + {'en': 'butternut squash', 'ja': 'カボチャ'}, + {'en': 'cucumber', 'ja': 'キュウリ'}, + {'en': 'artichoke', 'ja': 'アーティチョーク'}, + {'en': 'bell pepper', 'ja': 'ピーマン'}, + {'en': 'cardoon', 'ja': 'カルドン'}, + {'en': 'mushroom', 'ja': 'キノコ'}, + {'en': 'Granny Smith apple', 'ja': 'リンゴ'}, + {'en': 'strawberry', 'ja': 'イチゴ'}, + {'en': 'orange', 'ja': 'オレンジ'}, + {'en': 'lemon', 'ja': 'レモン'}, + {'en': 'fig', 'ja': 'イチジク'}, + {'en': 'pineapple', 'ja': 'パイナップル'}, + {'en': 'banana', 'ja': 'バナナ'}, + {'en': 'jackfruit', 'ja': 'パラミツ'}, + {'en': 'cherimoya (custard apple)', 'ja': 'カスタードアップル'}, + {'en': 'pomegranate', 'ja': 'ザクロ'}, + {'en': 'hay', 'ja': '干し草'}, + {'en': 'carbonara', 'ja': 'カルボナーラ'}, + {'en': 'chocolate syrup', 'ja': 'チョコレートソース'}, + {'en': 'dough', 'ja': 'パン生地'}, + {'en': 'meatloaf', 'ja': 'ミートローフ'}, + {'en': 'pizza', 'ja': 'ピザ'}, + {'en': 'pot pie', 'ja': 'ポットパイ'}, + {'en': 'burrito', 'ja': 'ブリトー'}, + {'en': 'red wine', 'ja': '赤ワイン'}, + {'en': 'espresso', 'ja': 'エスプレッソ'}, + {'en': 'tea cup', 'ja': 'カップ'}, + {'en': 'eggnog', 'ja': 'エッグノッグ'}, + {'en': 'mountain', 'ja': 'アルプス'}, + {'en': 'bubble', 'ja': 'バブル'}, + {'en': 'cliff', 'ja': '崖'}, + {'en': 'coral reef', 'ja': 'サンゴ礁'}, + {'en': 'geyser', 'ja': '間欠泉'}, + {'en': 'lakeshore', 'ja': '湖畔'}, + {'en': 'promontory', 'ja': '岬'}, + {'en': 'sandbar', 'ja': '砂州'}, + {'en': 'beach', 'ja': '海岸'}, + {'en': 'valley', 'ja': '谷'}, + {'en': 'volcano', 'ja': '火山'}, + {'en': 'baseball 
player', 'ja': '野球選手'}, + {'en': 'bridegroom', 'ja': '新郎'}, + {'en': 'scuba diver', 'ja': 'スキューバダイバー'}, + {'en': 'rapeseed', 'ja': '菜種'}, + {'en': 'daisy', 'ja': 'デイジー'}, + {'en': "yellow lady's slipper", 'ja': '蘭'}, + {'en': 'corn', 'ja': 'トウモロコシ'}, + {'en': 'acorn', 'ja': 'ドングリ'}, + {'en': 'rose hip', 'ja': 'ヒップ'}, + {'en': 'horse chestnut seed', 'ja': 'トチノキ'}, + {'en': 'coral fungus', 'ja': 'サンゴ菌'}, + {'en': 'agaric', 'ja': 'ハラタケ'}, + {'en': 'gyromitra', 'ja': 'シャグマアミガサタケ'}, + {'en': 'stinkhorn mushroom', 'ja': 'スッポンタケ'}, + {'en': 'earth star fungus', 'ja': 'ハラタケ'}, + {'en': 'hen of the woods mushroom', 'ja': '舞茸'}, + {'en': 'bolete', 'ja': 'きのこ'}, + {'en': 'corn cob', 'ja': '耳'}, + {'en': 'toilet paper', 'ja': 'トイレットペーパー'}] + + +imagenet_templates = [{'en': 'a bad photo of a {}.', 'ja': '{}の悪い写真'}, + {'en': 'a photo of many {}.', 'ja': '多くの{}の写真'}, + {'en': 'a sculpture of a {}.', 'ja': '{}の彫刻'}, + {'en': 'a photo of the hard to see {}.', 'ja': '見づらい{}の写真'}, + {'en': 'a low resolution photo of the {}.', 'ja': '{}の低解像度写真'}, + {'en': 'a rendering of a {}.', 'ja': '{}のレンダリング'}, + {'en': 'graffiti of a {}.', 'ja': '{}の落書き'}, + {'en': 'a cropped photo of the {}.', 'ja': '{}のトリミング写真'}, + {'en': 'a tattoo of a {}.', 'ja': '{}のタトゥー'}, + {'en': 'the embroidered {}.', 'ja': '刺繍された{}'}, + {'en': 'a bright photo of a {}.', 'ja': '{}の明るい写真'}, + {'en': 'a photo of a clean {}.', 'ja': 'きれいな{}の写真'}, + {'en': 'a photo of a dirty {}.', 'ja': '汚れた{}の写真'}, + {'en': 'a dark photo of the {}.', 'ja': '{}の暗い写真'}, + {'en': 'a drawing of a {}.', 'ja': '{}の絵'}, + {'en': 'a photo of my {}.', 'ja': '私の{}の写真'}, + {'en': 'the plastic {}.', 'ja': 'プラスチック製の{}'}, + {'en': 'a photo of the cool {}.', 'ja': 'かっこいい{}の写真'}, + {'en': 'a close-up photo of a {}.', 'ja': '{}のクローズアップ写真'}, + {'en': 'a black and white photo of the {}.', 'ja': '{}の白黒写真'}, + {'en': 'a pixelated photo of the {}.', 'ja': '{}のピクセル写真'}, + {'en': 'a jpeg corrupted photo of a {}.', 'ja': 'jpegで加工した{}の写真'}, + {'en': 'a blurry photo of the {}.', 'ja': '{}のぼやけた写真'}, + {'en': 'a photo of the {}.', 'ja': '{}の写真'}, + {'en': 'a good photo of the {}.', 'ja': '{}の良い写真'}, + {'en': 'a {} in a video game.', 'ja': 'ゲームに登場する{}'}, + {'en': 'the origami {}.', 'ja': '折り紙で作った{}'}, + {'en': 'a sketch of a {}.', 'ja': '{}のスケッチ'}, + {'en': 'the toy {}.', 'ja': 'おもちゃの{}'}, + {'en': 'a rendition of the {}.', 'ja': '{}の演出'}, + {'en': 'a photo of a large {}.', 'ja': '大きな{}の写真'}, + {'en': 'a photo of a nice {}.', 'ja': '素敵な{}の写真'}, + {'en': 'a photo of a weird {}.', 'ja': '奇妙な{}の写真'}, + {'en': 'a cartoon {}.', 'ja': '漫画の{}'}, + {'en': 'art of a {}.', 'ja': '{}の芸術'}, + {'en': 'a plushie {}.', 'ja': '{}のぬいぐるみ'}, + {'en': 'a photo of the small {}.', 'ja': '小さな{}の写真'},] + + + + diff --git a/japanese_clip/utils/imagenet_zeroshot_data_en.py b/japanese_clip/utils/imagenet_zeroshot_data_en.py new file mode 100644 index 0000000..dc23140 --- /dev/null +++ b/japanese_clip/utils/imagenet_zeroshot_data_en.py @@ -0,0 +1,248 @@ +imagenet_classnames = ["tench", "goldfish", "great white shark", "tiger shark", "hammerhead shark", "electric ray", + "stingray", "rooster", "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", + "indigo bunting", "American robin", "bulbul", "jay", "magpie", "chickadee", "American dipper", + "kite (bird of prey)", "bald eagle", "vulture", "great grey owl", "fire salamander", + "smooth newt", "newt", "spotted salamander", "axolotl", "American bullfrog", "tree frog", + "tailed frog", "loggerhead sea turtle", "leatherback sea turtle", "mud turtle", 
"terrapin", + "box turtle", "banded gecko", "green iguana", "Carolina anole", + "desert grassland whiptail lizard", "agama", "frilled-necked lizard", "alligator lizard", + "Gila monster", "European green lizard", "chameleon", "Komodo dragon", "Nile crocodile", + "American alligator", "triceratops", "worm snake", "ring-necked snake", + "eastern hog-nosed snake", "smooth green snake", "kingsnake", "garter snake", "water snake", + "vine snake", "night snake", "boa constrictor", "African rock python", "Indian cobra", + "green mamba", "sea snake", "Saharan horned viper", "eastern diamondback rattlesnake", + "sidewinder rattlesnake", "trilobite", "harvestman", "scorpion", "yellow garden spider", + "barn spider", "European garden spider", "southern black widow", "tarantula", "wolf spider", + "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse", "prairie grouse", "peafowl", + "quail", "partridge", "african grey parrot", "macaw", "sulphur-crested cockatoo", "lorikeet", + "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "duck", + "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", + "koala", "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", + "snail", "slug", "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", + "fiddler crab", "red king crab", "American lobster", "spiny lobster", "crayfish", "hermit crab", + "isopod", "white stork", "black stork", "spoonbill", "flamingo", "little blue heron", + "great egret", "bittern bird", "crane bird", "limpkin", "common gallinule", "American coot", + "bustard", "ruddy turnstone", "dunlin", "common redshank", "dowitcher", "oystercatcher", + "pelican", "king penguin", "albatross", "grey whale", "killer whale", "dugong", "sea lion", + "Chihuahua", "Japanese Chin", "Maltese", "Pekingese", "Shih Tzu", "King Charles Spaniel", + "Papillon", "toy terrier", "Rhodesian Ridgeback", "Afghan Hound", "Basset Hound", "Beagle", + "Bloodhound", "Bluetick Coonhound", "Black and Tan Coonhound", "Treeing Walker Coonhound", + "English foxhound", "Redbone Coonhound", "borzoi", "Irish Wolfhound", "Italian Greyhound", + "Whippet", "Ibizan Hound", "Norwegian Elkhound", "Otterhound", "Saluki", "Scottish Deerhound", + "Weimaraner", "Staffordshire Bull Terrier", "American Staffordshire Terrier", + "Bedlington Terrier", "Border Terrier", "Kerry Blue Terrier", "Irish Terrier", + "Norfolk Terrier", "Norwich Terrier", "Yorkshire Terrier", "Wire Fox Terrier", + "Lakeland Terrier", "Sealyham Terrier", "Airedale Terrier", "Cairn Terrier", + "Australian Terrier", "Dandie Dinmont Terrier", "Boston Terrier", "Miniature Schnauzer", + "Giant Schnauzer", "Standard Schnauzer", "Scottish Terrier", "Tibetan Terrier", + "Australian Silky Terrier", "Soft-coated Wheaten Terrier", "West Highland White Terrier", + "Lhasa Apso", "Flat-Coated Retriever", "Curly-coated Retriever", "Golden Retriever", + "Labrador Retriever", "Chesapeake Bay Retriever", "German Shorthaired Pointer", "Vizsla", + "English Setter", "Irish Setter", "Gordon Setter", "Brittany dog", "Clumber Spaniel", + "English Springer Spaniel", "Welsh Springer Spaniel", "Cocker Spaniel", "Sussex Spaniel", + "Irish Water Spaniel", "Kuvasz", "Schipperke", "Groenendael dog", "Malinois", "Briard", + "Australian Kelpie", "Komondor", "Old English Sheepdog", "Shetland Sheepdog", "collie", + "Border Collie", "Bouvier des Flandres dog", "Rottweiler", "German Shepherd Dog", "Dobermann", + "Miniature Pinscher", "Greater Swiss 
Mountain Dog", "Bernese Mountain Dog", + "Appenzeller Sennenhund", "Entlebucher Sennenhund", "Boxer", "Bullmastiff", "Tibetan Mastiff", + "French Bulldog", "Great Dane", "St. Bernard", "husky", "Alaskan Malamute", "Siberian Husky", + "Dalmatian", "Affenpinscher", "Basenji", "pug", "Leonberger", "Newfoundland dog", + "Great Pyrenees dog", "Samoyed", "Pomeranian", "Chow Chow", "Keeshond", "brussels griffon", + "Pembroke Welsh Corgi", "Cardigan Welsh Corgi", "Toy Poodle", "Miniature Poodle", + "Standard Poodle", "Mexican hairless dog (xoloitzcuintli)", "grey wolf", "Alaskan tundra wolf", + "red wolf or maned wolf", "coyote", "dingo", "dhole", "African wild dog", "hyena", "red fox", + "kit fox", "Arctic fox", "grey fox", "tabby cat", "tiger cat", "Persian cat", "Siamese cat", + "Egyptian Mau", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", + "cheetah", "brown bear", "American black bear", "polar bear", "sloth bear", "mongoose", + "meerkat", "tiger beetle", "ladybug", "ground beetle", "longhorn beetle", "leaf beetle", + "dung beetle", "rhinoceros beetle", "weevil", "fly", "bee", "ant", "grasshopper", + "cricket insect", "stick insect", "cockroach", "praying mantis", "cicada", "leafhopper", + "lacewing", "dragonfly", "damselfly", "red admiral butterfly", "ringlet butterfly", + "monarch butterfly", "small white butterfly", "sulphur butterfly", "gossamer-winged butterfly", + "starfish", "sea urchin", "sea cucumber", "cottontail rabbit", "hare", "Angora rabbit", + "hamster", "porcupine", "fox squirrel", "marmot", "beaver", "guinea pig", "common sorrel horse", + "zebra", "pig", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo", "bison", + "ram (adult male sheep)", "bighorn sheep", "Alpine ibex", "hartebeest", "impala (antelope)", + "gazelle", "arabian camel", "llama", "weasel", "mink", "European polecat", + "black-footed ferret", "otter", "skunk", "badger", "armadillo", "three-toed sloth", "orangutan", + "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas monkey", "baboon", "macaque", + "langur", "black-and-white colobus", "proboscis monkey", "marmoset", "white-headed capuchin", + "howler monkey", "titi monkey", "Geoffroy's spider monkey", "common squirrel monkey", + "ring-tailed lemur", "indri", "Asian elephant", "African bush elephant", "red panda", + "giant panda", "snoek fish", "eel", "silver salmon", "rock beauty fish", "clownfish", + "sturgeon", "gar fish", "lionfish", "pufferfish", "abacus", "abaya", "academic gown", + "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance", + "amphibious vehicle", "analog clock", "apiary", "apron", "trash can", "assault rifle", + "backpack", "bakery", "balance beam", "balloon", "ballpoint pen", "Band-Aid", "banjo", + "baluster / handrail", "barbell", "barber chair", "barbershop", "barn", "barometer", "barrel", + "wheelbarrow", "baseball", "basketball", "bassinet", "bassoon", "swimming cap", "bath towel", + "bathtub", "station wagon", "lighthouse", "beaker", "military hat (bearskin or shako)", + "beer bottle", "beer glass", "bell tower", "baby bib", "tandem bicycle", "bikini", + "ring binder", "binoculars", "birdhouse", "boathouse", "bobsleigh", "bolo tie", "poke bonnet", + "bookcase", "bookstore", "bottle cap", "hunting bow", "bow tie", "brass memorial plaque", "bra", + "breakwater", "breastplate", "broom", "bucket", "buckle", "bulletproof vest", + "high-speed train", "butcher shop", "taxicab", "cauldron", "candle", "cannon", "canoe", + "can opener", "cardigan", "car 
mirror", "carousel", "tool kit", "cardboard box / carton", + "car wheel", "automated teller machine", "cassette", "cassette player", "castle", "catamaran", + "CD player", "cello", "mobile phone", "chain", "chain-link fence", "chain mail", "chainsaw", + "storage chest", "chiffonier", "bell or wind chime", "china cabinet", "Christmas stocking", + "church", "movie theater", "cleaver", "cliff dwelling", "cloak", "clogs", "cocktail shaker", + "coffee mug", "coffeemaker", "spiral or coil", "combination lock", "computer keyboard", + "candy store", "container ship", "convertible", "corkscrew", "cornet", "cowboy boot", + "cowboy hat", "cradle", "construction crane", "crash helmet", "crate", "infant bed", + "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", + "rotary dial telephone", "diaper", "digital clock", "digital watch", "dining table", + "dishcloth", "dishwasher", "disc brake", "dock", "dog sled", "dome", "doormat", "drilling rig", + "drum", "drumstick", "dumbbell", "Dutch oven", "electric fan", "electric guitar", + "electric locomotive", "entertainment center", "envelope", "espresso machine", "face powder", + "feather boa", "filing cabinet", "fireboat", "fire truck", "fire screen", "flagpole", "flute", + "folding chair", "football helmet", "forklift", "fountain", "fountain pen", "four-poster bed", + "freight car", "French horn", "frying pan", "fur coat", "garbage truck", + "gas mask or respirator", "gas pump", "goblet", "go-kart", "golf ball", "golf cart", "gondola", + "gong", "gown", "grand piano", "greenhouse", "radiator grille", "grocery store", "guillotine", + "hair clip", "hair spray", "half-track", "hammer", "hamper", "hair dryer", "hand-held computer", + "handkerchief", "hard disk drive", "harmonica", "harp", "combine harvester", "hatchet", + "holster", "home theater", "honeycomb", "hook", "hoop skirt", "gymnastic horizontal bar", + "horse-drawn vehicle", "hourglass", "iPod", "clothes iron", "carved pumpkin", "jeans", "jeep", + "T-shirt", "jigsaw puzzle", "rickshaw", "joystick", "kimono", "knee pad", "knot", "lab coat", + "ladle", "lampshade", "laptop computer", "lawn mower", "lens cap", "letter opener", "library", + "lifeboat", "lighter", "limousine", "ocean liner", "lipstick", "slip-on shoe", "lotion", + "music speaker", "loupe magnifying glass", "sawmill", "magnetic compass", "messenger bag", + "mailbox", "tights", "one-piece bathing suit", "manhole cover", "maraca", "marimba", "mask", + "matchstick", "maypole", "maze", "measuring cup", "medicine cabinet", "megalith", "microphone", + "microwave oven", "military uniform", "milk can", "minibus", "miniskirt", "minivan", "missile", + "mitten", "mixing bowl", "mobile home", "ford model t", "modem", "monastery", "monitor", + "moped", "mortar and pestle", "graduation cap", "mosque", "mosquito net", "vespa", + "mountain bike", "tent", "computer mouse", "mousetrap", "moving van", "muzzle", "metal nail", + "neck brace", "necklace", "baby pacifier", "notebook computer", "obelisk", "oboe", "ocarina", + "odometer", "oil filter", "pipe organ", "oscilloscope", "overskirt", "bullock cart", + "oxygen mask", "product packet / packaging", "paddle", "paddle wheel", "padlock", "paintbrush", + "pajamas", "palace", "pan flute", "paper towel", "parachute", "parallel bars", "park bench", + "parking meter", "railroad car", "patio", "payphone", "pedestal", "pencil case", + "pencil sharpener", "perfume", "Petri dish", "photocopier", "plectrum", "Pickelhaube", + "picket fence", "pickup truck", "pier", "piggy bank", "pill 
bottle", "pillow", "ping-pong ball", + "pinwheel", "pirate ship", "drink pitcher", "block plane", "planetarium", "plastic bag", + "plate rack", "farm plow", "plunger", "Polaroid camera", "pole", "police van", "poncho", + "pool table", "soda bottle", "plant pot", "potter's wheel", "power drill", "prayer rug", + "printer", "prison", "missile", "projector", "hockey puck", "punching bag", "purse", "quill", + "quilt", "race car", "racket", "radiator", "radio", "radio telescope", "rain barrel", + "recreational vehicle", "fishing casting reel", "reflex camera", "refrigerator", + "remote control", "restaurant", "revolver", "rifle", "rocking chair", "rotisserie", "eraser", + "rugby ball", "ruler measuring stick", "sneaker", "safe", "safety pin", "salt shaker", "sandal", + "sarong", "saxophone", "scabbard", "weighing scale", "school bus", "schooner", "scoreboard", + "CRT monitor", "screw", "screwdriver", "seat belt", "sewing machine", "shield", "shoe store", + "shoji screen / room divider", "shopping basket", "shopping cart", "shovel", "shower cap", + "shower curtain", "ski", "balaclava ski mask", "sleeping bag", "slide rule", "sliding door", + "slot machine", "snorkel", "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", + "solar thermal collector", "sombrero", "soup bowl", "keyboard space bar", "space heater", + "space shuttle", "spatula", "motorboat", "spider web", "spindle", "sports car", "spotlight", + "stage", "steam locomotive", "through arch bridge", "steel drum", "stethoscope", "scarf", + "stone wall", "stopwatch", "stove", "strainer", "tram", "stretcher", "couch", "stupa", + "submarine", "suit", "sundial", "sunglasses", "sunglasses", "sunscreen", "suspension bridge", + "mop", "sweatshirt", "swim trunks / shorts", "swing", "electrical switch", "syringe", + "table lamp", "tank", "tape player", "teapot", "teddy bear", "television", "tennis ball", + "thatched roof", "front curtain", "thimble", "threshing machine", "throne", "tile roof", + "toaster", "tobacco shop", "toilet seat", "torch", "totem pole", "tow truck", "toy store", + "tractor", "semi-trailer truck", "tray", "trench coat", "tricycle", "trimaran", "tripod", + "triumphal arch", "trolleybus", "trombone", "hot tub", "turnstile", "typewriter keyboard", + "umbrella", "unicycle", "upright piano", "vacuum cleaner", "vase", "vaulted or arched ceiling", + "velvet fabric", "vending machine", "vestment", "viaduct", "violin", "volleyball", + "waffle iron", "wall clock", "wallet", "wardrobe", "military aircraft", "sink", + "washing machine", "water bottle", "water jug", "water tower", "whiskey jug", "whistle", + "hair wig", "window screen", "window shade", "Windsor tie", "wine bottle", "airplane wing", + "wok", "wooden spoon", "wool", "split-rail fence", "shipwreck", "sailboat", "yurt", "website", + "comic book", "crossword", "traffic or street sign", "traffic light", "dust jacket", "menu", + "plate", "guacamole", "consomme", "hot pot", "trifle", "ice cream", "popsicle", "baguette", + "bagel", "pretzel", "cheeseburger", "hot dog", "mashed potatoes", "cabbage", "broccoli", + "cauliflower", "zucchini", "spaghetti squash", "acorn squash", "butternut squash", "cucumber", + "artichoke", "bell pepper", "cardoon", "mushroom", "Granny Smith apple", "strawberry", "orange", + "lemon", "fig", "pineapple", "banana", "jackfruit", "cherimoya (custard apple)", "pomegranate", + "hay", "carbonara", "chocolate syrup", "dough", "meatloaf", "pizza", "pot pie", "burrito", + "red wine", "espresso", "tea cup", "eggnog", "mountain", "bubble", "cliff", 
"coral reef", + "geyser", "lakeshore", "promontory", "sandbar", "beach", "valley", "volcano", "baseball player", + "bridegroom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn", + "rose hip", "horse chestnut seed", "coral fungus", "agaric", "gyromitra", "stinkhorn mushroom", + "earth star fungus", "hen of the woods mushroom", "bolete", "corn cob", "toilet paper"] + +imagenet_templates = [ + 'a bad photo of a {}.', + 'a photo of many {}.', + 'a sculpture of a {}.', + 'a photo of the hard to see {}.', + 'a low resolution photo of the {}.', + 'a rendering of a {}.', + 'graffiti of a {}.', + 'a bad photo of the {}.', + 'a cropped photo of the {}.', + 'a tattoo of a {}.', + 'the embroidered {}.', + 'a photo of a hard to see {}.', + 'a bright photo of a {}.', + 'a photo of a clean {}.', + 'a photo of a dirty {}.', + 'a dark photo of the {}.', + 'a drawing of a {}.', + 'a photo of my {}.', + 'the plastic {}.', + 'a photo of the cool {}.', + 'a close-up photo of a {}.', + 'a black and white photo of the {}.', + 'a painting of the {}.', + 'a painting of a {}.', + 'a pixelated photo of the {}.', + 'a sculpture of the {}.', + 'a bright photo of the {}.', + 'a cropped photo of a {}.', + 'a plastic {}.', + 'a photo of the dirty {}.', + 'a jpeg corrupted photo of a {}.', + 'a blurry photo of the {}.', + 'a photo of the {}.', + 'a good photo of the {}.', + 'a rendering of the {}.', + 'a {} in a video game.', + 'a photo of one {}.', + 'a doodle of a {}.', + 'a close-up photo of the {}.', + 'a photo of a {}.', + 'the origami {}.', + 'the {} in a video game.', + 'a sketch of a {}.', + 'a doodle of the {}.', + 'a origami {}.', + 'a low resolution photo of a {}.', + 'the toy {}.', + 'a rendition of the {}.', + 'a photo of the clean {}.', + 'a photo of a large {}.', + 'a rendition of a {}.', + 'a photo of a nice {}.', + 'a photo of a weird {}.', + 'a blurry photo of a {}.', + 'a cartoon {}.', + 'art of a {}.', + 'a sketch of the {}.', + 'a embroidered {}.', + 'a pixelated photo of a {}.', + 'itap of the {}.', + 'a jpeg corrupted photo of the {}.', + 'a good photo of a {}.', + 'a plushie {}.', + 'a photo of the nice {}.', + 'a photo of the small {}.', + 'a photo of the weird {}.', + 'the cartoon {}.', + 'art of the {}.', + 'a drawing of the {}.', + 'a photo of the large {}.', + 'a black and white photo of a {}.', + 'the plushie {}.', + 'a dark photo of a {}.', + 'itap of a {}.', + 'graffiti of the {}.', + 'a toy {}.', + 'itap of my {}.', + 'a photo of a cool {}.', + 'a photo of a small {}.', + 'a tattoo of the {}.', +] \ No newline at end of file diff --git a/japanese_clip/version.py b/japanese_clip/version.py new file mode 100644 index 0000000..feacd5a --- /dev/null +++ b/japanese_clip/version.py @@ -0,0 +1,16 @@ +# coding=utf-8 +# Copyright 2022 rinna Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = '0.2.0' diff --git a/jclip.py b/jclip.py new file mode 100644 index 0000000..aa3d4dc --- /dev/null +++ b/jclip.py @@ -0,0 +1,79 @@ +# Copyright 2021 Zilliz. 
All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from pathlib import Path
+
+import torch
+
+from towhee import register
+from towhee.operator.base import NNOperator, OperatorFlag
+from towhee.types.arg import arg, to_image_color
+from towhee.types.image_utils import from_pil, to_pil
+
+@register(output_schema=['vec'])
+class Jaclip(NNOperator):
+    """
+    Japanese CLIP multi-modal embedding operator
+    """
+    def __init__(self, model_name: str, modality: str):
+        super().__init__()
+        self._modality = modality
+        path = str(Path(__file__).parent)
+        sys.path.append(path)  # make the vendored japanese_clip package importable
+        import japanese_clip as ja_clip
+        sys.path.pop()
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        model, preprocess = ja_clip.load(model_name, cache_dir="{}/weights/japanese_clip".format(path), device=self.device)
+        self.model = model
+        self.tfms = preprocess
+        self.tokenizer = ja_clip.load_tokenizer()
+        self.ja_clip = ja_clip
+
+
+    def __call__(self, data):
+        if self._modality == 'image':
+            vec = self._inference_from_image(data)
+        elif self._modality == 'text':
+            vec = self._inference_from_text(data)
+        else:
+            raise ValueError("modality[{}] not implemented.".format(self._modality))
+        return vec.detach().cpu().numpy().flatten()
+
+    def _inference_from_text(self, text):
+        encodings = self.ja_clip.tokenize(
+            texts=[text],
+            max_seq_len=77,
+            device=self.device,
+            tokenizer=self.tokenizer,  # optional; if omitted, the tokenizer is reloaded on every call
+        )
+        text_feature = self.model.get_text_features(**encodings)
+        return text_feature
+
+    @arg(1, to_image_color('RGB'))
+    def _inference_from_image(self, img):
+        img = self._preprocess(img)
+        image_feature = self.model.get_image_features(img)
+        return image_feature
+
+    def _preprocess(self, img):
+        img = to_pil(img)
+        processed_img = self.tfms(img).unsqueeze(0).to(self.device)
+        return processed_img
+
+    def _configs(self):
+        config = {}
+        config['blip_base'] = {}
+        config['blip_base']['weights'] = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth'
+        return config
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e69de29
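
A minimal usage sketch for the Jaclip operator added above, assuming torch, torchvision, towhee, transformers and huggingface_hub are installed and this operator directory is importable; the model name, modality and input string are illustrative only:

    from jclip import Jaclip

    # modality is 'text' or 'image'; the model name should be one that
    # japanese_clip.load() recognizes, e.g. rinna/japanese-clip-vit-b-16.
    op = Jaclip(model_name='rinna/japanese-clip-vit-b-16', modality='text')
    vec = op('犬の写真')  # Japanese for "a photo of a dog"
    print(vec.shape)      # flattened numpy embedding vector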