From a6ff91c64f9ba1a54d12046f75c148708fdfbbeb Mon Sep 17 00:00:00 2001
From: wxywb
Date: Fri, 14 Oct 2022 11:11:04 +0800
Subject: [PATCH] add the doc.

Signed-off-by: wxywb
---
 README.md        | 104 ++++++++++++++++++++++++++++++++++++++++++++++-
 jclip.py         |   9 ++--
 requirements.txt |   3 ++
 3 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b1bde4b..c2eb97d 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,104 @@
-# japanese-clip
+# Japanese Image-Text Retrieval Embedding with CLIP

*author: David Wang*
## Description

This operator extracts features from an image or a piece of text with [Japanese-CLIP](https://github.com/rinnakk/japanese-clip), developed by [rinna Co., Ltd.](https://rinna.co.jp/). Japanese-CLIP jointly trains an image encoder and a Japanese text encoder to maximize the cosine similarity between matching image-text pairs, so the embeddings it produces can be compared directly across the two modalities.
## Code Example

Load an image from the path './teddy.jpg' to generate an image embedding.

Read the text 'スケートボードに乗っているテディベア。' ("A teddy bear riding a skateboard.") to generate a text embedding.

*Write the pipeline in simplified style*:

```python
import towhee

towhee.glob('./teddy.jpg') \
    .image_decode() \
    .image_text_embedding.japanese_clip(model_name='japanese-clip-vit-b-16', modality='image') \
    .show()

towhee.dc(["スケートボードに乗っているテディベア。"]) \
    .image_text_embedding.japanese_clip(model_name='japanese-clip-vit-b-16', modality='text') \
    .show()
```

*(result figures: result1, result2)*

*Write the same pipeline with explicit input/output name specifications:*

```python
import towhee

towhee.glob['path']('./teddy.jpg') \
    .image_decode['path', 'img']() \
    .image_text_embedding.japanese_clip['img', 'vec'](model_name='japanese-clip-vit-b-16', modality='image') \
    .select['img', 'vec']() \
    .show()

towhee.dc['text'](["スケートボードに乗っているテディベア。"]) \
    .image_text_embedding.japanese_clip['text', 'vec'](model_name='japanese-clip-vit-b-16', modality='text') \
    .select['text', 'vec']() \
    .show()
```

*(result figures: result1, result2)*
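Because both modalities are embedded into the same space, comparing an image with a caption reduces to a vector similarity. The snippet below is a minimal, self-contained sketch of that scoring step: the random 512-dimensional arrays are placeholders for the `vec` outputs of the pipelines above (the actual dimensionality depends on the chosen model).

```python
import numpy as np

# Placeholders standing in for the image and text embeddings produced above.
img_vec = np.random.rand(512).astype(np.float32)
text_vec = np.random.rand(512).astype(np.float32)

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two 1-D vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# A matching image-caption pair should score noticeably higher than a mismatched one.
print(cosine_similarity(img_vec, text_vec))
```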
## Factory Constructor

Create the operator via the following factory method:

***japanese_clip(model_name, modality)***

**Parameters:**

***model_name:*** *str*

  The model name of Japanese-CLIP. Supported model names (see the example after this section):
- japanese-clip-vit-b-16
- japanese-cloob-vit-b-16

***modality:*** *str*

  Which modality (*image* or *text*) is used to generate the embedding.
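For instance, switching from the CLIP variant to the CLOOB variant only changes the `model_name` argument; the rest of the pipeline from the Code Example stays the same. A short sketch in the simplified pipeline style used above:

```python
import towhee

# Same text pipeline as in the Code Example, but using the CLOOB variant of the model.
towhee.dc(["スケートボードに乗っているテディベア。"]) \
    .image_text_embedding.japanese_clip(model_name='japanese-cloob-vit-b-16', modality='text') \
    .show()
```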
## Interface

An image-text embedding operator takes a [towhee image](link/to/towhee/image/api/doc) or a string as input and generates an embedding in ndarray.

**Parameters:**

***data:*** *towhee.types.Image (a sub-class of numpy.ndarray)* or *str*

  The data (image or text, depending on the specified modality) from which to generate the embedding.

**Returns:** *numpy.ndarray*

  The embedding extracted by the model.

diff --git a/jclip.py b/jclip.py
index c11e8bf..6943442 100644
--- a/jclip.py
+++ b/jclip.py
@@ -33,9 +33,10 @@ class Jaclip(NNOperator):
         sys.path.append(path)
         import japanese_clip as ja_clip
         sys.path.pop()
+        cfg = self._configs()[model_name]
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self._modality = modality
-        model, preprocess = ja_clip.load("rinna/japanese-clip-vit-b-16", cache_dir="{}/weights/japanese_clip".format(path), device=self.device)
+        model, preprocess = ja_clip.load(cfg['weights'], cache_dir="{}/weights/japanese_clip".format(path), device=self.device)
         self.model = model
         self.tfms = preprocess
         self.tokenizer = ja_clip.load_tokenizer()
@@ -75,6 +76,8 @@ class Jaclip(NNOperator):
 
     def _configs(self):
         config = {}
-        config['blip_base'] = {}
-        config['blip_base']['weights'] = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth'
+        config['japanese-clip-vit-b-16'] = {}
+        config['japanese-clip-vit-b-16']['weights'] = 'rinna/japanese-clip-vit-b-16'
+        config['japanese-cloob-vit-b-16'] = {}
+        config['japanese-cloob-vit-b-16']['weights'] = 'rinna/japanese-cloob-vit-b-16'
         return config

diff --git a/requirements.txt b/requirements.txt
index e69de29..ce14b4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+torch
+towhee
+
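For reference, the snippet below is a rough sketch of how the wrapped rinna japanese-clip library is typically driven, adapted from that project's README. `ja_clip.load()` and `ja_clip.load_tokenizer()` appear in `jclip.py` above; `ja_clip.tokenize()`, `get_image_features()` and `get_text_features()` are upstream API names that may differ between versions, so treat this as an illustration rather than part of this operator's interface.

```python
import torch
from PIL import Image
import japanese_clip as ja_clip

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model, its preprocessing transform, and the tokenizer.
model, preprocess = ja_clip.load("rinna/japanese-clip-vit-b-16", cache_dir="/tmp/japanese_clip", device=device)
tokenizer = ja_clip.load_tokenizer()

# Preprocess the image and tokenize a few candidate Japanese captions.
image = preprocess(Image.open("./teddy.jpg")).unsqueeze(0).to(device)
encodings = ja_clip.tokenize(
    texts=["スケートボードに乗っているテディベア。", "犬", "猫"],
    max_seq_len=77,
    device=device,
    tokenizer=tokenizer,
)

with torch.no_grad():
    image_features = model.get_image_features(image)       # image embedding
    text_features = model.get_text_features(**encodings)   # text embeddings
    # The caption that best matches the image gets the highest probability.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print(text_probs)
```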