|
|
@@ -28,11 +28,11 @@ import towhee
 towhee.glob('./teddy.jpg') \
       .image_decode() \
-      .image_text_embedding.clip(model_name='clip_vit_b32', modality='image') \
+      .image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='image') \
       .show()

 towhee.dc(["A teddybear on a skateboard in Times Square."]) \
-      .image_text_embedding.clip(model_name='clip_vit_b32', modality='text') \
+      .image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='text') \
       .show()
 ```

 <img src="https://towhee.io/image-text-embedding/clip/raw/branch/main/vec1.png" alt="result1" style="height:20px;"/>
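For readers trying the pipelines above, the image and text embeddings are typically compared with cosine similarity, since CLIP maps both modalities into a shared space. A minimal sketch, assuming the two vectors have already been pulled out into NumPy arrays; `img_vec` and `text_vec` are placeholder names (and placeholder values), not variables defined by the snippets above:

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Higher cosine similarity means the caption matches the image better.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Placeholder vectors standing in for the outputs of the two pipelines above.
img_vec = np.random.rand(512).astype(np.float32)
text_vec = np.random.rand(512).astype(np.float32)
print(cosine_similarity(img_vec, text_vec))
```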
|
|
@@ -45,12 +45,12 @@ import towhee

 towhee.glob['path']('./teddy.jpg') \
       .image_decode['path', 'img']() \
-      .image_text_embedding.clip['img', 'vec'](model_name='clip_vit_b32', modality='image') \
+      .image_text_embedding.clip['img', 'vec'](model_name='clip_vit_base_patch16', modality='image') \
       .select['img', 'vec']() \
       .show()

 towhee.dc['text'](["A teddybear on a skateboard in Times Square."]) \
-      .image_text_embedding.clip['text','vec'](model_name='clip_vit_b32', modality='text') \
+      .image_text_embedding.clip['text','vec'](model_name='clip_vit_base_patch16', modality='text') \
       .select['text', 'vec']() \
       .show()
 ```
|
|
@@ -112,7 +112,7 @@ Save model to local with specified format.
 ```python
 from towhee import ops

-op = ops.image_text_embedding.clip(model_name='clip_vit_base_16', modality='image').get_op()
+op = ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='image').get_op()
 op.save_model('onnx', 'test.onnx')
 ```
 <br />
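A quick way to sanity-check the exported file is to load it back and inspect the graph signature; a minimal sketch, assuming `onnxruntime` is installed and `test.onnx` was written by the snippet above:

```python
import onnxruntime

# Load the exported graph and print its input/output signatures to confirm
# the export produced a usable model.
sess = onnxruntime.InferenceSession('test.onnx')
for tensor in sess.get_inputs():
    print('input: ', tensor.name, tensor.shape, tensor.type)
for tensor in sess.get_outputs():
    print('output:', tensor.name, tensor.shape, tensor.type)
```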
|
|
@@ -145,7 +145,7 @@ Get a list of all supported model names or supported model names for specified model format.

 from towhee import ops

-op = towhee.ops.image_text_embedding.clip(model_name='clip_vit_base_16', modality='image').get_op()
+op = ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='image').get_op()
 full_list = op.supported_model_names()
 onnx_list = op.supported_model_names(format='onnx')
 print(f'Onnx-support/Total Models: {len(onnx_list)}/{len(full_list)}')
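As a small follow-on, the two lists can also show which models would need a non-ONNX path; this assumes `supported_model_names()` returns the same strings that are accepted as `model_name`, and that `full_list` and `onnx_list` come from the snippet above:

```python
# Models supported by the operator but without an ONNX export path.
not_onnx = sorted(set(full_list) - set(onnx_list))
print(f'Models without ONNX export: {len(not_onnx)}')
print(not_onnx[:5])  # peek at the first few names
```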
|
|
@@ -164,7 +164,7 @@ If you want to train this operator, besides dependency in requirements.txt, you
 ```python
 import towhee

-clip_op = towhee.ops.image_text_embedding.clip(model_name='clip_vit_base_16', modality='image').get_op()
+clip_op = towhee.ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='image').get_op()

 data_args = {
     'dataset_name': 'ydshieh/coco_dataset_script',
|
|
|