
update the figure.

Signed-off-by: wxywb <xy.wang@zilliz.com>
Branch: main
wxywb committed 3 years ago
commit 4838d5a200

  1. README.md (15)
  2. camel.py (36)
  3. cap.png (BIN)
  4. tabular.png (BIN)

README.md (15)

@@ -8,7 +8,7 @@
 ## Description
-This operator generates the caption with [CapDec](https://arxiv.org/abs/2211.00575) which describes the content of the given image. ExpansionNet v2 introduces the Block Static Expansion which distributes and processes the input over a heterogeneous and arbitrarily big collection of sequences characterized by a different length compared to the input one. This is an adaptation from [DavidHuji/CapDec](https://github.com/DavidHuji/CapDec).
+This operator generates the caption with [CaMEL](https://arxiv.org/abs/2202.10492), which describes the content of the given image. CaMEL is a novel Transformer-based architecture for image captioning that leverages the interaction of two interconnected language models learning from each other during the training phase. The interplay between the two language models follows a mean teacher learning paradigm with knowledge distillation. This is an adaptation from [aimagelab/camel](https://github.com/aimagelab/camel).
 <br />
@@ -25,7 +25,7 @@ import towhee
 towhee.glob('./image.jpg') \
     .image_decode() \
-    .image_captioning.capdec(model_name='capdec_noise_0') \
+    .image_captioning.camel(model_name='camel_mesh') \
     .show()
 ```
 <img src="./cap.png" alt="result1" style="height:20px;"/>
@@ -37,7 +37,7 @@ import towhee
 towhee.glob['path']('./image.jpg') \
     .image_decode['path', 'img']() \
-    .image_captioning.capdec['img', 'text'](model_name='capdec_noise_0') \
+    .image_captioning.camel['img', 'text'](model_name='camel_mesh') \
     .select['img', 'text']() \
     .show()
 ```
@@ -51,17 +51,14 @@ towhee.glob['path']('./image.jpg') \
 Create the operator via the following factory method

-***capdec(model_name)***
+***camel(model_name)***

 **Parameters:**

 ***model_name:*** *str*

-​ The model name of CapDec. Supported model names:
-- capdec_noise_0
-- capdec_noise_01
-- capdec_noise_001
-- capdec_noise_0001
+​ The model name of CaMEL. Supported model names:
+- camel_mesh

 <br />
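The new description mentions a "mean teacher learning paradigm with knowledge distillation". As a rough illustration of that paradigm (not code from this repository or from aimagelab/camel; the decay value, temperature, and loss form below are assumptions chosen for clarity), the target language model tracks an exponential moving average of the online one, while the online model is additionally trained to match the target's predictions:

```python
import copy
import torch
import torch.nn.functional as F

def ema_update(target, online, decay=0.999):
    # Mean teacher step: the target (teacher) weights follow an exponential
    # moving average of the online (student) weights; no gradients flow here.
    with torch.no_grad():
        for p_t, p_o in zip(target.parameters(), online.parameters()):
            p_t.mul_(decay).add_(p_o, alpha=1.0 - decay)

def distillation_loss(online_logits, target_logits, temperature=1.0):
    # Knowledge distillation: push the online model's token distribution
    # toward the (detached) distribution predicted by the target model.
    p_teacher = F.softmax(target_logits.detach() / temperature, dim=-1)
    log_p_student = F.log_softmax(online_logits / temperature, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction='batchmean')

online_model = torch.nn.Linear(16, 8)       # stand-in for a captioning language model
target_model = copy.deepcopy(online_model)  # target starts as a copy, updated only by EMA
```

The `args.network = 'target'` default and the `'state_dict_t'` checkpoint key in the camel.py changes below indicate that it is the target model whose weights are loaded for captioning.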

camel.py (36)

@@ -34,15 +34,18 @@ class Camel(NNOperator):
     """
     def _gen_args(self):
         args = edict()
-        args.N_enc = 3
-        args.N_dec = 3
-        args.d_model = 512
-        args.d_ff = 2048
-        args.head = 8
-        args.m = 40
-        args.disable_mesh = True
-        args.d_model = 512
-        args.with_pe = True
+        args.N_dec = 3
+        args.N_enc = 3
+        args.batch_size = 25
+        args.d_ff = 2048
+        args.d_model = 512
+        args.disable_mesh = False
+        args.head = 8
+        args.image_dim = 3072
+        args.m = 40
+        args.network = 'target'
+        args.with_pe = False
+        args.workers = 0
         return args

     def __init__(self, model_name: str):
@@ -58,8 +61,7 @@ class Camel(NNOperator):
         args = self._gen_args()
         path = str(Path(__file__).parent)
         self.clip_model, self.clip_tfms = clip.load('RN50x16', jit=False)
-        #import ipdb
-        #ipdb.set_trace()
         self.image_model = self.clip_model.visual
         self.image_model.forward = self.image_model.intermediate_features
         image_field = ImageField(transform=self.clip_tfms)
@@ -75,8 +77,8 @@ class Camel(NNOperator):
         self.model.forward = self.model.beam_search
         self.image_model = self.image_model.to(self.device)
-        self.model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
-        self.model = self.model.eval()
+        self.model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))['state_dict_t'])
+        self.model.eval()
         sys.path.pop()
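The weight-loading change above implies that the released checkpoint is a dictionary of entries rather than a bare state dict, with the target (mean teacher) captioner stored under `'state_dict_t'`. A minimal inspection sketch, assuming a locally downloaded file (the file name is hypothetical; only the `'state_dict_t'` key is confirmed by the diff):

```python
import torch

ckpt = torch.load('camel_mesh.pth', map_location=torch.device('cpu'))
print(sorted(ckpt.keys()))                # 'state_dict_t' is expected among the entries
target_weights = ckpt['state_dict_t']     # weights of the target (mean teacher) model
# model.load_state_dict(target_weights); model.eval()   # as done in Camel.__init__
```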
@@ -107,15 +109,15 @@ class Camel(NNOperator):
     @arg(1, to_image_color('RGB'))
     def _inference_from_image(self, img):
         img = self._preprocess(img)
-        text, _ = self.model.beam_search(img, beam_size=5, out_size=1)
+        feat = self.image_model(img)
+        tokens, _ = self.model.beam_search(feat, beam_size=5, out_size=1)
+        text = text_field.decode(tokens)
         return text

     def _configs(self):
         config = {}
-        config['camel_nomesh'] = {}
-        config['camel_nomesh']['weights'] = 's3://pretrainedweights.towhee.io/image-captioning/camel/camel_nomesh.pth'
         config['camel_mesh'] = {}
-        config['camel_mesh']['weights'] = 's3://pretrainedweights.towhee.io/image-captioning/camel/camel_mesh.pth'
+        config['camel_mesh']['weights'] = 'image-captioning/camel/camel_mesh.pth'
         return config

 if __name__ == '__main__':
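The updated `_inference_from_image` splits captioning into an explicit feature-extraction step followed by beam search and token decoding. A minimal sketch of that flow with stand-in modules (the real components are the CLIP RN50x16 visual backbone with `forward` rebound to `intermediate_features`, the CaMEL captioner with `forward` rebound to `beam_search`, and a text field object; the stand-in classes and all shapes below are illustrative only):

```python
import torch

class FakeImageModel(torch.nn.Module):
    # Stand-in for the CLIP visual backbone: maps a preprocessed image to a
    # grid of features whose last dimension matches args.image_dim = 3072.
    def forward(self, img):
        return torch.randn(img.shape[0], 144, 3072)

class FakeCaptioner(torch.nn.Module):
    # Stand-in for the CaMEL target model: beam search returns token ids and scores.
    def beam_search(self, feat, beam_size=5, out_size=1):
        tokens = torch.randint(0, 1000, (feat.shape[0], 12))
        return tokens, torch.zeros(feat.shape[0])

class FakeTextField:
    # Stand-in for the text field that maps token ids back to a caption string.
    def decode(self, tokens):
        return ['a caption'] * tokens.shape[0]

image_model, model, text_field = FakeImageModel(), FakeCaptioner(), FakeTextField()
img = torch.randn(1, 3, 384, 384)                              # already preprocessed
feat = image_model(img)                                        # 1. visual features
tokens, _ = model.beam_search(feat, beam_size=5, out_size=1)   # 2. token ids
print(text_field.decode(tokens))                               # 3. caption text
```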

cap.png (BIN)

Binary file not shown. After: 8.0 KiB

tabular.png (BIN)

Binary file not shown. After: 173 KiB
