
update the figure.

Signed-off-by: wxywb <xy.wang@zilliz.com>
Branch: main
wxywb committed 3 years ago
commit 4838d5a200

  1. README.md (15)
  2. camel.py (36)
  3. cap.png (BIN)
  4. tabular.png (BIN)

README.md (15)

@@ -8,7 +8,7 @@
 ## Description
-This operator generates the caption with [CapDec](https://arxiv.org/abs/2211.00575) which describes the content of the given image. ExpansionNet v2 introduces the Block Static Expansion which distributes and processes the input over a heterogeneous and arbitrarily big collection of sequences characterized by a different length compared to the input one. This is an adaptation from [DavidHuji/CapDec](https://github.com/DavidHuji/CapDec).
+This operator generates the caption with [CaMEL](https://arxiv.org/abs/2202.10492), which describes the content of the given image. CaMEL is a novel Transformer-based architecture for image captioning that leverages the interaction of two interconnected language models learning from each other during the training phase. The interplay between the two language models follows a mean teacher learning paradigm with knowledge distillation. This is an adaptation from [aimagelab/camel](https://github.com/aimagelab/camel).
 <br />
@@ -25,7 +25,7 @@ import towhee
 towhee.glob('./image.jpg') \
     .image_decode() \
-    .image_captioning.capdec(model_name='capdec_noise_0') \
+    .image_captioning.camel(model_name='camel_mesh') \
     .show()
 ```
 <img src="./cap.png" alt="result1" style="height:20px;"/>
@@ -37,7 +37,7 @@ import towhee
 towhee.glob['path']('./image.jpg') \
     .image_decode['path', 'img']() \
-    .image_captioning.capdec['img', 'text'](model_name='capdec_noise_0') \
+    .image_captioning.camel['img', 'text'](model_name='camel_mesh') \
     .select['img', 'text']() \
     .show()
 ```
@@ -51,17 +51,14 @@ towhee.glob['path']('./image.jpg') \
 Create the operator via the following factory method

-***capdec(model_name)***
+***camel(model_name)***

 **Parameters:**

 ***model_name:*** *str*

-​ The model name of CapDec. Supported model names:
-- capdec_noise_0
-- capdec_noise_01
-- capdec_noise_001
-- capdec_noise_0001
+​ The model name of CaMEL. Supported model names:
+- camel_mesh

 <br />
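The new description mentions a "mean teacher learning paradigm with knowledge distillation". As a rough illustration of that paradigm (not code from this repository or from aimagelab/camel; the decay value, temperature, and loss form below are assumptions chosen for clarity), the target language model tracks an exponential moving average of the online one, while the online model is additionally trained to match the target's predictions:

```python
import copy
import torch
import torch.nn.functional as F

def ema_update(target, online, decay=0.999):
    # Mean teacher step: the target (teacher) weights follow an exponential
    # moving average of the online (student) weights; no gradients flow here.
    with torch.no_grad():
        for p_t, p_o in zip(target.parameters(), online.parameters()):
            p_t.mul_(decay).add_(p_o, alpha=1.0 - decay)

def distillation_loss(online_logits, target_logits, temperature=1.0):
    # Knowledge distillation: push the online model's token distribution
    # toward the (detached) distribution predicted by the target model.
    p_teacher = F.softmax(target_logits.detach() / temperature, dim=-1)
    log_p_student = F.log_softmax(online_logits / temperature, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction='batchmean')

online_model = torch.nn.Linear(16, 8)       # stand-in for a captioning language model
target_model = copy.deepcopy(online_model)  # target starts as a copy, updated only by EMA
```

The `args.network = 'target'` default and the `'state_dict_t'` checkpoint key in the camel.py changes below indicate that it is the target model whose weights are loaded for captioning.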

camel.py (36)

@@ -34,15 +34,18 @@ class Camel(NNOperator):
     """
     def _gen_args(self):
         args = edict()
-        args.N_enc = 3
-        args.N_dec = 3
-        args.d_model = 512
-        args.d_ff = 2048
-        args.head = 8
-        args.m = 40
-        args.disable_mesh = True
-        args.d_model = 512
-        args.with_pe = True
+        args.N_dec = 3
+        args.N_enc = 3
+        args.batch_size = 25
+        args.d_ff = 2048
+        args.d_model = 512
+        args.disable_mesh = False
+        args.head = 8
+        args.image_dim = 3072
+        args.m = 40
+        args.network = 'target'
+        args.with_pe = False
+        args.workers = 0
         return args

     def __init__(self, model_name: str):
@@ -58,8 +61,7 @@ class Camel(NNOperator):
         args = self._gen_args()
         path = str(Path(__file__).parent)
         self.clip_model, self.clip_tfms = clip.load('RN50x16', jit=False)
-        #import ipdb
-        #ipdb.set_trace()
         self.image_model = self.clip_model.visual
         self.image_model.forward = self.image_model.intermediate_features
         image_field = ImageField(transform=self.clip_tfms)
@@ -75,8 +77,8 @@ class Camel(NNOperator):
         self.model.forward = self.model.beam_search
         self.image_model = self.image_model.to(self.device)
-        self.model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
-        self.model = self.model.eval()
+        self.model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))['state_dict_t'])
+        self.model.eval()
         sys.path.pop()
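The weight-loading change above implies that the released checkpoint is a dictionary of entries rather than a bare state dict, with the target (mean teacher) captioner stored under `'state_dict_t'`. A minimal inspection sketch, assuming a locally downloaded file (the file name is hypothetical; only the `'state_dict_t'` key is confirmed by the diff):

```python
import torch

ckpt = torch.load('camel_mesh.pth', map_location=torch.device('cpu'))
print(sorted(ckpt.keys()))                # 'state_dict_t' is expected among the entries
target_weights = ckpt['state_dict_t']     # weights of the target (mean teacher) model
# model.load_state_dict(target_weights); model.eval()   # as done in Camel.__init__
```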
@@ -107,15 +109,15 @@ class Camel(NNOperator):
     @arg(1, to_image_color('RGB'))
     def _inference_from_image(self, img):
         img = self._preprocess(img)
-        text, _ = self.model.beam_search(img, beam_size=5, out_size=1)
+        feat = self.image_model(img)
+        tokens, _ = self.model.beam_search(feat, beam_size=5, out_size=1)
+        text = text_field.decode(tokens)
         return text

     def _configs(self):
         config = {}
-        config['camel_nomesh'] = {}
-        config['camel_nomesh']['weights'] = 's3://pretrainedweights.towhee.io/image-captioning/camel/camel_nomesh.pth'
         config['camel_mesh'] = {}
-        config['camel_mesh']['weights'] = 's3://pretrainedweights.towhee.io/image-captioning/camel/camel_mesh.pth'
+        config['camel_mesh']['weights'] = 'image-captioning/camel/camel_mesh.pth'
         return config

 if __name__ == '__main__':
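The updated `_inference_from_image` splits captioning into an explicit feature-extraction step followed by beam search and token decoding. A minimal sketch of that flow with stand-in modules (the real components are the CLIP RN50x16 visual backbone with `forward` rebound to `intermediate_features`, the CaMEL captioner with `forward` rebound to `beam_search`, and a text field object; the stand-in classes and all shapes below are illustrative only):

```python
import torch

class FakeImageModel(torch.nn.Module):
    # Stand-in for the CLIP visual backbone: maps a preprocessed image to a
    # grid of features whose last dimension matches args.image_dim = 3072.
    def forward(self, img):
        return torch.randn(img.shape[0], 144, 3072)

class FakeCaptioner(torch.nn.Module):
    # Stand-in for the CaMEL target model: beam search returns token ids and scores.
    def beam_search(self, feat, beam_size=5, out_size=1):
        tokens = torch.randint(0, 1000, (feat.shape[0], 12))
        return tokens, torch.zeros(feat.shape[0])

class FakeTextField:
    # Stand-in for the text field that maps token ids back to a caption string.
    def decode(self, tokens):
        return ['a caption'] * tokens.shape[0]

image_model, model, text_field = FakeImageModel(), FakeCaptioner(), FakeTextField()
img = torch.randn(1, 3, 384, 384)                              # already preprocessed
feat = image_model(img)                                        # 1. visual features
tokens, _ = model.beam_search(feat, beam_size=5, out_size=1)   # 2. token ids
print(text_field.decode(tokens))                               # 3. caption text
```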

cap.png (BIN)

Binary file not shown. After: 8.0 KiB

tabular.png (BIN)

Binary file not shown. After: 173 KiB
