diff --git a/README.md b/README.md
index 4cc011e..6f5b2bf 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
## Description
-This operator generates the caption with [CapDec](https://arxiv.org/abs/2211.00575) which describes the content of the given image. ExpansionNet v2 introduces the Block Static Expansion which distributes and processes the input over a heterogeneous and arbitrarily big collection of sequences characterized by a different length compared to the input one. This is an adaptation from [DavidHuji/CapDec](https://github.com/DavidHuji/CapDec).
+This operator generates the caption with [CaMEL](https://arxiv.org/abs/2202.10492) which describes the content of the given image. CaMEL is a novel Transformer-based architecture for image captioning which leverages the interaction of two interconnected language models that learn from each other during the training phase. The interplay between the two language models follows a mean teacher learning paradigm with knowledge distillation. This is an adaptation from [aimagelab/camel](https://github.com/aimagelab/camel).
@@ -25,7 +25,7 @@ import towhee
towhee.glob('./image.jpg') \
.image_decode() \
- .image_captioning.capdec(model_name='capdec_noise_0') \
+ .image_captioning.camel(model_name='camel_mesh') \
.show()
```
@@ -37,7 +37,7 @@ import towhee
towhee.glob['path']('./image.jpg') \
.image_decode['path', 'img']() \
- .image_captioning.capdec['img', 'text'](model_name='capdec_noise_0') \
+ .image_captioning.camel['img', 'text'](model_name='camel_mesh') \
.select['img', 'text']() \
.show()
```
@@ -51,17 +51,14 @@ towhee.glob['path']('./image.jpg') \
Create the operator via the following factory method
-***capdec(model_name)***
+***camel(model_name)***
**Parameters:**
***model_name:*** *str*
- The model name of CapDec. Supported model names:
-- capdec_noise_0
-- capdec_noise_01
-- capdec_noise_001
-- capdec_noise_0001
+ The model name of CaMEL. Supported model names:
+- camel_mesh
diff --git a/camel.py b/camel.py
index 63dcd69..36ef798 100644
--- a/camel.py
+++ b/camel.py
@@ -34,15 +34,18 @@ class Camel(NNOperator):
"""
def _gen_args(self):
args = edict()
- args.N_enc = 3
- args.N_dec = 3
- args.d_model = 512
- args.d_ff = 2048
- args.head = 8
- args.m = 40
- args.disable_mesh = True
- args.d_model = 512
- args.with_pe = True
+ args.N_dec=3
+ args.N_enc=3
+ args.batch_size=25
+ args.d_ff=2048
+ args.d_model=512
+ args.disable_mesh=False
+ args.head=8
+ args.image_dim=3072
+ args.m=40
+ args.network='target'
+ args.with_pe=False
+ args.workers=0
return args
def __init__(self, model_name: str):
@@ -58,8 +61,7 @@ class Camel(NNOperator):
args = self._gen_args()
path = str(Path(__file__).parent)
self.clip_model, self.clip_tfms = clip.load('RN50x16', jit=False)
- #import ipdb
- #ipdb.set_trace()
+
self.image_model = self.clip_model.visual
self.image_model.forward = self.image_model.intermediate_features
image_field = ImageField(transform=self.clip_tfms)
@@ -75,8 +77,8 @@ class Camel(NNOperator):
self.model.forward = self.model.beam_search
self.image_model = self.image_model.to(self.device)
- self.model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
- self.model = self.model.eval()
+ self.model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))['state_dict_t'])
+ self.model.eval()
sys.path.pop()
@@ -107,15 +109,15 @@ class Camel(NNOperator):
@arg(1, to_image_color('RGB'))
def _inference_from_image(self, img):
img = self._preprocess(img)
- text, _ = self.model.beam_search(img, beam_size=5, out_size=1)
+ feat = self.image_model(img)
+ tokens, _ = self.model.beam_search(feat, beam_size=5, out_size=1)
+    text = self.text_field.decode(tokens)
return text
def _configs(self):
config = {}
- config['camel_nomesh'] = {}
- config['camel_nomesh']['weights'] = 's3://pretrainedweights.towhee.io/image-captioning/camel/camel_nomesh.pth'
config['camel_mesh'] = {}
- config['camel_mesh']['weights'] = 's3://pretrainedweights.towhee.io/image-captioning/camel/camel_mesh.pth'
+ config['camel_mesh']['weights'] = 'image-captioning/camel/camel_mesh.pth'
return config
if __name__ == '__main__':
diff --git a/cap.png b/cap.png
new file mode 100644
index 0000000..1663e47
Binary files /dev/null and b/cap.png differ
diff --git a/tabular.png b/tabular.png
new file mode 100644
index 0000000..3364371
Binary files /dev/null and b/tabular.png differ