diff --git a/README.md b/README.md index 9b599c5..c0b2129 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,91 @@ -# faiss +# Operator: ANN Search: Faiss +*author: shiyu* + +
+ + + +## Description + +Search embeddings in [Faiss](https://github.com/facebookresearch/faiss). **Please make sure you have inserted data into Faiss before searching**. + +
+ + + +## Code Example + +- Insert data into Faiss first + +```python +import numpy as np +import towhee + +vec = np.random.random((10, 100)).astype('float32') +ids = list(i for i in range(10)) + +x = towhee.dc['id'](ids) \ + .runas_op['id', 'vec'](func=lambda x: vec[x]) \ + .to_faiss['id', 'vec'](findex='index.bin') +``` + +- Example + +*Write the pipeline in simplified style:* + +```python +query = vec[0:2] +towhee.dc(query) \ + .ann_search.faiss(findex='index.bin') +``` + +*Write the same pipeline with explicit input/output name specifications:* + +```python +query = vec[0:2] +towhee.dc['vec'](query) \ + .ann_search.faiss['vec', 'results'](findex='index.bin') \ + .show() +``` + + + +
+ + + +## Factory Constructor + +Create the operator via the following factory method: + +***ann_search.faiss(findex)*** + + + +**Parameters:** + + + +***findex:*** *str* or *faiss.INDEX* + +The path to a Faiss index file, or an already loaded Faiss index. + + +
+
+
+
+## Interface
+
+**Parameters:**
+
+***query:*** *list*
+
+Query embeddings in Faiss
+
+
+
+**Returns:** *Entity*
+
+Return the results in Faiss with `key` and `score`.
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..684db96
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,4 @@
+from .faiss import Faiss
+
+def faiss(*args, **kwargs):
+    return Faiss(*args, **kwargs)
\ No newline at end of file
diff --git a/faiss.py b/faiss.py
new file mode 100644
index 0000000..fc9569c
--- /dev/null
+++ b/faiss.py
@@ -0,0 +1,77 @@
+import numpy as np
+from pathlib import Path
+import faiss
+from towhee import register
+from towhee.utils.faiss_utils import KVStorage
+from towhee.functional.entity import Entity
+
+
+@register(output_schema=['result'])
+class Faiss:
+    """
+    Search for embedding vectors in Faiss. Note that the index must already contain data
+    before searching, refer to DataCollection Mixin `to_faiss`.
+
+    Args:
+        findex (`str` or `faiss.INDEX`):
+            The path to a faiss index file, or an already loaded faiss index.
+        kwargs
+            The kwargs passed to index.search, refer to https://github.com/facebookresearch/faiss/wiki. And the parameter `k` defaults to 10.
+
+    Examples:
+
+    >>> import towhee
+    >>> res = (
+    ...     towhee.glob['path']('./*.jpg')
+    ...     .image_decode['path', 'img']()
+    ...     .image_embedding.timm['img', 'vec'](model_name='resnet50')
+    ...     .ann_search.faiss['vec', 'results'](findex='./faiss/faiss.index')
+    ...     .to_list()
+    ... )
+    [<Entity dict_keys(['path', 'img', 'vec', 'results'])>,
+     <Entity dict_keys(['path', 'img', 'vec', 'results'])>]
+    """
+    def __init__(self, findex, **kwargs):
+        self.faiss_index = findex
+        self.kwargs = kwargs
+        self.kv_storage = None
+        if isinstance(findex, str):
+            # NOTE(review): the key-value file name is derived from the index path and
+            # presumably has to mirror whatever wrote the index (`to_faiss`) -- confirm
+            # before changing. `strip('./')` removes any leading/trailing '.' and '/'
+            # characters (not just a './' prefix) and `replace` rewrites every dot.
+            kv_file = findex.strip('./').replace('.', '_kv.')
+            index_file = Path(findex)
+            self.faiss_index = faiss.read_index(str(index_file))
+            # The key-value store is optional: without it raw faiss ids are returned.
+            if Path(kv_file).exists():
+                self.kv_storage = KVStorage(kv_file)
+
+    def __call__(self, query: list):
+        """
+        Search the index for the nearest neighbours of one query embedding.
+
+        Args:
+            query (`list`):
+                The query embedding; it is wrapped into a one-row batch before searching.
+
+        Returns:
+            A list of `Entity` with `key` (the stored key when a key-value file was
+            loaded, otherwise the faiss id) and `score`, one per neighbour found.
+        """
+        # `k` defaults to 10 unless it was passed to the constructor.
+        search_args = {'k': 10, **self.kwargs}
+
+        # Faiss works on float32 matrices, so cast explicitly (a plain Python list
+        # would otherwise become float64).
+        batch = np.array([query], dtype='float32')
+        scores, ids = self.faiss_index.search(batch, **search_args)
+
+        result = []
+        for idx, score in zip(ids[0].tolist(), scores[0]):
+            if idx == -1:
+                # Faiss pads the result with id -1 when fewer than k neighbours exist.
+                continue
+            key = idx if self.kv_storage is None else self.kv_storage.get(idx)
+            result.append(Entity(key=key, score=score))
+        return result
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2b48261
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+faiss-cpu
+numpy
+towhee
\ No newline at end of file
diff --git a/result.png b/result.png
new file mode 100644
index 0000000..22ee3b1
Binary files /dev/null and b/result.png differ