diff --git a/README.md b/README.md index bfb4515..b76441f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,86 @@ -# rerank +# Rerank QA Content +## Description + +The Rerank operator reorders the list of candidate documents for a query. It uses an [MS MARCO Cross-Encoders](https://www.sbert.net/docs/pretrained_cross-encoders.html#ms-marco) model to compute relevance scores and then reorders the documents by those scores. + +
+ + + +## Code Example + +- Run with ops + +```Python +from towhee import ops + +op = ops.rerank() +res = op('What is Towhee?', + ['Towhee is Towhee is a cutting-edge framework to deal with unstructure data.', 'I do not know about towhee', 'Towhee has many powerful operators.', 'The weather is good' ], + 0) +``` + +- Run a pipeline + +```python +from towhee import ops, pipe, DataCollection + +p = (pipe.input('query', 'doc', 'threshold') + .map(('query', 'doc', 'threshold'), ('doc', 'score'), ops.rerank()) + .flat_map(('doc', 'score'), ('doc', 'score'), lambda x, y: [(i, j) for i, j in zip(x, y)]) + .output('query', 'doc', 'score') + ) + +DataCollection(p('What is Towhee?', + ['Towhee is Towhee is a cutting-edge framework to deal with unstructure data.', 'I do not know about towhee', 'Towhee has many powerful operators.', 'The weather is good' ], + 0) + ).show() +``` + + + +
+ + + +## Factory Constructor + +Create the operator via the following factory method: + +***towhee.rerank(model_name: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2')*** + +**Parameters:** + + ***model_name***: str + + The name of the CrossEncoder model. Choose one from the [Model List](https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#models-performance). + +
+ + + +## Interface + +This operator sorts the documents by their relevance to the query and returns them together with their scores. An optional threshold can be set to filter the results. + +**Parameters:** + + ***query***: str + + The query content. + + ***docs***: list + + A list of sentences whose relevance to the query content is scored. + + ***threshold***: float + + The minimum score a document must reach to be kept; documents scoring below it are dropped. Defaults to None, i.e., no filtering. + + +
+ +**Return**: List[str], List[float] + +The list of documents after rerank and the list of corresponding scores. \ No newline at end of file diff --git a/rerank.py b/rerank.py index 59acaa6..ff8f233 100644 --- a/rerank.py +++ b/rerank.py @@ -6,7 +6,7 @@ from towhee.operator import NNOperator class ReRank(NNOperator): - def __init__(self, model_name: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2'): + def __init__(self, model_name: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'): super().__init__() self._model_name = model_name self._model = CrossEncoder(self._model_name, max_length=1000) @@ -20,4 +20,4 @@ class ReRank(NNOperator): else: re_docs = [docs[i] for i in re_ids if scores[i] >= threshold] re_scores = [scores[i] for i in re_ids if scores[i] >= threshold] - return re_docs, re_scores \ No newline at end of file + return re_docs, re_scores diff --git a/result.png b/result.png new file mode 100644 index 0000000..d8416e5 Binary files /dev/null and b/result.png differ