From 4ff6a0805bb5b61913a66d13104614c98d699d59 Mon Sep 17 00:00:00 2001 From: shiyu22 Date: Tue, 30 May 2023 18:53:27 +0800 Subject: [PATCH] Update splitter Signed-off-by: shiyu22 --- README.md | 24 ++++++++++++++---------- eqa_insert.py | 5 +++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5ee4f27..dd31dab 100644 --- a/README.md +++ b/README.md @@ -69,40 +69,44 @@ Then you can run `collection.flush() ` and `collection.num_entities` to check th ### **EnhancedQAInsertConfig** -#### **Configuration for [Text Spliter](https://towhee.io/towhee/text-spliter):** +#### **Configuration for [Text Splitter](https://towhee.io/towhee/text-splitter):** -***chunk_size: int*** +***type***: str + +The type of splitter, defaults to 'RecursiveCharacter'. You can set this parameter to one of ['[RecursiveCharacter](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/recursive_text_splitter.html)', '[Markdown](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/markdown.html)', '[PythonCode](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/python.html)', '[Character](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/character_text_splitter.html#)', '[NLTK](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/nltk.html)', '[Spacy](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/spacy.html)', '[Tiktoken](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/tiktoken_splitter.html)', '[HuggingFace](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/huggingface_length_function.html)']. + +***chunk_size***: int The size of each chunk, defaults to 300. #### **Configuration for Sentence Embedding:** -***model: str*** +***model***: str The model name in the sentence embedding pipeline, defaults to `'all-MiniLM-L6-v2'`. 
You can refer to the above [Model(s) list ](https://towhee.io/tasks/detail/operator?field_name=Natural-Language-Processing&task_name=Sentence-Embedding)to set the model, some of these models are from [HuggingFace](https://huggingface.co/) (open source), and some are from [OpenAI](https://openai.com/) (not open, required API key). -***openai_api_key: str*** +***openai_api_key***: str The api key of openai, default to `None`. This key is required if the model is from OpenAI, you can check the model provider in the above [Model(s) list](https://towhee.io/sentence-embedding/openai). -***device:*** ***int*** +***device***: int The number of devices, defaults to `-1`, which means using the CPU. If the setting is not `-1`, the specified GPU device will be used. #### **Configuration for [Milvus](https://towhee.io/ann-insert/milvus-client):** -***host: str*** +***host***: str Host of Milvus vector database, default is `'127.0.0.1'`. -***port: str*** +***port***: str Port of Milvus vector database, default is `'19530'`. -***collection_name: str*** +***collection_name***: str The collection name for Milvus vector database, is required when inserting data into Milvus. -***user: str*** +***user***: str The user name for [Cloud user](https://zilliz.com/cloud), defaults to `None`. -***password: str*** +***password***: str The user password for [Cloud user](https://zilliz.com/cloud), defaults to `None`.
diff --git a/eqa_insert.py b/eqa_insert.py index ca26215..e009b8c 100644 --- a/eqa_insert.py +++ b/eqa_insert.py @@ -21,7 +21,8 @@ class EnhancedQAInsertConfig: Config of pipeline """ def __init__(self): - # config for text_spliter + # config for text_splitter + self.type = 'RecursiveCharacter' self.chunk_size = 300 # config for sentence_embedding self.model = 'all-MiniLM-L6-v2' @@ -78,7 +79,7 @@ def enhanced_qa_insert_pipe(config): return ( pipe.input('doc') .map('doc', 'text', ops.text_loader()) - .flat_map('text', 'sentence', ops.text_spliter(chunk_size=config.chunk_size)) + .flat_map('text', 'sentence', ops.text_splitter(type=config.type, chunk_size=config.chunk_size)) .map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config) .map('embedding', 'embedding', ops.towhee.np_normalize()) .map(('doc', 'sentence', 'embedding'), 'mr', insert_milvus_op)