From f36e783d42002e52c02edaa8330dda3518369c9e Mon Sep 17 00:00:00 2001 From: shiyu22 Date: Tue, 30 May 2023 19:28:02 +0800 Subject: [PATCH] Add splitter_kwargs param Signed-off-by: shiyu22 --- README.md | 4 ++++ eqa_insert.py | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dd31dab..8de47c3 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,10 @@ The type of splitter, defaults to 'RecursiveCharacter'. You can set this paramet ***chunk_size***: int The size of each chunk, defaults to 300. +***splitter_kwargs***: dict + +Extra keyword arguments passed to the text splitter in addition to `type` and `chunk_size`, defaults to {}. + #### **Configuration for Sentence Embedding:** ***model***: str diff --git a/eqa_insert.py b/eqa_insert.py index e009b8c..33698a4 100644 --- a/eqa_insert.py +++ b/eqa_insert.py @@ -24,6 +24,7 @@ class EnhancedQAInsertConfig: # config for text_splitter self.type = 'RecursiveCharacter' self.chunk_size = 300 + self.splitter_kwargs = {} # config for sentence_embedding self.model = 'all-MiniLM-L6-v2' self.openai_api_key = None @@ -61,6 +62,11 @@ def _get_embedding_op(config): @AutoPipes.register def enhanced_qa_insert_pipe(config): + text_split_op = ops.text_splitter(type=config.type, + chunk_size=config.chunk_size, + **config.splitter_kwargs) + + allow_triton, sentence_embedding_op = _get_embedding_op(config) sentence_embedding_config = {} if allow_triton: @@ -79,7 +85,7 @@ def enhanced_qa_insert_pipe(config): return ( pipe.input('doc') .map('doc', 'text', ops.text_loader()) - .flat_map('text', 'sentence', ops.text_splitter(type=config.type, chunk_size=config.chunk_size)) + .flat_map('text', 'sentence', text_split_op) .map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config) .map('embedding', 'embedding', ops.towhee.np_normalize()) .map(('doc', 'sentence', 'embedding'), 'mr', insert_milvus_op)