logo
Browse Source

Add splitter_kwargs param

Signed-off-by: shiyu22 <shiyu.chen@zilliz.com>
main
shiyu22 2 years ago
parent
commit
f36e783d42
  1. 4
      README.md
  2. 8
      eqa_insert.py

4
README.md

@@ -78,6 +78,10 @@ The type of splitter, defaults to 'RecursiveCharacter'. You can set this paramet
***chunk_size***: int ***chunk_size***: int
The size of each chunk, defaults to 300. The size of each chunk, defaults to 300.
***splitter_kwargs***: dict
Additional keyword arguments passed through to the text splitter (alongside ***type*** and ***chunk_size***), defaults to {}.
#### **Configuration for Sentence Embedding:** #### **Configuration for Sentence Embedding:**
***model***: str ***model***: str

8
eqa_insert.py

@@ -24,6 +24,7 @@ class EnhancedQAInsertConfig:
# config for text_splitter # config for text_splitter
self.type = 'RecursiveCharacter' self.type = 'RecursiveCharacter'
self.chunk_size = 300 self.chunk_size = 300
self.splitter_kwargs = {}
# config for sentence_embedding # config for sentence_embedding
self.model = 'all-MiniLM-L6-v2' self.model = 'all-MiniLM-L6-v2'
self.openai_api_key = None self.openai_api_key = None
@@ -61,6 +62,11 @@ def _get_embedding_op(config):
@AutoPipes.register @AutoPipes.register
def enhanced_qa_insert_pipe(config): def enhanced_qa_insert_pipe(config):
text_split_op = ops.text_splitter(type=config.type,
chunk_size=config.chunk_size,
**config.splitter_kwargs)
allow_triton, sentence_embedding_op = _get_embedding_op(config) allow_triton, sentence_embedding_op = _get_embedding_op(config)
sentence_embedding_config = {} sentence_embedding_config = {}
if allow_triton: if allow_triton:
@@ -79,7 +85,7 @@ def enhanced_qa_insert_pipe(config):
return ( return (
pipe.input('doc') pipe.input('doc')
.map('doc', 'text', ops.text_loader()) .map('doc', 'text', ops.text_loader())
.flat_map('text', 'sentence', ops.text_splitter(type=config.type, chunk_size=config.chunk_size))
.flat_map('text', 'sentence', text_split_op)
.map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config) .map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config)
.map('embedding', 'embedding', ops.towhee.np_normalize()) .map('embedding', 'embedding', ops.towhee.np_normalize())
.map(('doc', 'sentence', 'embedding'), 'mr', insert_milvus_op) .map(('doc', 'sentence', 'embedding'), 'mr', insert_milvus_op)

Loading…
Cancel
Save