|
@ -24,6 +24,7 @@ class EnhancedQAInsertConfig: |
|
|
# config for text_splitter |
|
|
# config for text_splitter |
|
|
self.type = 'RecursiveCharacter' |
|
|
self.type = 'RecursiveCharacter' |
|
|
self.chunk_size = 300 |
|
|
self.chunk_size = 300 |
|
|
|
|
|
self.splitter_kwargs = {} |
|
|
# config for sentence_embedding |
|
|
# config for sentence_embedding |
|
|
self.model = 'all-MiniLM-L6-v2' |
|
|
self.model = 'all-MiniLM-L6-v2' |
|
|
self.openai_api_key = None |
|
|
self.openai_api_key = None |
|
@ -61,6 +62,11 @@ def _get_embedding_op(config): |
|
|
|
|
|
|
|
|
@AutoPipes.register |
|
|
@AutoPipes.register |
|
|
def enhanced_qa_insert_pipe(config): |
|
|
def enhanced_qa_insert_pipe(config): |
|
|
|
|
|
text_split_op = ops.text_splitter(type=config.type, |
|
|
|
|
|
chunk_size=config.chunk_size, |
|
|
|
|
|
**config.splitter_kwargs) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
allow_triton, sentence_embedding_op = _get_embedding_op(config) |
|
|
allow_triton, sentence_embedding_op = _get_embedding_op(config) |
|
|
sentence_embedding_config = {} |
|
|
sentence_embedding_config = {} |
|
|
if allow_triton: |
|
|
if allow_triton: |
|
@ -79,7 +85,7 @@ def enhanced_qa_insert_pipe(config): |
|
|
return ( |
|
|
return ( |
|
|
pipe.input('doc') |
|
|
pipe.input('doc') |
|
|
.map('doc', 'text', ops.text_loader()) |
|
|
.map('doc', 'text', ops.text_loader()) |
|
|
.flat_map('text', 'sentence', ops.text_splitter(type=config.type, chunk_size=config.chunk_size)) |
|
|
|
|
|
|
|
|
.flat_map('text', 'sentence', text_split_op) |
|
|
.map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config) |
|
|
.map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config) |
|
|
.map('embedding', 'embedding', ops.towhee.np_normalize()) |
|
|
.map('embedding', 'embedding', ops.towhee.np_normalize()) |
|
|
.map(('doc', 'sentence', 'embedding'), 'mr', insert_milvus_op) |
|
|
.map(('doc', 'sentence', 'embedding'), 'mr', insert_milvus_op) |
|
|