From 8bdb6fe08f9a3b3df22f54840523c7379885c88a Mon Sep 17 00:00:00 2001 From: Jael Gu Date: Mon, 7 Aug 2023 17:55:24 +0800 Subject: [PATCH] Support more suffix of docs Signed-off-by: Jael Gu --- osschat_insert.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/osschat_insert.py b/osschat_insert.py index 3893947..fc454a9 100644 --- a/osschat_insert.py +++ b/osschat_insert.py @@ -67,7 +67,16 @@ def _get_embedding_op(config): return False, ops.sentence_embedding.openai(model_name=config.embedding_model, api_key=config.openai_api_key) raise RuntimeError('Unknown model: [%s], only support: %s' % (config.embedding_model, _hf_models + _sbert_models + _openai_models)) - +def data_loader(path): + if path.endswith('pdf'): + op = ops.data_loader.pdf_loader() + elif path.endswith(('xls', 'xlsx')): + op = ops.data_loader.excel_loader() + elif path.endswith('ppt'): + op = ops.data_loader.powerpoint_loader() + else: + op = ops.text_loader() + return op(path) @AutoPipes.register def osschat_insert_pipe(config): @@ -91,7 +100,7 @@ def osschat_insert_pipe(config): p = ( pipe.input('doc', 'project_name') - .map('doc', 'text', ops.text_loader()) + .map('doc', 'text', data_loader) .flat_map('text', 'sentence', text_split_op) .map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config) )