def data_loader(path):
    """Load the document at ``path`` with a loader chosen by file extension.

    The extension is matched case-insensitively and must include the leading
    dot, so ``report.PDF`` is routed to the PDF loader while a name that
    merely ends in the letters ``pdf`` is not.

    Args:
        path: Location of the document as a string. It is passed unchanged
            to the selected loader operator.

    Returns:
        Whatever the selected loader operator returns when called with
        ``path``.
    """
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        op = ops.data_loader.pdf_loader()
    elif lowered.endswith(('.xls', '.xlsx')):
        # Both the legacy and the current Excel extensions share one loader.
        op = ops.data_loader.excel_loader()
    elif lowered.endswith(('.ppt', '.pptx')):
        # Both the legacy and the current PowerPoint extensions share one loader.
        op = ops.data_loader.powerpoint_loader()
    else:
        # Anything unrecognised falls back to the plain text loader.
        op = ops.text_loader()
    return op(path)