From 15f161db9135f0e9afbbd65ff225074697424d6b Mon Sep 17 00:00:00 2001 From: Jael Gu Date: Tue, 29 Aug 2023 16:38:49 +0800 Subject: [PATCH] Support more file reader Signed-off-by: Jael Gu --- loader.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loader.py b/loader.py index 454854b..c38ea01 100644 --- a/loader.py +++ b/loader.py @@ -4,8 +4,9 @@ from typing import List, Optional class TextLoader(PyOperator): '''Load data from url or file (paths or file-like objects).''' - def __init__(self) -> None: + def __init__(self, **kwargs) -> None: super().__init__() + self.unstructured_kwargs = kwargs def __call__(self, data_src) -> List[str]: if data_src.startswith('http'): @@ -20,9 +21,10 @@ class TextLoader(PyOperator): file_path = file.name else: file_path = file - with open(file_path, encoding=encoding) as f: - text = f.read() - return text + from langchain.document_loaders import UnstructuredFileLoader + loader = UnstructuredFileLoader(file_path, mode='single', strategy='fast') + doc = loader.load()[0] + return doc.page_content def _from_url(self, url: str) -> str: from langchain.document_loaders import UnstructuredURLLoader