diff --git a/loader.py b/loader.py
index 454854b..c38ea01 100644
--- a/loader.py
+++ b/loader.py
@@ -4,8 +4,10 @@ from typing import List, Optional
 
 class TextLoader(PyOperator):
     '''Load data from url or file (paths or file-like objects).'''
-    def __init__(self) -> None:
+    def __init__(self, **kwargs) -> None:
         super().__init__()
+        # Extra options forwarded to UnstructuredFileLoader when reading files.
+        self.unstructured_kwargs = kwargs
 
     def __call__(self, data_src) -> List[str]:
         if data_src.startswith('http'):
@@ -20,9 +22,12 @@ class TextLoader(PyOperator):
             file_path = file.name
         else:
             file_path = file
-        with open(file_path, encoding=encoding) as f:
-            text = f.read()
-        return text
+        from langchain.document_loaders import UnstructuredFileLoader
+        # Defaults come first so options given to the constructor can override them.
+        loader_kwargs = {'mode': 'single', 'strategy': 'fast', **self.unstructured_kwargs}
+        loader = UnstructuredFileLoader(file_path, **loader_kwargs)
+        # Join every returned document so modes other than 'single' keep all of the text.
+        return '\n\n'.join(doc.page_content for doc in loader.load())
 
     def _from_url(self, url: str) -> str:
         from langchain.document_loaders import UnstructuredURLLoader