import os

from towhee.operator import PyOperator
from typing import List, Optional


class TextLoader(PyOperator):
    '''Load text content from a URL or a file (path or file-like object).'''

    def __init__(self, **kwargs) -> None:
        super().__init__()
        # NOTE(review): these kwargs are stored but not forwarded to either
        # loader below -- confirm whether they were meant to be passed through.
        self.unstructured_kwargs = kwargs

    def __call__(self, data_src) -> str:
        '''
        Load the text content of `data_src`.

        Args:
            data_src: an http(s) URL string, a local path (str or path-like),
                or a file-like object exposing a `name` attribute.

        Returns:
            The `page_content` of the first document produced by the loader.

        Raises:
            RuntimeError: if the loader returns no documents.
        '''
        # Only strings can be URLs. Checking the type first keeps file-like and
        # path-like inputs (which have no `startswith`) from raising here, and
        # matching the full scheme keeps local files such as 'httpd.conf' from
        # being treated as URLs.
        if isinstance(data_src, str) and data_src.startswith(('http://', 'https://')):
            return self._from_url(data_src)
        return self._from_file(data_src)

    def _from_file(self, file, encoding: Optional[str] = None) -> str:
        '''
        Load a local file and return the text of the first loaded document.

        Args:
            file: a path (str or path-like) or a file-like object with a
                `name` attribute holding its path.
            encoding: currently unused; kept for interface compatibility.

        Returns:
            The `page_content` of the first document returned by
            `UnstructuredFileLoader` (mode='single', strategy='fast').

        Raises:
            RuntimeError: if the loader returns no documents.
        '''
        if isinstance(file, (str, os.PathLike)):
            # Path objects also expose `.name`, but that is only the final
            # path component; convert the whole path instead.
            file_path = os.fspath(file)
        elif hasattr(file, 'name'):
            file_path = file.name
        else:
            file_path = file

        # Imported lazily so the dependency is only needed when loading.
        from langchain.document_loaders import UnstructuredFileLoader
        loader = UnstructuredFileLoader(file_path, mode='single', strategy='fast')
        return self._first_page_content(loader.load(), file)

    def _from_url(self, url: str) -> str:
        '''
        Load a URL and return the text of the first loaded document.

        Args:
            url: the URL to fetch.

        Returns:
            The `page_content` of the first document returned by
            `UnstructuredURLLoader`.

        Raises:
            RuntimeError: if the loader returns no documents.
        '''
        # Imported lazily so the dependency is only needed when loading.
        from langchain.document_loaders import UnstructuredURLLoader
        loader = UnstructuredURLLoader(urls=[url])
        return self._first_page_content(loader.load(), url)

    @staticmethod
    def _first_page_content(data, source) -> str:
        '''Return `page_content` of the first item in `data`, or raise if empty.'''
        if not data:
            raise RuntimeError(f'Failed to load data from {source}. Invalid output: {data}')
        return data[0].page_content