import os
from typing import List, Optional

from towhee.operator import PyOperator


class TextLoader(PyOperator):
    '''Load text from a URL or from a file (path or file-like object).'''

    def __init__(self) -> None:
        super().__init__()

    def __call__(self, data_src) -> str:
        '''Return the text content of ``data_src``.

        Args:
            data_src: An ``http://`` / ``https://`` URL string, a filesystem
                path (``str`` or ``os.PathLike``), or an object exposing a
                ``name`` attribute that holds a path (e.g. an open file).

        Returns:
            The loaded text as a single string.
        '''
        # Only real strings can be URLs; file-like objects have no
        # ``startswith`` and must go straight to the file loader.  Matching
        # the full scheme keeps local files such as ``http_notes.txt`` from
        # being mistaken for URLs.
        if isinstance(data_src, str) and data_src.startswith(('http://', 'https://')):
            return self._from_url(data_src)
        return self._from_file(data_src)

    def _from_file(self, file, encoding: Optional[str] = None) -> str:
        '''Read and return the whole text content of a file.

        Args:
            file: A path (``str`` or ``os.PathLike``) or an object whose
                ``name`` attribute is the path to open.
            encoding: Text encoding passed to ``open``; ``None`` lets
                ``open`` pick its default.

        Returns:
            The file content as one unsplit string.
        '''
        if isinstance(file, (str, os.PathLike)):
            # Path objects also expose ``.name``, but that is only the final
            # path component, so path-like inputs must be used as-is.
            file_path = file
        else:
            # File-like objects are re-opened by name; anything else is
            # handed to ``open`` unchanged so it can accept or reject it.
            file_path = getattr(file, 'name', file)
        with open(file_path, encoding=encoding) as f:
            return f.read()

    def _from_url(self, url: str) -> str:
        '''Fetch ``url`` and return the text of the first loaded document.

        Args:
            url: The address to load.

        Returns:
            The ``page_content`` of the first document the loader produced.

        Raises:
            IndexError: If the loader returned no documents for ``url``.
        '''
        # Imported lazily so langchain is only required when URLs are loaded.
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=[url])
        docs = loader.load()
        if not docs:
            # Same exception type as indexing an empty list, but with a
            # message that tells the caller which URL yielded nothing.
            raise IndexError(f'No content could be loaded from URL: {url}')
        return docs[0].page_content