# Post-patch contents of loader.py, reconstructed from the whitespace-collapsed
# patch 0f53700ceecc75897877940a5ae5be243571b8a4 ("Update loader",
# Signed-off-by: shiyu22, Fri, 26 May 2023 15:00:00 +0800).
# The same patch also creates requirements.txt containing the single line:
#     langchain

import os
from typing import List, Optional

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from towhee.operator import PyOperator


class TextLoader(PyOperator):
    """Load data from urls or files (paths or file-like objects) as a list of doc chunks.

    Args:
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            its ``chunk_size``.
        source_type: Either ``'file'`` or ``'url'``; selects how the inputs
            given to ``__call__`` are loaded.
    """

    def __init__(self,
                 chunk_size: int = 300,
                 source_type: str = 'file',
                 ):
        # The pre-patch operator delegated to the base initialiser; keep doing
        # so, so that any state PyOperator sets up in __init__ exists.
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
        self.source_type = source_type

    def __call__(self, data_src) -> List[str]:
        """Load ``data_src`` and return the text of every resulting chunk.

        Args:
            data_src: A single source, or a list/tuple of sources. For
                ``source_type == 'file'`` each source is a path or an object
                with a ``name`` attribute; for ``'url'`` each source is a URL.

        Returns:
            The ``page_content`` of each split document, as strings.

        Raises:
            AttributeError: If ``self.source_type`` is neither ``'file'`` nor
                ``'url'``.
        """
        # Accept a tuple as a batch too; any other value is one single source.
        if isinstance(data_src, (list, tuple)):
            data_src = list(data_src)
        else:
            data_src = [data_src]

        if self.source_type == 'file':
            docs = self._from_files(data_src)
        elif self.source_type == 'url':
            docs = self._from_urls(data_src)
        else:
            raise AttributeError('Invalid source type. Only support "file" or "url".')

        docs = self.splitter.split_documents(docs)
        return [str(doc.page_content) for doc in docs]

    def _from_files(self, files: list, encoding: Optional[str] = None) -> List[Document]:
        """Load documents from paths or file-like objects.

        Args:
            files: Paths (``str`` / ``os.PathLike``) or objects exposing the
                path to read through a ``name`` attribute.
            encoding: Passed to ``open``; ``None`` keeps ``open``'s default.

        Returns:
            One unsplit Langchain ``Document`` per input, with the path stored
            under the ``"source"`` metadata key.
        """
        docs = []
        for file in files:
            # Resolve real paths first: pathlib.Path also has a ``name``
            # attribute, but it is only the basename, not an openable path.
            if isinstance(file, (str, os.PathLike)):
                file_path = os.fspath(file)
            elif hasattr(file, 'name'):
                # File-like object: it is re-opened by name, not read directly.
                file_path = file.name
            else:
                file_path = file
            with open(file_path, encoding=encoding) as f:
                text = f.read()
            metadata = {"source": file_path}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

    def _from_urls(self, urls: List[str]) -> List[Document]:
        """Load documents from ``urls`` with Langchain's ``UnstructuredURLLoader``.

        Args:
            urls: URLs to fetch.

        Returns:
            The documents returned by the loader's ``load()``.
        """
        # Imported at call time, as in the original; presumably so the 'file'
        # path does not need the URL loader's dependencies -- TODO confirm.
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=urls)
        docs = loader.load()
        return docs