from typing import List, Optional

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from towhee.operator import PyOperator


class TextLoader(PyOperator):
    '''Load data from urls or files (paths or file-like objects) as a list of doc chunks.

    The operator reads every source into a Langchain ``Document``, splits the
    documents with a ``RecursiveCharacterTextSplitter`` and returns the text of
    each resulting chunk.
    '''

    def __init__(
        self,
        chunk_size: int = 300,
        source_type: str = 'file',
    ):
        '''Configure the splitter and the kind of source to read.

        Args:
            chunk_size: value passed as ``chunk_size`` to
                ``RecursiveCharacterTextSplitter``.
            source_type: ``'file'`` or ``'url'``; it is checked when the
                operator is called, not here.
        '''
        # Run the base-class initialiser so any state the operator base
        # sets up is present on this instance.
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
        self.source_type = source_type

    def __call__(self, data_src) -> List[str]:
        '''Load ``data_src`` and return the text of every chunk.

        Args:
            data_src: a single source or a list of sources. A non-list value
                is wrapped into a one-element list.

        Returns:
            The ``page_content`` of each split document, converted to ``str``.

        Raises:
            AttributeError: if ``self.source_type`` is neither ``'file'`` nor
                ``'url'``.
        '''
        if not isinstance(data_src, list):
            data_src = [data_src]

        if self.source_type == 'file':
            docs = self._from_files(data_src)
        elif self.source_type == 'url':
            docs = self._from_urls(data_src)
        else:
            raise AttributeError('Invalid source type. Only support "file" or "url".')

        chunks = self.splitter.split_documents(docs)
        return [str(chunk.page_content) for chunk in chunks]

    def _from_files(self, files: list, encoding: Optional[str] = None) -> List[Document]:
        '''Load documents from path or file-like object, return a list of unsplit Langchain Documents.

        Args:
            files: paths, or objects exposing a ``name`` attribute that holds
                the path to open.
            encoding: forwarded to ``open``; ``None`` leaves the choice to
                ``open``'s own default.

        Returns:
            One ``Document`` per input, with ``metadata['source']`` set to the
            path that was opened.
        '''
        docs = []
        for file in files:
            # Objects carrying a ``name`` are re-opened by that name;
            # anything else is treated as a path.
            file_path = file.name if hasattr(file, 'name') else file
            with open(file_path, encoding=encoding) as f:
                text = f.read()
            metadata = {"source": file_path}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

    def _from_urls(self, urls: List[str]) -> List[Document]:
        '''Load the given urls with ``UnstructuredURLLoader`` and return its documents.'''
        # Imported lazily so the dependency is only needed for url sources.
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=urls)
        return loader.load()