towhee
/
text-loader
copied
2 changed files with 44 additions and 10 deletions
@ -1,16 +1,49 @@ |
|||
from towhee.operator import PyOperator |
|||
from typing import List, Optional |
|||
from langchain.docstore.document import Document |
|||
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|||
|
|||
|
|||
|
|||
class TextLoader(PyOperator):
    '''Load data from urls or files (paths or file-like objects) as a list of doc chunks.

    The operator reads every source into a Langchain ``Document``, splits the
    documents with a ``RecursiveCharacterTextSplitter`` and returns the text of
    each resulting chunk.

    Args:
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as its
            ``chunk_size`` argument.
        source_type: How the inputs are interpreted: ``'file'`` or ``'url'``.
            The value is only validated when the operator is called.
    '''

    def __init__(self,
                 chunk_size: int = 300,
                 source_type: str = 'file',
                 ):
        # The previous revision of this operator initialised the base class
        # explicitly; keep doing so now that the constructor takes arguments.
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
        self.source_type = source_type

    def __call__(self, data_src) -> List[str]:
        '''Load ``data_src`` and return the text of every chunk.

        Args:
            data_src: A single source or a list of sources. A non-list value
                is wrapped into a one-element list.

        Returns:
            The ``page_content`` of each chunk, converted to ``str``.

        Raises:
            AttributeError: If ``self.source_type`` is neither ``'file'`` nor
                ``'url'``.
        '''
        if not isinstance(data_src, list):
            data_src = [data_src]

        if self.source_type == 'file':
            docs = self._from_files(data_src)
        elif self.source_type == 'url':
            docs = self._from_urls(data_src)
        else:
            # Exception type kept as-is so existing callers that catch it
            # continue to work.
            raise AttributeError('Invalid source type. Only support "file" or "url".')

        docs = self.splitter.split_documents(docs)
        return [str(doc.page_content) for doc in docs]

    def _from_files(self, files: list, encoding: Optional[str] = None) -> List[Document]:
        '''Load documents from path or file-like object, return a list of unsplit Langchain Documents.

        Args:
            files: Paths, or objects exposing a ``name`` attribute.
            encoding: Passed to ``open``; ``None`` uses the platform default.

        Returns:
            One ``Document`` per input, holding the whole file text and a
            ``"source"`` metadata entry with the path that was opened.
        '''
        docs = []
        for file in files:
            # Objects with a ``name`` attribute are re-opened by that name
            # rather than read through the object itself.
            file_path = file.name if hasattr(file, 'name') else file
            with open(file_path, encoding=encoding) as f:
                text = f.read()
            metadata = {"source": file_path}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

    def _from_urls(self, urls: List[str]) -> List[Document]:
        '''Load documents from ``urls`` with Langchain's ``UnstructuredURLLoader``.

        Args:
            urls: The URLs handed to the loader.

        Returns:
            Whatever ``UnstructuredURLLoader.load()`` returns for ``urls``.
        '''
        # Imported lazily so the loader is only required for the 'url'
        # source type.
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=urls)
        docs = loader.load()
        return docs
@ -0,0 +1 @@ |
|||
langchain |
Loading…
Reference in new issue