|
|
|
from towhee.operator import PyOperator
|
|
|
|
from typing import List, Optional
|
|
|
|
from langchain.docstore.document import Document
|
|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TextLoader(PyOperator):
    '''Load data from urls or files (paths or file-like objects) as a list of doc chunks.

    Args:
        chunk_size: Maximum characters per chunk handed to the recursive splitter.
        source_type: Where the input comes from — 'file' (paths / file-like
            objects) or 'url'.
    '''

    def __init__(self,
                 chunk_size: int = 300,
                 source_type: str = 'file',
                 ):
        # Bug fix: the original skipped base-class initialization entirely.
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
        self.source_type = source_type

    def __call__(self, data_src) -> List[str]:
        '''Load the source(s), split into chunks, and return the chunk texts.

        Args:
            data_src: A single source or a list of sources (file paths,
                file-like objects, or urls, depending on ``source_type``).

        Returns:
            A list of chunk strings (``page_content`` of each split Document).

        Raises:
            AttributeError: If ``source_type`` is neither 'file' nor 'url'.
        '''
        # Accept a single source as well as a list of sources.
        if not isinstance(data_src, list):
            data_src = [data_src]

        if self.source_type == 'file':
            docs = self._from_files(data_src)
        elif self.source_type == 'url':
            docs = self._from_urls(data_src)
        else:
            raise AttributeError('Invalid source type. Only support "file" or "url".')

        docs = self.splitter.split_documents(docs)
        return [str(doc.page_content) for doc in docs]

    def _from_files(self, files: list, encoding: Optional[str] = None) -> List[Document]:
        '''Load documents from path or file-like object, return a list of unsplit Langchain Documents'''
        docs = []
        for file in files:
            # File-like objects (e.g. open handles, tempfiles) expose the
            # underlying path via their ``name`` attribute.
            file_path = file.name if hasattr(file, 'name') else file
            with open(file_path, encoding=encoding) as f:
                text = f.read()
            docs.append(Document(page_content=text, metadata={"source": file_path}))
        return docs

    def _from_urls(self, urls: List[str]) -> List[Document]:
        '''Fetch each url and return a list of unsplit Langchain Documents.'''
        # Imported lazily: UnstructuredURLLoader pulls in the optional
        # 'unstructured' dependency, only needed for the url source type.
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=urls)
        return loader.load()
|