|
|
|
from towhee.operator import PyOperator
|
|
|
|
from typing import List, Optional
|
|
|
|
|
|
|
|
|
|
|
|
class TextLoader(PyOperator):
    '''Load text from a URL or from a local file (path or file-like object).

    Calling the operator with one data source returns the ``page_content`` of
    the first document produced by the matching Langchain "unstructured"
    loader.
    '''

    def __init__(self, **kwargs) -> None:
        '''Create the operator.

        Args:
            **kwargs: Extra options, kept on ``self.unstructured_kwargs``.
        '''
        super().__init__()
        # NOTE(review): these options are only stored; neither loader call in
        # this class receives them. Confirm whether they should be forwarded.
        self.unstructured_kwargs = kwargs

    def __call__(self, data_src) -> str:
        '''Load the text behind ``data_src``.

        Args:
            data_src: An ``http://`` / ``https://`` URL string, a file path,
                or a file-like object exposing a ``name`` attribute.

        Returns:
            The text content of the first loaded document.

        Raises:
            RuntimeError: If the loader returns no documents.
        '''
        # Only strings can be URLs. Checking the type first keeps file-like
        # objects (which have no ``startswith``) from raising AttributeError,
        # and requiring the full scheme keeps local names such as
        # 'http_dump.txt' from being mistaken for a URL.
        is_url = isinstance(data_src, str) and data_src.startswith(('http://', 'https://'))
        if is_url:
            return self._from_url(data_src)
        return self._from_file(data_src)

    def _from_file(self, file, encoding: Optional[str] = None) -> str:
        '''Load a path or file-like object and return the first document's text.

        Args:
            file: A path (``str`` or ``os.PathLike``) or an object whose
                ``name`` attribute holds the path to read.
            encoding: Accepted for interface compatibility; currently unused.

        Returns:
            The ``page_content`` of the first document the loader yields.

        Raises:
            RuntimeError: If the loader returns no documents.
        '''
        import os

        from langchain.document_loaders import UnstructuredFileLoader

        if isinstance(file, (str, os.PathLike)):
            # Paths are resolved first: pathlib.Path also has a ``name``
            # attribute, but it is only the basename and would lose the
            # directory part.
            file_path = os.fspath(file)
        elif hasattr(file, 'name'):
            file_path = file.name
        else:
            file_path = file

        loader = UnstructuredFileLoader(file_path, mode='single', strategy='fast')
        return self._first_page_content(loader.load(), file)

    def _from_url(self, url: str) -> str:
        '''Fetch ``url`` and return the first document's text.

        Args:
            url: The address handed to ``UnstructuredURLLoader``.

        Returns:
            The ``page_content`` of the first document the loader yields.

        Raises:
            RuntimeError: If the loader returns no documents.
        '''
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=[url])
        return self._first_page_content(loader.load(), url)

    @staticmethod
    def _first_page_content(data, source) -> str:
        '''Return ``data[0].page_content`` or raise if ``data`` is empty.'''
        if not data:
            raise RuntimeError(f'Failed to load data from {source}. Invalid output: {data}')
        return data[0].page_content
|