logo
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Readme
Files and versions

42 lines
1.5 KiB

from towhee.operator import PyOperator
from typing import List, Optional
class TextLoader(PyOperator):
    '''Load the text content of a URL or of a local file.

    The source handed to ``__call__`` may be an ``http://`` / ``https://`` URL
    string, a file path (``str`` or ``os.PathLike``), or a file-like object
    exposing a ``name`` attribute. Loading is delegated to langchain's
    ``UnstructuredURLLoader`` and ``UnstructuredFileLoader``.
    '''

    def __init__(self, **kwargs) -> None:
        '''Create the operator and keep the given keyword arguments.

        Args:
            **kwargs: Stored as-is on ``self.unstructured_kwargs``.
        '''
        super().__init__()
        # NOTE(review): stored but never read inside this class; presumably
        # meant to be forwarded to the Unstructured loaders -- confirm before
        # wiring it through.
        self.unstructured_kwargs = kwargs

    def __call__(self, data_src) -> str:
        '''Load ``data_src`` and return its text.

        Args:
            data_src: URL string, file path, or file-like object with ``name``.

        Returns:
            The ``page_content`` of the first document produced by the loader.

        Raises:
            RuntimeError: If the loader returns no documents.
        '''
        # Only strings can be URLs. Checking the type first keeps file-like
        # and path-like objects from failing on a missing ``startswith``, and
        # matching the full scheme keeps local files whose name merely begins
        # with "http" on the file path.
        if isinstance(data_src, str) and data_src.startswith(('http://', 'https://')):
            return self._from_url(data_src)
        return self._from_file(data_src)

    @staticmethod
    def _first_page_content(data, source) -> str:
        '''Return ``page_content`` of the first loaded document, or raise if there is none.'''
        if not data:
            raise RuntimeError(f'Failed to load data from {source}. Invalid output: {data}')
        return data[0].page_content

    def _from_file(self, file, encoding: Optional[str] = None) -> str:
        '''Load a local file and return the text of the first loaded document.

        Args:
            file: File path (``str`` or ``os.PathLike``) or a file-like object;
                for a file-like object its ``name`` attribute is used as path.
            encoding: Accepted for interface compatibility; not used here.

        Returns:
            The ``page_content`` of the first document returned by the loader.

        Raises:
            RuntimeError: If the loader returns no documents.
        '''
        import os
        from langchain.document_loaders import UnstructuredFileLoader

        if isinstance(file, (str, os.PathLike)):
            # ``pathlib.Path`` also has ``.name`` (the basename only), so paths
            # must be resolved before the file-like fallback below.
            file_path = os.fspath(file)
        else:
            file_path = getattr(file, 'name', file)

        loader = UnstructuredFileLoader(file_path, mode='single', strategy='fast')
        return self._first_page_content(loader.load(), file)

    def _from_url(self, url: str) -> str:
        '''Fetch ``url`` and return the text of the first loaded document.

        Args:
            url: The URL to load.

        Returns:
            The ``page_content`` of the first document returned by the loader.

        Raises:
            RuntimeError: If the loader returns no documents.
        '''
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=[url])
        return self._first_page_content(loader.load(), url)