towhee
/
text-loader
copied
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Readme
Files and versions
32 lines
1.0 KiB
32 lines
1.0 KiB
from towhee.operator import PyOperator
|
|
from typing import List, Optional
|
|
|
|
|
|
class TextLoader(PyOperator):
    '''Load text from a URL or from a local file (path or file-like object).'''

    def __init__(self) -> None:
        super().__init__()

    def __call__(self, data_src) -> str:
        '''Load the text behind `data_src`.

        Args:
            data_src: An http(s) URL string, a file path, or a file-like object
                whose `name` attribute holds the path of the file to read.

        Returns:
            The loaded text as a single string.
        '''
        # Only strings can be URLs. File-like objects have no `startswith`, so
        # check the type first instead of failing with AttributeError. Matching
        # the full scheme keeps local files such as 'http_notes.txt' on the
        # file branch.
        if isinstance(data_src, str) and data_src.startswith(('http://', 'https://')):
            return self._from_url(data_src)
        return self._from_file(data_src)

    def _from_file(self, file, encoding: Optional[str] = None) -> str:
        '''Read the whole content of a text file.

        Args:
            file: A path (str or path-like object) or a file-like object whose
                `name` attribute is the path to read. A file-like object is
                reopened by name; its own read position is not used.
            encoding: Text encoding passed to `open`; None lets `open` pick its
                default.

        Returns:
            The file content as one string.
        '''
        # Path-like objects (e.g. pathlib.Path) also expose `name`, but there it
        # is only the final path component, so they must be passed through as-is.
        is_path = isinstance(file, (str, bytes)) or hasattr(file, '__fspath__')
        if not is_path and hasattr(file, 'name'):
            file_path = file.name
        else:
            file_path = file
        with open(file_path, encoding=encoding) as f:
            return f.read()

    def _from_url(self, url: str) -> str:
        '''Load `url` through langchain's UnstructuredURLLoader and return its text.'''
        # Imported inside the method so langchain is only needed for URL input.
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=[url])
        # A single URL was requested, so only the first loaded document is used.
        # NOTE(review): the indexing raises IndexError if the loader returns an
        # empty result (presumably when the fetch fails) - confirm desired handling.
        doc = loader.load()[0]
        return doc.page_content
|