logo
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Readme
Files and versions

49 lines
1.8 KiB

import os
from typing import List, Optional

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from towhee.operator import PyOperator
class TextLoader(PyOperator):
    '''Load data from urls or files (paths or file-like objects) as a list of doc chunks.

    Each source is read into a Langchain ``Document``, the documents are split
    with a ``RecursiveCharacterTextSplitter`` and the text of every resulting
    chunk is returned as a plain string.

    Args:
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        source_type: ``'file'`` to read local files, ``'url'`` to load from URLs.
            The value is validated when the operator is called, not here.
    '''

    def __init__(self,
                 chunk_size: int = 300,
                 source_type: str = 'file',
                 ):
        # Let the base operator set up its own state before adding ours.
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
        self.source_type = source_type

    def __call__(self, data_src) -> List[str]:
        '''Load ``data_src`` and return the text of every chunk.

        Args:
            data_src: A single source or a list/tuple of sources. For
                ``source_type == 'file'`` each item is a path or an object with
                a ``name`` attribute; for ``'url'`` each item is a URL.

        Returns:
            The ``page_content`` of each split document, converted to ``str``.

        Raises:
            AttributeError: If ``self.source_type`` is neither ``'file'`` nor ``'url'``.
        '''
        # Normalise to a list so a single source and a batch share one code path.
        if isinstance(data_src, (list, tuple)):
            data_src = list(data_src)
        else:
            data_src = [data_src]

        if self.source_type == 'file':
            docs = self._from_files(data_src)
        elif self.source_type == 'url':
            docs = self._from_urls(data_src)
        else:
            raise AttributeError('Invalid source type. Only support "file" or "url".')

        docs = self.splitter.split_documents(docs)
        return [str(doc.page_content) for doc in docs]

    def _from_files(self, files: list, encoding: Optional[str] = None) -> List[Document]:
        '''Load documents from path or file-like object, return a list of unsplit Langchain Documents.

        Args:
            files: Paths (``str``, ``bytes`` or ``os.PathLike``) or objects exposing
                the path through a ``name`` attribute.
            encoding: Passed to ``open``; ``None`` keeps ``open``'s default.

        Returns:
            One ``Document`` per input, with the path stored under ``metadata["source"]``.
        '''
        docs = []
        for file in files:
            if isinstance(file, (str, bytes, os.PathLike)):
                # Path types must be checked before the ``name`` attribute:
                # ``pathlib.Path.name`` is only the final path component, so
                # using it would open the wrong file.
                file_path = os.fspath(file)
            elif hasattr(file, 'name'):
                # File-like object: reopen it by the path it reports.
                file_path = file.name
            else:
                # Anything else is handed to ``open`` unchanged.
                file_path = file
            with open(file_path, encoding=encoding) as f:
                text = f.read()
            metadata = {"source": file_path}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

    def _from_urls(self, urls: List[str]) -> List[Document]:
        '''Load documents from ``urls`` with Langchain's ``UnstructuredURLLoader``.

        Args:
            urls: URLs to load.

        Returns:
            The unsplit documents returned by the loader.
        '''
        # Imported here so the loader module is only imported when URLs are actually loaded.
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=urls)
        return loader.load()