diff --git a/README.md b/README.md index e7d5b08..2c55b43 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,7 @@ ### Description -**Text loader** is used to load files and split them into text lists. It supports loading local files (with file path), or web links (with url). - -> Refer to [Recursive Characters](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/recursive_text_splitter.html) for the operation of splitting text. +**Text loader** is used to load a text file. It supports loading data from a URL or a file path (file formats: .md or .txt).
@@ -23,15 +21,15 @@ from towhee import pipe, ops, DataCollection p = ( pipe.input('url') - .flat_map('url', 'text', ops.text_loader(source_type='url')) + .map('url', 'text', ops.text_loader()) .output('url', 'text') ) -res = p('https://docs.towhee.io/Getting%20Started/create-pipeline/') +res = p('https://github.com/towhee-io/towhee/blob/main/README.md') DataCollection(res).show() ``` -result +result
@@ -41,17 +39,7 @@ DataCollection(res).show() Create the operator via the following factory method -***towhee.text_loader(chunk_size=300, source_type='file')*** - -**Parameters:** - -​ ***chunk_size***: int - -​ The size of each chunk, defaults to 300. - -​ ***source_type***: str - -​ The type of the soure, defaults to 'file', you can also set to 'url' for you url of your documentation. +***towhee.text_loader()***
@@ -69,7 +57,7 @@ The operator load the documentation, then split incoming the text and return chu -**Return**: List[Document] +**Return**: str -A list of the chunked document. +String data with the text. diff --git a/loader.py b/loader.py index 75e66bc..454854b 100644 --- a/loader.py +++ b/loader.py @@ -1,48 +1,32 @@ from towhee.operator import PyOperator from typing import List, Optional -from langchain.docstore.document import Document -from langchain.text_splitter import RecursiveCharacterTextSplitter class TextLoader(PyOperator): - '''Load data from urls or files (paths or file-like objects) as a list of doc chunks''' - def __init__(self, - chunk_size: int = 300, - source_type: str = 'file', - ): - self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size) - self.source_type = source_type - + '''Load data from url or file (paths or file-like objects).''' + def __init__(self) -> None: + super().__init__() + def __call__(self, data_src) -> List[str]: - if not isinstance(data_src, list): - data_src = [data_src] - if self.source_type == 'file': - docs = self._from_files(data_src) - elif self.source_type == 'url': - docs= self._from_urls(data_src) + if data_src.startswith('http'): + docs= self._from_url(data_src) else: - raise AttributeError('Invalid source type. 
Only support "file" or "url".') - - docs = self.splitter.split_documents(docs) - return [str(doc.page_content) for doc in docs] + docs = self._from_file(data_src) + return docs - def _from_files(self, files: list, encoding: Optional[str] = None) -> List[Document]: + def _from_file(self, file, encoding: Optional[str] = None) -> str: '''Load documents from path or file-like object, return a list of unsplit Langchain Documents''' - docs = [] - for file in files: - if hasattr(file, 'name'): - file_path = file.name - else: - file_path = file - with open(file_path, encoding=encoding) as f: - text = f.read() - metadata = {"source": file_path} - docs.append(Document(page_content=text, metadata=metadata)) - return docs + if hasattr(file, 'name'): + file_path = file.name + else: + file_path = file + with open(file_path, encoding=encoding) as f: + text = f.read() + return text - def _from_urls(self, urls: List[str]) -> List[Document]: + def _from_url(self, url: str) -> str: from langchain.document_loaders import UnstructuredURLLoader - loader = UnstructuredURLLoader(urls=urls) - docs = loader.load() - return docs \ No newline at end of file + loader = UnstructuredURLLoader(urls=[url]) + doc = loader.load()[0] + return doc.page_content \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 391376e..d598e50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ langchain>=0.0.151 unstructured +pdf2image diff --git a/result.png b/result.png index 4842e99..9d784cd 100644 Binary files a/result.png and b/result.png differ