diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..a39f158
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,6 @@
+from .spliter import TextSpliter
+
+
+def text_loader(*args, **kwargs):
+    '''Create a TextSpliter, forwarding all arguments to its constructor.'''
+    return TextSpliter(*args, **kwargs)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d598e50
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+langchain>=0.0.151
+unstructured
+pdf2image
diff --git a/spliter.py b/spliter.py
new file mode 100644
index 0000000..49eb881
--- /dev/null
+++ b/spliter.py
@@ -0,0 +1,30 @@
+from towhee.operator import PyOperator
+from typing import List
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+class TextSpliter(PyOperator):
+    '''Split a text string into a list of chunk strings.
+
+    Chunking is delegated to langchain's RecursiveCharacterTextSplitter.
+    '''
+
+    def __init__(self, chunk_size: int = 300):
+        '''Create the underlying splitter.
+
+        Args:
+            chunk_size: Forwarded to RecursiveCharacterTextSplitter as its
+                ``chunk_size`` argument.
+        '''
+        # Initialise the PyOperator base class first; skipping it leaves any
+        # attributes the base initialiser sets up missing on this instance.
+        super().__init__()
+        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
+
+    def __call__(self, data: str) -> List[str]:
+        '''Split ``data`` and return the text content of every chunk.'''
+        texts = self.splitter.create_documents([data])
+        # NOTE(review): create_documents already chunks the text, so this
+        # second pass looks redundant -- confirm before simplifying.
+        docs = self.splitter.split_documents(texts)
+        return [str(doc.page_content) for doc in docs]