From 54c754c330915c34f79fe188dc13c176fd49545c Mon Sep 17 00:00:00 2001
From: shiyu22
Date: Tue, 30 May 2023 12:38:13 +0800
Subject: [PATCH] Add spliter

Signed-off-by: shiyu22
---
 __init__.py      |  6 ++++++
 requirements.txt |  3 +++
 spliter.py       | 31 +++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)
 create mode 100644 __init__.py
 create mode 100644 requirements.txt
 create mode 100644 spliter.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..a39f158
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,6 @@
+from .spliter import TextSpliter
+
+
+def text_loader(*args, **kwargs):
+    '''Create a TextSpliter, forwarding all arguments to its constructor.'''
+    return TextSpliter(*args, **kwargs)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d598e50
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+langchain>=0.0.151
+unstructured
+pdf2image
diff --git a/spliter.py b/spliter.py
new file mode 100644
index 0000000..49eb881
--- /dev/null
+++ b/spliter.py
@@ -0,0 +1,31 @@
+from towhee.operator import PyOperator
+from typing import List
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+class TextSpliter(PyOperator):
+    '''Split a text string into a list of chunk strings.
+
+    Wraps langchain's RecursiveCharacterTextSplitter.
+
+    Args:
+        chunk_size: Passed to the splitter as its ``chunk_size``.
+    '''
+
+    def __init__(self, chunk_size: int = 300):
+        # Initialise the PyOperator base class before adding our own state.
+        super().__init__()
+        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
+
+    def __call__(self, data: str) -> List[str]:
+        '''Split ``data`` and return the resulting chunks.
+
+        Args:
+            data: The text to split.
+
+        Returns:
+            The chunk strings in the order the splitter produces them.
+        '''
+        # split_text returns the chunk strings directly; wrapping them in
+        # Document objects and splitting those again is unnecessary.
+        return self.splitter.split_text(data)