logo
Browse Source

Add spliter

Signed-off-by: shiyu22 <shiyu.chen@zilliz.com>
main
shiyu22 1 year ago
parent
commit
54c754c330
  1. 5
      __init__.py
  2. 3
      requirements.txt
  3. 14
      spliter.py

5
__init__.py

@ -0,0 +1,5 @@
from .spliter import TextSpliter
def text_loader(*args, **kwargs):
    '''Build a TextSpliter operator.

    Every positional and keyword argument is forwarded unchanged to the
    TextSpliter constructor; the new instance is returned.
    '''
    spliter = TextSpliter(*args, **kwargs)
    return spliter

3
requirements.txt

@ -0,0 +1,3 @@
langchain>=0.0.151
unstructured
pdf2image

14
spliter.py

@ -0,0 +1,14 @@
from towhee.operator import PyOperator
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
class TextSpliter(PyOperator):
    '''Split a text string into a list of smaller text chunks.

    The splitting itself is delegated to langchain's
    RecursiveCharacterTextSplitter.
    '''

    def __init__(self, chunk_size: int = 300):
        '''Create the operator.

        Args:
            chunk_size: Forwarded as ``chunk_size`` to
                RecursiveCharacterTextSplitter.
        '''
        # Run the base-operator constructor so any state the PyOperator
        # hierarchy sets up exists on this instance.
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)

    def __call__(self, data: str) -> List[str]:
        '''Split ``data`` and return the chunks as plain strings.

        Args:
            data: The text to split.

        Returns:
            The ``page_content`` of every resulting document, converted
            to ``str``, in the order the splitter produced them.
        '''
        # Wrap the raw text into document objects via the splitter.
        texts = self.splitter.create_documents([data])
        # NOTE(review): create_documents presumably already splits the text,
        # so this second pass may be redundant — confirm against langchain
        # before simplifying; kept to preserve the existing output.
        docs = self.splitter.split_documents(texts)
        return [str(doc.page_content) for doc in docs]
Loading…
Cancel
Save