towhee
/
text-splitter
copied
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Readme
Files and versions
15 lines
560 B
15 lines
560 B
from towhee.operator import PyOperator
|
|
from typing import List
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
|
|
class TextSpliter(PyOperator):
    '''Operator that breaks one input string into a list of text chunks.

    The work is delegated to a ``RecursiveCharacterTextSplitter`` that is
    built once in the constructor and reused on every call.
    '''

    def __init__(self, chunk_size: int = 300):
        '''Build the underlying splitter.

        Args:
            chunk_size: Forwarded unchanged as the ``chunk_size`` argument of
                ``RecursiveCharacterTextSplitter``.
        '''
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)

    def __call__(self, data: str) -> List[str]:
        '''Split ``data`` and return the text of every resulting chunk.

        Args:
            data: The text to split.

        Returns:
            The ``page_content`` of each document produced by the splitter,
            converted with ``str``, in the order the splitter yields them.
        '''
        # Wrap the raw string in a one-element list so the splitter can turn
        # it into document objects.
        seed_documents = self.splitter.create_documents([data])
        # NOTE(review): the documents above are passed through the splitter a
        # second time; presumably this pass is redundant -- confirm against
        # the installed LangChain version before simplifying.
        return [
            str(chunk.page_content)
            for chunk in self.splitter.split_documents(seed_documents)
        ]
|