towhee
/
text-splitter
copied
3 changed files with 22 additions and 0 deletions
@ -0,0 +1,5 @@ |
|||
from .spliter import TextSpliter |
|||
|
|||
|
|||
def text_loader(*args, **kwargs):
    '''Build a TextSpliter operator.

    All positional and keyword arguments are forwarded unchanged to the
    TextSpliter constructor.

    Returns:
        A new TextSpliter instance.
    '''
    return TextSpliter(*args, **kwargs)


# NOTE(review): this repository is named "text-splitter", yet the factory is
# called text_loader, which looks like a leftover from a loader operator.
# towhee presumably resolves the factory by the repository name, so the
# matching name is exposed here as well; text_loader is kept so existing
# callers continue to work. Confirm against the towhee operator loader.
text_splitter = text_loader
@ -0,0 +1,3 @@ |
|||
langchain>=0.0.151 |
|||
unstructured |
|||
pdf2image |
@ -0,0 +1,14 @@ |
|||
from towhee.operator import PyOperator |
|||
from typing import List |
|||
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|||
|
|||
|
|||
class TextSpliter(PyOperator):
    '''Split a text string into a list of smaller text chunks.

    Args:
        chunk_size: Forwarded as ``chunk_size`` to langchain's
            RecursiveCharacterTextSplitter. Defaults to 300.
    '''

    def __init__(self, chunk_size: int = 300):
        # Run the PyOperator initialiser so that any state the base class
        # sets up is present on this instance.
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)

    def __call__(self, data: str) -> List[str]:
        '''Split ``data`` and return the text of each resulting chunk.

        Args:
            data: The text to split.

        Returns:
            The ``page_content`` of every document produced by the splitter,
            each converted to ``str``, in the order the splitter returned
            them.
        '''
        texts = self.splitter.create_documents([data])
        # NOTE(review): create_documents presumably already chunks the text,
        # which would make this second pass redundant. It is kept so the
        # output stays identical to the existing behaviour; confirm before
        # removing.
        docs = self.splitter.split_documents(texts)
        return [str(doc.page_content) for doc in docs]
Loading…
Reference in new issue