towhee
/
text-splitter
copied
3 changed files with 22 additions and 0 deletions
@ -0,0 +1,5 @@ |
|||||
|
from .spliter import TextSpliter |
||||
|
|
||||
|
|
||||
|
def text_loader(*args, **kwargs):
    '''Build and return a TextSpliter operator.

    All positional and keyword arguments are passed through unchanged
    to the TextSpliter constructor.
    '''
    operator = TextSpliter(*args, **kwargs)
    return operator
@ -0,0 +1,3 @@ |
|||||
|
langchain>=0.0.151 |
||||
|
unstructured |
||||
|
pdf2image |
@ -0,0 +1,14 @@ |
|||||
|
from towhee.operator import PyOperator |
||||
|
from typing import List |
||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
||||
|
|
||||
|
|
||||
|
class TextSpliter(PyOperator):
    '''Split a text string into a list of smaller text chunks.

    The splitting itself is delegated to langchain's
    RecursiveCharacterTextSplitter.

    Args:
        chunk_size: Value forwarded as ``chunk_size`` to
            RecursiveCharacterTextSplitter. Defaults to 300.
    '''

    def __init__(self, chunk_size: int = 300):
        # Let the base operator run its own initialisation before adding state.
        super().__init__()
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)

    def __call__(self, data: str) -> List[str]:
        '''Split ``data`` and return the chunk contents as strings.

        Args:
            data: The text to split.

        Returns:
            One string per chunk produced by the splitter, in order.
        '''
        texts = self.splitter.create_documents([data])
        # NOTE(review): create_documents appears to chunk the text already, so
        # this second pass may be redundant; it is kept to preserve the
        # original behavior -- confirm against langchain before removing.
        docs = self.splitter.split_documents(texts)
        return [str(doc.page_content) for doc in docs]
Loading…
Reference in new issue