towhee / text-loader
4 changed files with 28 additions and 55 deletions
@@ -1,48 +1,32 @@
 from towhee.operator import PyOperator
 from typing import List, Optional
-from langchain.docstore.document import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter


 class TextLoader(PyOperator):
-    '''Load data from urls or files (paths or file-like objects) as a list of doc chunks'''
-    def __init__(self,
-                 chunk_size: int = 300,
-                 source_type: str = 'file',
-                 ):
-        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
-        self.source_type = source_type
+    '''Load data from url or file (paths or file-like objects).'''
+    def __init__(self) -> None:
+        super().__init__()

     def __call__(self, data_src) -> List[str]:
-        if not isinstance(data_src, list):
-            data_src = [data_src]
-        if self.source_type == 'file':
-            docs = self._from_files(data_src)
-        elif self.source_type == 'url':
-            docs = self._from_urls(data_src)
+        if data_src.startswith('http'):
+            docs = self._from_url(data_src)
         else:
-            raise AttributeError('Invalid source type. Only support "file" or "url".')
-
-        docs = self.splitter.split_documents(docs)
-        return [str(doc.page_content) for doc in docs]
+            docs = self._from_file(data_src)
+        return docs

-    def _from_files(self, files: list, encoding: Optional[str] = None) -> List[Document]:
-        '''Load documents from path or file-like object, return a list of unsplit Langchain Documents'''
-        docs = []
-        for file in files:
-            if hasattr(file, 'name'):
-                file_path = file.name
-            else:
-                file_path = file
-            with open(file_path, encoding=encoding) as f:
-                text = f.read()
-            metadata = {"source": file_path}
-            docs.append(Document(page_content=text, metadata=metadata))
-        return docs
+    def _from_file(self, file, encoding: Optional[str] = None) -> str:
+        '''Load text from a path or file-like object.'''
+        if hasattr(file, 'name'):
+            file_path = file.name
+        else:
+            file_path = file
+        with open(file_path, encoding=encoding) as f:
+            text = f.read()
+        return text

-    def _from_urls(self, urls: List[str]) -> List[Document]:
+    def _from_url(self, url: str) -> str:
         from langchain.document_loaders import UnstructuredURLLoader
-        loader = UnstructuredURLLoader(urls=urls)
-        docs = loader.load()
-        return docs
+        loader = UnstructuredURLLoader(urls=[url])
+        doc = loader.load()[0]
+        return doc.page_content
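For orientation, a minimal usage sketch of the reworked operator. The direct instantiation, the sample path and URL, and the downstream re-chunking are illustrative assumptions, not part of the diff; note also that __call__ is still annotated as returning List[str] but now returns a single string.

    # Minimal sketch (assumed usage): the operator now takes a single path or
    # URL and returns raw text instead of a list of pre-split chunks.
    loader = TextLoader()  # the class from the diff above

    # Local file: routed to _from_file, which reads the file as text.
    text = loader('example.txt')                  # hypothetical path

    # URL: the 'http' prefix routes to _from_url / UnstructuredURLLoader.
    page = loader('https://example.com/post')     # hypothetical URL

    # Chunking was removed from the operator, so a caller wanting the old
    # 300-character chunks would now split downstream (assumption):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    chunks = RecursiveCharacterTextSplitter(chunk_size=300).split_text(text)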
@@ -1,2 +1,3 @@
 langchain>=0.0.151
 unstructured
+pdf2image
[Image file replaced — before: 122 KiB, after: 38 KiB]