from towhee.operator import PyOperator
from typing import List, Optional


class TextLoader(PyOperator):
    '''Load the text content of a URL or of a local file.

    The source may be an ``http://`` / ``https://`` URL string, a filesystem
    path (``str`` or path-like object) or a file-like object.
    '''
    def __init__(self) -> None:
        super().__init__()

    def __call__(self, data_src) -> str:
        '''Return the text found at ``data_src``.

        Args:
            data_src: URL string, filesystem path, or file-like object.

        Returns:
            The loaded text as a single string.
        '''
        # Only strings can be URLs. Guarding with isinstance keeps file-like
        # and path-like sources from crashing on a missing ``startswith``.
        # The full scheme is matched so that a local file whose name merely
        # begins with "http" is still read from disk.
        if isinstance(data_src, str) and data_src.startswith(('http://', 'https://')):
            return self._from_url(data_src)
        return self._from_file(data_src)

    def _from_file(self, file, encoding: Optional[str] = None) -> str:
        '''Read and return the whole text of a path or file-like object.

        Args:
            file: Filesystem path (``str`` or path-like), an object exposing
                the path of the underlying file as ``.name``, or a readable
                file-like object without a name.
            encoding: Text encoding handed to ``open``; ``None`` keeps the
                default used by ``open``.

        Returns:
            The content as a string.
        '''
        if hasattr(file, '__fspath__'):
            # Path-like objects (e.g. pathlib.Path) also expose ``.name``, but
            # there it is only the final path component, so they must be
            # recognised before the ``.name`` check below.
            file_path = file
        elif hasattr(file, 'name'):
            # File object: reopen it by name so that it is read in text mode
            # with the requested encoding, whatever mode it was opened in.
            file_path = file.name
        elif hasattr(file, 'read'):
            # Nameless in-memory stream (e.g. io.StringIO / io.BytesIO): there
            # is nothing to reopen, so consume the stream directly.
            content = file.read()
            if isinstance(content, bytes):
                content = content.decode(encoding or 'utf-8')
            return content
        else:
            # Plain path; ``open`` raises if the value is not usable.
            file_path = file
        with open(file_path, encoding=encoding) as f:
            return f.read()

    def _from_url(self, url: str) -> str:
        '''Fetch ``url`` with Langchain's UnstructuredURLLoader and return its text.

        Args:
            url: Address of the page to load.

        Returns:
            The ``page_content`` of the first loaded document.

        Raises:
            IndexError: If the loader returned no document for ``url``.
        '''
        # Imported lazily so that langchain is only needed for URL sources.
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=[url])
        docs = loader.load()
        if not docs:
            # IndexError is kept (it is what ``[0]`` raised before), but with
            # a message that names the failing url.
            raise IndexError(f'No content could be loaded from url: {url}')
        return docs[0].page_content