# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List, Optional, Union
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    """Scrape an index page and yield absolute links to documentation pages.

    Args:
        page_prefix: Base URL used to resolve relative links.
        index_page: Page to scrape for anchor tags; defaults to ``page_prefix``.
        include: Regex pattern(s); when given, only matching links are kept.
        exclude: Regex pattern(s); matching links are dropped.
    """

    def __init__(self,
                 page_prefix: str,
                 index_page: Optional[str] = None,
                 include: Union[List[str], str] = '',
                 exclude: Optional[Union[List[str], str]] = None):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # Empty or None pattern lists match nothing.
            if not patterns:
                return False
            if isinstance(patterns, str):
                patterns = [patterns]
            return any(re.search(pattern, x) for pattern in patterns)

        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = set()
        for doc in soup.find_all('a'):
            # Skip anchors without an href and strip in-page fragments.
            link = doc.get('href', '').split('#')[0]
            if not link.strip():
                continue
            # An empty include filter keeps every link that is not excluded.
            keep = not self._include or _match(self._include, link)
            if keep and not _match(self._exclude, link):
                links.add(link)

        for link in links:
            if link.startswith('http'):
                yield link
            else:
                # Resolve relative links against the configured prefix.
                yield parse.urljoin(self._page_prefix, link)
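

# Minimal usage sketch: the URL and regex patterns below are illustrative
# placeholders, not values taken from this operator or the Towhee API.
if __name__ == '__main__':
    reader = DocsReader(page_prefix='https://example.com/docs/',
                        include=r'\.html$',
                        exclude=r'changelog')
    for url in reader():
        print(url)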