# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Union
import re
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator

class DocsReader(PyOperator):
    """Read a documentation index page and yield the absolute URLs of the
    pages it links to, filtered by optional include/exclude regex patterns.
    """

    def __init__(self,
                 page_prefix: str,
                 index_page: Optional[str] = None,
                 include: Union[List[str], str] = '',
                 exclude: Optional[Union[List[str], str]] = None):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # An empty or missing pattern set matches nothing.
            if patterns is None or not patterns:
                return False
            if isinstance(patterns, str):
                patterns = [patterns]
            return any(re.search(pattern, x) for pattern in patterns)

        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # Skip anchors without an href and strip URL fragments.
            link = doc.get('href', '').split('#')[0]
            if link.strip() == '':
                continue
            # An empty include pattern keeps every link; otherwise the link
            # must match include and must not match exclude.
            if ((not self._include or _match(self._include, link))
                    and not _match(self._exclude, link)):
                links.append(link)

        # Deduplicate, then resolve relative links against the page prefix.
        for link in set(links):
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)
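

# --- Usage sketch (not part of the operator itself) ---
# A minimal example of running the reader standalone; the URL and the
# include/exclude patterns below are hypothetical placeholders, not values
# from this repository.
if __name__ == '__main__':
    reader = DocsReader(
        page_prefix='https://docs.example.com/',
        include=r'\.html$',    # keep only links ending in .html
        exclude=r'changelog'   # drop any changelog pages
    )
    for url in reader():
        print(url)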