readthedocs
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List, Union
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    """Crawl a documentation index page and yield the URLs of matching doc pages."""

    def __init__(self,
                 page_prefix: str,
                 index_page: str = None,
                 include: Union[List[str], str] = '.*',
                 exclude: Union[List[str], str] = None
                 ):
        self._page_prefix = page_prefix
        # Fall back to the page prefix when no explicit index page is given.
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # An empty or missing pattern list matches nothing.
            if patterns is None or not patterns:
                return False

            # Accept a single pattern string as well as a list of patterns.
            if isinstance(patterns, str):
                patterns = [patterns]

            return any(re.search(pattern, x) for pattern in patterns)

        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # Skip anchors without an href and drop any fragment (#section) part.
            link = doc.get('href', '').split('#')[0]
            if link.strip() == '':
                continue

            if _match(self._include, link) and not _match(self._exclude, link):
                links.append(link)

        # Deduplicate, then resolve relative links against the page prefix.
        links = set(links)
        for link in links:
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)
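

# Minimal usage sketch: run the operator directly against a documentation
# index. The URL and the include/exclude patterns below are illustrative
# assumptions, not values taken from this repo.
if __name__ == '__main__':
    reader = DocsReader(
        page_prefix='https://requests.readthedocs.io/en/latest/',  # assumed docs site
        include=r'\.html$',                 # keep only .html pages
        exclude=[r'genindex', r'search'],   # skip generated index/search pages
    )
    # __call__ returns a generator of absolute page URLs.
    for url in reader():
        print(url)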