# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List, Optional, Union
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    """Scrape an index page and yield absolute links to documentation pages.

    Args:
        page_prefix: Base URL used to resolve relative links.
        index_page: Page to scrape for anchor tags; defaults to ``page_prefix``.
        include: Regex pattern(s); when given, only matching links are kept.
        exclude: Regex pattern(s); matching links are dropped.
    """

    def __init__(self,
                 page_prefix: str,
                 index_page: Optional[str] = None,
                 include: Union[List[str], str] = '',
                 exclude: Optional[Union[List[str], str]] = None):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # Empty or None pattern lists match nothing.
            if not patterns:
                return False
            if isinstance(patterns, str):
                patterns = [patterns]
            return any(re.search(pattern, x) for pattern in patterns)

        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = set()
        for doc in soup.find_all('a'):
            # Skip anchors without an href and strip in-page fragments.
            link = doc.get('href', '').split('#')[0]
            if not link.strip():
                continue
            # An empty include filter keeps every link that is not excluded.
            keep = not self._include or _match(self._include, link)
            if keep and not _match(self._exclude, link):
                links.add(link)

        for link in links:
            if link.startswith('http'):
                yield link
            else:
                # Resolve relative links against the configured prefix.
                yield parse.urljoin(self._page_prefix, link)
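

# Minimal usage sketch: the URL and regex patterns below are illustrative
# placeholders, not values taken from this operator or the Towhee API.
if __name__ == '__main__':
    reader = DocsReader(page_prefix='https://example.com/docs/',
                        include=r'\.html$',
                        exclude=r'changelog')
    for url in reader():
        print(url)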