# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Union
import re
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator

class DocsReader(PyOperator):
    """Read a documentation index page and yield the absolute URLs of the
    pages it links to, filtered by optional include/exclude regex patterns.
    """

    def __init__(self,
                 page_prefix: str,
                 index_page: Optional[str] = None,
                 include: Union[List[str], str] = '',
                 exclude: Optional[Union[List[str], str]] = None):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # An empty or missing pattern set matches nothing.
            if patterns is None or not patterns:
                return False
            if isinstance(patterns, str):
                patterns = [patterns]
            return any(re.search(pattern, x) for pattern in patterns)

        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # Skip anchors without an href and strip URL fragments.
            link = doc.get('href', '').split('#')[0]
            if link.strip() == '':
                continue
            # An empty include pattern keeps every link; otherwise the link
            # must match include and must not match exclude.
            if ((not self._include or _match(self._include, link))
                    and not _match(self._exclude, link)):
                links.append(link)

        # Deduplicate, then resolve relative links against the page prefix.
        for link in set(links):
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)
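

# --- Usage sketch (not part of the operator itself) ---
# A minimal example of running the reader standalone; the URL and the
# include/exclude patterns below are hypothetical placeholders, not values
# from this repository.
if __name__ == '__main__':
    reader = DocsReader(
        page_prefix='https://docs.example.com/',
        include=r'\.html$',    # keep only links ending in .html
        exclude=r'changelog'   # drop any changelog pages
    )
    for url in reader():
        print(url)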