readthedocs
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import List, Union
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    """Crawl a documentation index page and yield the URLs of matching doc pages."""

    def __init__(self,
                 page_prefix: str,
                 index_page: str = None,
                 include: Union[List[str], str] = '.*',
                 exclude: Union[List[str], str] = None
                 ):
        self._page_prefix = page_prefix
        # Fall back to the page prefix when no explicit index page is given.
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # An empty or missing pattern list matches nothing.
            if patterns is None or not patterns:
                return False

            # Accept a single pattern string as well as a list of patterns.
            if isinstance(patterns, str):
                patterns = [patterns]

            return any(re.search(pattern, x) for pattern in patterns)

        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # Skip anchors without an href and drop any fragment (#section) part.
            link = doc.get('href', '').split('#')[0]
            if link.strip() == '':
                continue

            if _match(self._include, link) and not _match(self._exclude, link):
                links.append(link)

        # Deduplicate, then resolve relative links against the page prefix.
        links = set(links)
        for link in links:
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)
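

# Minimal usage sketch: run the operator directly against a documentation
# index. The URL and the include/exclude patterns below are illustrative
# assumptions, not values taken from this repo.
if __name__ == '__main__':
    reader = DocsReader(
        page_prefix='https://requests.readthedocs.io/en/latest/',  # assumed docs site
        include=r'\.html$',                 # keep only .html pages
        exclude=[r'genindex', r'search'],   # skip generated index/search pages
    )
    # __call__ returns a generator of absolute page URLs.
    for url in reader():
        print(url)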