# readthedocs

*author: junjie.jiang*

<br />

## Description

Get the list of document URLs for a single Read the Docs project.
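
For instance, a minimal sketch (using the Towhee documentation site, as in the example below) that only lists the crawled document URLs without building an embedding pipeline:

```python
from towhee import DataLoader, ops

# each item yielded by the data source is one document URL of the project
for url in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
    print(url)
```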

<br />

## Code Example

### Example

```python
from towhee import DataLoader, pipe, ops

p = (
    pipe.input('url')
        .map('url', 'text', ops.text_loader())
        # text_split_op: an operator of your choice that splits the loaded text into sentences
        .flat_map('text', 'sentence', text_split_op)
        .map('sentence', 'embedding', ops.sentence_embedding.transformers(model_name='all-MiniLM-L6-v2'))
        .map('embedding', 'embedding', ops.towhee.np_normalize())
        .output('embedding')
)

for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
    print(p(data).to_list(kv_format=True))

# batch
for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html'), batch_size=10):
    p.batch(data)
```

**Parameters:**

***page_prefix:*** *str*

The root URL of the documentation pages. The crawled links are generally relative paths, so the complete URL is obtained by joining this root path with each relative path.

***index_page:*** *str*

The main page that contains links to all the other pages. If None, `page_prefix` is used.

example: https://towhee.readthedocs.io/en/latest/

***include:*** *Union[List[str], str]*

Only keep URLs that match this pattern (or any of these patterns).

***exclude:*** *Union[List[str], str]*

Filter out URLs that match this pattern (or any of these patterns).
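
The patterns are applied with `re.search` (see the implementation below), and relative links are completed with `page_prefix`. A small illustration with made-up link values:

```python
import re
from urllib import parse

page_prefix = 'https://towhee.readthedocs.io/en/latest/'
# made-up links as they might appear on an index page
links = ['index.html', 'user_guide/quick_start.html', 'genindex.html', 'https://github.com/towhee-io/towhee']

include = r'\.html$'   # keep only html pages
exclude = r'genindex'  # illustrative pattern: drop the generated index page

for link in links:
    if re.search(include, link) and not re.search(exclude, link):
        # relative links are joined onto the page prefix
        print(parse.urljoin(page_prefix, link))
# https://towhee.readthedocs.io/en/latest/index.html
# https://towhee.readthedocs.io/en/latest/user_guide/quick_start.html
```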

<br />

Operator entry point (factory function):

```python
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .docs_reader import DocsReader


def readthedocs(*args, **kwargs):
    return DocsReader(*args, **kwargs)
```
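
Since the factory just forwards its arguments to `DocsReader`, all of the parameters documented above can also be passed when the operator is created through the hub. A sketch with an illustrative exclude pattern:

```python
from towhee import ops

# the page prefix and the include/exclude patterns end up as DocsReader's constructor arguments
src = ops.data_source.readthedocs(
    'https://towhee.readthedocs.io/en/latest/',
    include='.*html',
    exclude='search.html',   # illustrative pattern, not part of the original example
)
```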

`docs_reader.py`:

```python
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union
import re
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    def __init__(self,
                 page_prefix: str,
                 index_page: str = None,
                 include: Union[List[str], str] = '.*',
                 exclude: Union[List[str], str] = None
                 ):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # no pattern means no match
            if patterns is None or not patterns:
                return False

            if isinstance(patterns, str):
                patterns = [patterns]

            return any(re.search(pattern, x) for pattern in patterns)

        # fetch the index page and collect all anchor tags
        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # drop the fragment part of the link
            link = doc["href"].split('#')[0]
            if link.strip() == '':
                continue

            if _match(self._include, link) and not _match(self._exclude, link):
                links.append(link)

        # deduplicate and yield absolute URLs
        links = set(links)
        for link in links:
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)
```
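
As a rough sketch of using `DocsReader` on its own (outside a Towhee pipeline), assuming the same documentation site as above and illustrative include/exclude patterns:

```python
# uses the DocsReader class defined above
reader = DocsReader(
    page_prefix='https://towhee.readthedocs.io/en/latest/',
    include=r'\.html$',    # keep only links that end in .html
    exclude=r'genindex',   # illustrative pattern: drop the generated index page
)

# __call__ is a generator: it fetches the index page once and
# yields one absolute document URL per matching link
for url in reader():
    print(url)
```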

Requirements:

```
requests
bs4
```