4 changed files with 150 additions and 0 deletions
@ -1,2 +1,63 @@

# readthedocs

*author: junjie.jiang*

<br />

## Description

Get the list of documents for a single Read the Docs project.

<br />

## Code Example

### Example

```python
from towhee import DataLoader, pipe, ops

# text_split_op can be any callable that splits a document into sentences;
# a simple line-based split is used here as a placeholder.
def text_split_op(text):
    return [s.strip() for s in text.split('\n') if s.strip()]

p = (
    pipe.input('url')
        .map('url', 'text', ops.text_loader())
        .flat_map('text', 'sentence', text_split_op)
        .map('sentence', 'embedding', ops.sentence_embedding.transformers(model_name='all-MiniLM-L6-v2'))
        .map('embedding', 'embedding', ops.towhee.np_normalize())
        .output('embedding')
)

# process pages one by one
for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
    print(p(data).to_list(kv_format=True))

# batch
for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html'), batch_size=10):
    p.batch(data)
```

**Parameters:**

***page_prefix:*** *str*

The root path of the pages. The crawled links are generally relative paths, so the complete URL is obtained by joining this root path with each relative path (see the example below).

***index_page:*** *str*

The main page that contains links to all the other pages. If None, `page_prefix` is used.

Example: https://towhee.readthedocs.io/en/latest/

***include:*** *Union[List[str], str]*

Only keep URLs that match this pattern (a regular expression, or a list of regular expressions).

***exclude:*** *Union[List[str], str]*

Drop URLs that match this pattern (a regular expression, or a list of regular expressions).

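For instance, here is a rough sketch of how these parameters fit together: the index page is fetched, every link on it is filtered by `include`/`exclude`, and relative links are joined onto `page_prefix`. The index page URL and the patterns below are only illustrative values, not requirements.

```python
from towhee import DataLoader, ops

# Illustrative sketch only: crawl the English "latest" docs, keep HTML pages,
# and skip the generated index/search pages.
source = ops.data_source.readthedocs(
    page_prefix='https://towhee.readthedocs.io/en/latest/',
    index_page='https://towhee.readthedocs.io/en/latest/index.html',  # assumed index page
    include='.*html',                # regex: keep only *.html links
    exclude=['genindex', 'search'],  # regex list: drop generated pages
)

for url in DataLoader(source):
    # Each item is the absolute URL of one documentation page.
    print(url)
```
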
@ -0,0 +1,19 @@

# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .docs_reader import DocsReader


def readthedocs(*args, **kwargs):
    return DocsReader(*args, **kwargs)
@ -0,0 +1,68 @@

# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union
import re
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    """Data source that yields the page URLs of a Read the Docs project."""

    def __init__(self,
                 page_prefix: str,
                 index_page: str = None,
                 include: Union[List[str], str] = '.*',
                 exclude: Union[List[str], str] = None
                 ):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # No pattern means "match nothing".
            if not patterns:
                return False

            if isinstance(patterns, str):
                patterns = [patterns]

            return any(re.search(pattern, x) for pattern in patterns)

        # Fetch the index page and collect all <a> links on it.
        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # Drop in-page anchors such as "page.html#section";
            # skip <a> tags that have no href at all.
            link = doc.get("href", "").split('#')[0]
            if link.strip() == '':
                continue

            if _match(self._include, link) and not _match(self._exclude, link):
                links.append(link)

        # Deduplicate, then resolve relative links against the page prefix.
        links = set(links)
        for link in links:
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)

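
# Rough smoke test, not part of the operator itself: run this module directly
# to print the resolved page URLs of a project. The project URL and the
# include pattern below are only examples.
if __name__ == '__main__':
    reader = DocsReader('https://towhee.readthedocs.io/en/latest/', include='.*html')
    for url in reader():
        print(url)
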
@ -0,0 +1,2 @@

requests
bs4