diff --git a/README.md b/README.md
index 40f3606..96229e0 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,63 @@
 # readthedocs
+*author: junjie.jiang*
+
+
+
+## Description
+
+Gets the list of documentation pages for a single Read the Docs project.
+
+
+
+## Code Example
+
+```python
+from towhee import DataLoader, pipe, ops
+
+# text_split_op is a placeholder for a user-supplied operator that splits
+# the loaded text into sentences.
+p = (
+    pipe.input('url')
+    .map('url', 'text', ops.text_loader())
+    .flat_map('text', 'sentence', text_split_op)
+    .map('sentence', 'embedding', ops.sentence_embedding.transformers(model_name='all-MiniLM-L6-v2'))
+    .map('embedding', 'embedding', ops.towhee.np_normalize())
+    .output('embedding')
+)
+
+for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
+    print(p(data).to_list(kv_format=True))
+
+# batch
+for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html'), batch_size=10):
+    p.batch(data)
+```
+
+**Parameters:**
+
+***page_prefix:*** *str*
+
+The root URL of the documentation pages. Crawled links are generally relative paths, so each complete URL is built by joining this root URL with the relative path.
+
+***index_page:*** *str*
+
+The main page containing links to all the other pages. If None, `page_prefix` is used.
+
+Example: https://towhee.readthedocs.io/en/latest/
+
+***include:*** *Union[List[str], str]*
+
+Only keep URLs that match at least one of these regular expressions.
+
+***exclude:*** *Union[List[str], str]*
+
+Filter out URLs that match any of these regular expressions.
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..af812ea
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2023 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .docs_reader import DocsReader
+
+
+def readthedocs(*args, **kwargs):
+    return DocsReader(*args, **kwargs)
diff --git a/docs_reader.py b/docs_reader.py
new file mode 100644
index 0000000..c9191c9
--- /dev/null
+++ b/docs_reader.py
@@ -0,0 +1,68 @@
+# Copyright 2023 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
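+
+# DocsReader crawls the index page of a Read the Docs project, collects the
+# anchor links it contains, filters them with the include/exclude regular
+# expressions, and yields one absolute URL per documentation page.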
+
+from typing import List, Union
+import re
+from urllib import parse
+
+import requests
+from bs4 import BeautifulSoup
+
+from towhee.operator import PyOperator
+
+
+class DocsReader(PyOperator):
+    def __init__(self,
+                 page_prefix: str,
+                 index_page: str = None,
+                 include: Union[List[str], str] = '.*',
+                 exclude: Union[List[str], str] = None
+                 ):
+        self._page_prefix = page_prefix
+        self._index_page = page_prefix if index_page is None else index_page
+        self._include = include
+        self._exclude = exclude
+
+    def __call__(self):
+        def _match(patterns, x):
+            # No patterns means no match, so a None exclude filters nothing.
+            if patterns is None or not patterns:
+                return False
+
+            if isinstance(patterns, str):
+                patterns = [patterns]
+
+            return any(re.search(pattern, x) for pattern in patterns)
+
+        response = requests.get(self._index_page)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+        docs = soup.find_all('a')
+
+        links = []
+        for doc in docs:
+            # Some anchors carry no href; strip fragment identifiers like '#section'.
+            link = doc.get("href", "").split('#')[0]
+            if link.strip() == '':
+                continue
+
+            if _match(self._include, link) and not _match(self._exclude, link):
+                links.append(link)
+
+        # Deduplicate, then resolve relative links against the page prefix.
+        links = set(links)
+        for link in links:
+            if link.startswith('http'):
+                yield link
+            else:
+                yield parse.urljoin(self._page_prefix, link)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..dc1536f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests
+beautifulsoup4
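
For reference, a minimal sketch of driving `DocsReader` directly, outside a Towhee pipeline. It relies only on the constructor and `__call__` shown in `docs_reader.py` above; the `include`/`exclude` patterns are illustrative placeholders, not part of the operator.

```python
# Minimal sketch: crawl a Read the Docs index and print the page URLs.
# The include/exclude patterns here are illustrative, not prescriptive.
from docs_reader import DocsReader

reader = DocsReader(
    page_prefix='https://towhee.readthedocs.io/en/latest/',
    include=r'\.html$',            # keep only .html pages
    exclude=r'genindex|search',    # drop generated index/search pages
)

for url in reader():               # __call__ yields absolute URLs
    print(url)
```

Note that filtering happens before relative links are resolved: patterns are matched against the raw `href` values (with any `#fragment` stripped), not against the final absolute URLs.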