# readthedocs

*author: junjie.jiang*

<br />

## Description

Get the list of document URLs for a single Read the Docs project.
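
For instance, a minimal sketch (using the Towhee documentation site, as in the example below) that only lists the crawled document URLs without building an embedding pipeline:

```python
from towhee import DataLoader, ops

# each item yielded by the data source is one document URL of the project
for url in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
    print(url)
```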

<br />

## Code Example

### Example

```python
from towhee import DataLoader, pipe, ops

p = (
    pipe.input('url')
        .map('url', 'text', ops.text_loader())
        # text_split_op: an operator of your choice that splits the loaded text into sentences
        .flat_map('text', 'sentence', text_split_op)
        .map('sentence', 'embedding', ops.sentence_embedding.transformers(model_name='all-MiniLM-L6-v2'))
        .map('embedding', 'embedding', ops.towhee.np_normalize())
        .output('embedding')
)

for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
    print(p(data).to_list(kv_format=True))

# batch
for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html'), batch_size=10):
    p.batch(data)
```

**Parameters:**

***page_prefix:*** *str*

The root URL of the documentation pages. The crawled links are generally relative paths, so the complete URL is obtained by joining this root path with each relative path.

***index_page:*** *str*

The main page that contains links to all the other pages. If None, `page_prefix` is used.

example: https://towhee.readthedocs.io/en/latest/

***include:*** *Union[List[str], str]*

Only keep URLs that match this pattern (or any of these patterns).

***exclude:*** *Union[List[str], str]*

Filter out URLs that match this pattern (or any of these patterns).
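
The patterns are applied with `re.search` (see the implementation below), and relative links are completed with `page_prefix`. A small illustration with made-up link values:

```python
import re
from urllib import parse

page_prefix = 'https://towhee.readthedocs.io/en/latest/'
# made-up links as they might appear on an index page
links = ['index.html', 'user_guide/quick_start.html', 'genindex.html', 'https://github.com/towhee-io/towhee']

include = r'\.html$'   # keep only html pages
exclude = r'genindex'  # illustrative pattern: drop the generated index page

for link in links:
    if re.search(include, link) and not re.search(exclude, link):
        # relative links are joined onto the page prefix
        print(parse.urljoin(page_prefix, link))
# https://towhee.readthedocs.io/en/latest/index.html
# https://towhee.readthedocs.io/en/latest/user_guide/quick_start.html
```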

<br />

Operator entry point (factory function):

```python
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .docs_reader import DocsReader


def readthedocs(*args, **kwargs):
    return DocsReader(*args, **kwargs)
```
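
Since the factory just forwards its arguments to `DocsReader`, all of the parameters documented above can also be passed when the operator is created through the hub. A sketch with an illustrative exclude pattern:

```python
from towhee import ops

# the page prefix and the include/exclude patterns end up as DocsReader's constructor arguments
src = ops.data_source.readthedocs(
    'https://towhee.readthedocs.io/en/latest/',
    include='.*html',
    exclude='search.html',   # illustrative pattern, not part of the original example
)
```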

`docs_reader.py`:

```python
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union
import re
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    def __init__(self,
                 page_prefix: str,
                 index_page: str = None,
                 include: Union[List[str], str] = '.*',
                 exclude: Union[List[str], str] = None
                 ):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # no pattern means no match
            if patterns is None or not patterns:
                return False

            if isinstance(patterns, str):
                patterns = [patterns]

            return any(re.search(pattern, x) for pattern in patterns)

        # fetch the index page and collect all anchor tags
        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # drop the fragment part of the link
            link = doc["href"].split('#')[0]
            if link.strip() == '':
                continue

            if _match(self._include, link) and not _match(self._exclude, link):
                links.append(link)

        # deduplicate and yield absolute URLs
        links = set(links)
        for link in links:
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)
```
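
As a rough sketch of using `DocsReader` on its own (outside a Towhee pipeline), assuming the same documentation site as above and illustrative include/exclude patterns:

```python
# uses the DocsReader class defined above
reader = DocsReader(
    page_prefix='https://towhee.readthedocs.io/en/latest/',
    include=r'\.html$',    # keep only links that end in .html
    exclude=r'genindex',   # illustrative pattern: drop the generated index page
)

# __call__ is a generator: it fetches the index page once and
# yields one absolute document URL per matching link
for url in reader():
    print(url)
```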

Requirements:

```
requests
bs4
```