
Add docs reader

Signed-off-by: junjie.jiang <junjie.jiang@zilliz.com>
Branch: main
Author: junjie.jiang
Commit: 6e8e0ac4ac
  1. README.md (+61)
  2. __init__.py (+19)
  3. docs_reader.py (+68)
  4. requirements.txt (+2)

README.md (+61)

@@ -1,2 +1,63 @@
# readthedocs
*author: junjie.jiang*
<br />
## Description
Get the list of document pages for a single Read the Docs project.
<br />
## Code Example
### Example
```python
from towhee import DataLoader, pipe, ops

p = (
    pipe.input('url')
        .map('url', 'text', ops.text_loader())
        .flat_map('text', 'sentence', text_split_op)
        .map('sentence', 'embedding', ops.sentence_embedding.transformers(model_name='all-MiniLM-L6-v2'))
        .map('embedding', 'embedding', ops.towhee.np_normalize())
        .output('embedding')
)

# process the crawled pages one by one
for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
    print(p(data).to_list(kv_format=True))

# batch
for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html'), batch_size=10):
    p.batch(data)
```
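Note that `text_split_op` in the pipeline above is not provided by this operator; it stands for any callable that splits the loaded page text into sentences. A minimal placeholder sketch (a naive line-based split, purely illustrative):

```python
def text_split_op(text: str):
    # Naive placeholder: one "sentence" per non-empty line.
    # Swap in a real text-splitting operator for production use.
    return [line.strip() for line in text.split('\n') if line.strip()]
```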
**Parameters:**

***page_prefix:*** *str*

The root path of the pages. The crawled links are generally relative paths, so the complete URL is built by joining this root path with each relative path.

***index_page:*** *str*

The main page that contains links to all the other pages. If None, `page_prefix` is used.
Example: https://towhee.readthedocs.io/en/latest/

***include:*** *Union[List[str], str]*

Only include URLs that match this pattern (or any pattern in the list).

***exclude:*** *Union[List[str], str]*

Filter out URLs that match this pattern (or any pattern in the list).
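The `include` and `exclude` patterns are regular expressions evaluated with `re.search`, and either a single string or a list of strings is accepted. A short sketch of combining them (the URL and patterns below are only examples):

```python
from towhee import DataLoader, ops

# Keep only HTML pages, but skip the generated index and search pages.
src = ops.data_source.readthedocs(
    'https://towhee.readthedocs.io/en/latest/',  # page_prefix (also used as index_page)
    include='.*html',
    exclude=['genindex', 'search'],
)

for url in DataLoader(src):
    print(url)
```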

__init__.py (+19)

@@ -0,0 +1,19 @@
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .docs_reader import DocsReader


def readthedocs(*args, **kwargs):
    # Factory that constructs the DocsReader operator.
    return DocsReader(*args, **kwargs)

docs_reader.py (+68)

@@ -0,0 +1,68 @@
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
import re
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    def __init__(self,
                 page_prefix: str,
                 index_page: str = None,
                 include: Union[List[str], str] = '.*',
                 exclude: Union[List[str], str] = None
                 ):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # No pattern means no match; accept a single pattern or a list of patterns.
            if patterns is None or not patterns:
                return False
            if isinstance(patterns, str):
                patterns = [patterns]
            return any(re.search(pattern, x) for pattern in patterns)

        # Fetch the index page and collect every anchor on it.
        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # Drop the fragment part and skip anchors without an href.
            link = doc.get('href', '').split('#')[0]
            if link.strip() == '':
                continue
            if _match(self._include, link) and not _match(self._exclude, link):
                links.append(link)

        # Deduplicate, then yield absolute URLs.
        links = set(links)
        for link in links:
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)
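For a quick local check, the reader can also be called directly; `__call__` is a generator yielding absolute page URLs. A minimal sketch (the URL and patterns are only examples, and fetching them requires network access):

```python
from docs_reader import DocsReader

reader = DocsReader(
    page_prefix='https://towhee.readthedocs.io/en/latest/',
    include='.*html',
    exclude='genindex',
)

for url in reader():
    print(url)
```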

requirements.txt (+2)

@@ -0,0 +1,2 @@
requests
bs4