
Add docs reader

Signed-off-by: junjie.jiang <junjie.jiang@zilliz.com>
Branch: main
Author: junjie.jiang
Commit: 6e8e0ac4ac
  1. README.md (+61)
  2. __init__.py (+19)
  3. docs_reader.py (+68)
  4. requirements.txt (+2)

README.md (+61)

@@ -1,2 +1,63 @@
# readthedocs
*author: junjie.jiang*
<br />
## Description
Get the list of document pages for a single Read the Docs project.
<br />
## Code Example
### Example
```python
from towhee import DataLoader, pipe, ops

p = (
    pipe.input('url')
        .map('url', 'text', ops.text_loader())
        .flat_map('text', 'sentence', text_split_op)
        .map('sentence', 'embedding', ops.sentence_embedding.transformers(model_name='all-MiniLM-L6-v2'))
        .map('embedding', 'embedding', ops.towhee.np_normalize())
        .output('embedding')
)

# process the crawled pages one by one
for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
    print(p(data).to_list(kv_format=True))

# batch
for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html'), batch_size=10):
    p.batch(data)
```
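Note that `text_split_op` in the pipeline above is not provided by this operator; it stands for any callable that splits the loaded page text into sentences. A minimal placeholder sketch (a naive line-based split, purely illustrative):

```python
def text_split_op(text: str):
    # Naive placeholder: one "sentence" per non-empty line.
    # Swap in a real text-splitting operator for production use.
    return [line.strip() for line in text.split('\n') if line.strip()]
```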
**Parameters:**

***page_prefix:*** *str*

The root path of the pages. The crawled links are generally relative paths, so the complete URL is built by joining this root path with each relative path.

***index_page:*** *str*

The main page that contains links to all the other pages. If None, `page_prefix` is used.
Example: https://towhee.readthedocs.io/en/latest/

***include:*** *Union[List[str], str]*

Only include URLs that match this pattern (or any pattern in the list).

***exclude:*** *Union[List[str], str]*

Filter out URLs that match this pattern (or any pattern in the list).
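The `include` and `exclude` patterns are regular expressions evaluated with `re.search`, and either a single string or a list of strings is accepted. A short sketch of combining them (the URL and patterns below are only examples):

```python
from towhee import DataLoader, ops

# Keep only HTML pages, but skip the generated index and search pages.
src = ops.data_source.readthedocs(
    'https://towhee.readthedocs.io/en/latest/',  # page_prefix (also used as index_page)
    include='.*html',
    exclude=['genindex', 'search'],
)

for url in DataLoader(src):
    print(url)
```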

__init__.py (+19)

@@ -0,0 +1,19 @@
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .docs_reader import DocsReader


def readthedocs(*args, **kwargs):
    # Factory that constructs the DocsReader operator.
    return DocsReader(*args, **kwargs)

docs_reader.py (+68)

@@ -0,0 +1,68 @@
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
import re
from urllib import parse

import requests
from bs4 import BeautifulSoup

from towhee.operator import PyOperator


class DocsReader(PyOperator):
    def __init__(self,
                 page_prefix: str,
                 index_page: str = None,
                 include: Union[List[str], str] = '.*',
                 exclude: Union[List[str], str] = None
                 ):
        self._page_prefix = page_prefix
        self._index_page = page_prefix if index_page is None else index_page
        self._include = include
        self._exclude = exclude

    def __call__(self):
        def _match(patterns, x):
            # No pattern means no match; accept a single pattern or a list of patterns.
            if patterns is None or not patterns:
                return False
            if isinstance(patterns, str):
                patterns = [patterns]
            return any(re.search(pattern, x) for pattern in patterns)

        # Fetch the index page and collect every anchor on it.
        response = requests.get(self._index_page)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        docs = soup.find_all('a')

        links = []
        for doc in docs:
            # Drop the fragment part and skip anchors without an href.
            link = doc.get('href', '').split('#')[0]
            if link.strip() == '':
                continue
            if _match(self._include, link) and not _match(self._exclude, link):
                links.append(link)

        # Deduplicate, then yield absolute URLs.
        links = set(links)
        for link in links:
            if link.startswith('http'):
                yield link
            else:
                yield parse.urljoin(self._page_prefix, link)
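For a quick local check, the reader can also be called directly; `__call__` is a generator yielding absolute page URLs. A minimal sketch (the URL and patterns are only examples, and fetching them requires network access):

```python
from docs_reader import DocsReader

reader = DocsReader(
    page_prefix='https://towhee.readthedocs.io/en/latest/',
    include='.*html',
    exclude='genindex',
)

for url in reader():
    print(url)
```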

requirements.txt (+2)

@@ -0,0 +1,2 @@
requests
bs4