diff --git a/README.md b/README.md
index 40f3606..96229e0 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,63 @@
# readthedocs
+*author: junjie.jiang*
+
+
+
+
+## Description
+
+Gets the list of document page URLs for a single Read the Docs project.
+
+
+
+
+## Code Example
+
+
+```python
+
+from towhee import DataLoader, pipe, ops
+
+# Any callable that splits a document into sentences works for the
+# flat_map step; a simple line-based splitter stands in here.
+def text_split_op(text):
+    return [s for s in text.split('\n') if s.strip()]
+
+p = (
+    pipe.input('url')
+    .map('url', 'text', ops.text_loader())
+    .flat_map('text', 'sentence', text_split_op)
+    .map('sentence', 'embedding', ops.sentence_embedding.transformers(model_name='all-MiniLM-L6-v2'))
+    .map('embedding', 'embedding', ops.towhee.np_normalize())
+    .output('embedding')
+)
+
+
+# one page at a time
+
+for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html')):
+ print(p(data).to_list(kv_format=True))
+
+# batch
+for data in DataLoader(ops.data_source.readthedocs('https://towhee.readthedocs.io/en/latest/', include='.*html'), batch_size=10):
+ p.batch(data)
+```
+
+**Parameters:**
+
+
+***page_prefix:*** *str*
+
+The root path of the pages. Crawled links are generally relative paths, so the complete URL is obtained by joining this root path with each relative path.
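+
+Internally, relative links are joined onto `page_prefix` with `urllib.parse.urljoin`; a quick illustration (the relative path below is just an example):
+
+```python
+from urllib import parse
+
+# a relative link found on the index page, joined onto page_prefix
+parse.urljoin('https://towhee.readthedocs.io/en/latest/', 'quick-start.html')
+# -> 'https://towhee.readthedocs.io/en/latest/quick-start.html'
+```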
+
+***index_page:*** *str*
+
+The main page that contains links to all the other pages. If None, `page_prefix` is used.
+
+Example: https://towhee.readthedocs.io/en/latest/
+
+***include:*** *Union[List[str], str]*
+
+Only URLs that match this pattern (a regular expression or a list of regular expressions) are included.
+
+***exclude:*** *Union[List[str], str]*
+
+URLs that match this pattern (a regular expression or a list of regular expressions) are filtered out.
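+
+Both filters are applied with `re.search`, so plain substrings and full regular expressions both work. A sketch combining the two (the patterns are illustrative):
+
+```python
+from towhee import ops
+
+# keep only .html pages, but skip the generated search and genindex pages
+source = ops.data_source.readthedocs(
+    'https://towhee.readthedocs.io/en/latest/',
+    include=r'\.html$',
+    exclude=['search', 'genindex'],
+)
+```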
+
+
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..af812ea
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2023 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .docs_reader import DocsReader
+
+
+def readthedocs(*args, **kwargs):
+ return DocsReader(*args, **kwargs)
diff --git a/docs_reader.py b/docs_reader.py
new file mode 100644
index 0000000..c9191c9
--- /dev/null
+++ b/docs_reader.py
@@ -0,0 +1,68 @@
+# Copyright 2023 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Union
+import re
+from urllib import parse
+
+import requests
+from bs4 import BeautifulSoup
+
+from towhee.operator import PyOperator
+
+
+class DocsReader(PyOperator):
+ def __init__(self,
+ page_prefix: str,
+ index_page: str = None,
+ include: Union[List[str], str] = '.*',
+ exclude: Union[List[str], str] = None
+ ):
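+        """
+        Args:
+            page_prefix (str): Root URL used to resolve relative links.
+            index_page (str): Page crawled for links; defaults to `page_prefix`.
+            include (Union[List[str], str]): Regex pattern(s) a URL must match to be kept.
+            exclude (Union[List[str], str]): Regex pattern(s) that filter a URL out.
+        """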
+
+ self._page_prefix = page_prefix
+ self._index_page = page_prefix if index_page is None else index_page
+ self._include = include
+ self._exclude = exclude
+
+ def __call__(self):
+ def _match(patterns, x):
+            # None or an empty pattern list never matches
+            if not patterns:
+                return False
+
+ if isinstance(patterns, str):
+ patterns = [patterns]
+
+ return any(re.search(pattern, x) for pattern in patterns)
+
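+        # fetch the index page and collect every anchor tag on it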
+        response = requests.get(self._index_page, timeout=60)  # avoid hanging on a dead server
+ response.raise_for_status()
+ soup = BeautifulSoup(response.text, "html.parser")
+ docs = soup.find_all('a')
+
+        links = []
+        for doc in docs:
+            # anchors without an href attribute would raise KeyError; skip them
+            href = doc.get('href')
+            if href is None:
+                continue
+
+            # drop in-page fragments like `#section`
+            link = href.split('#')[0]
+            if link.strip() == '':
+                continue
+
+            if _match(self._include, link) and not _match(self._exclude, link):
+                links.append(link)
+
+        # de-duplicate, then yield absolute URLs
+        links = set(links)
+        for link in links:
+            if link.startswith('http'):
+                yield link
+            else:
+                yield parse.urljoin(self._page_prefix, link)
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..dc1536f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests
+beautifulsoup4