logo
Browse Source

Add files

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
support-stopwords
Jael Gu 3 years ago
parent
commit
145fb1a24f
  1. 37
      README.md
  2. 4
      __init__.py
  3. 53
      es_index.py

37
README.md

@ -1,2 +1,37 @@
# index-client
# ElasticSearch Index
## Description
The index operator indexes the given documents in ElasticSearch so that they are ready for retrieval.
It accepts either a single document as a dictionary or a list of documents (dictionaries) as input.
For each document, the index automatically generates a unique id.
To use this operator, you need to [set up ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html) in advance.
## Code Example
Insert an example document into the 'test_index' index of an ElasticSearch instance at localhost:9200.
```python
from datetime import datetime
from towhee.dc2 import pipe, ops, DataCollection
# Example document to index, expressed as a plain dictionary of fields.
example_doc = {
'title': 'Test Title',
'author': 'Towhee',
'content': 'This is an example.',
'timestamp': datetime.now()
}
# Pipeline: take the input column 'doc', run it through the index operator,
# store the operator's return value in 'res', and output both columns.
es_insert = (
pipe.input('doc')
.map('doc', 'res', ops.elasticsearch.index_client(
host='localhost', port=9200, index_name='test_index'
))
.output('doc', 'res')
)
res = es_insert(example_doc) # OR: es_insert([example_doc])
DataCollection(res).show() # Optional: display output data
```

4
__init__.py

@ -0,0 +1,4 @@
from .es_index import ESIndex
def index_client(*args, **kwargs):
    """Create an ``ESIndex`` operator, forwarding every argument unchanged.

    Args:
        *args: positional arguments passed straight to ``ESIndex``.
        **kwargs: keyword arguments passed straight to ``ESIndex``.

    Returns:
        The newly constructed ``ESIndex`` instance.
    """
    operator = ESIndex(*args, **kwargs)
    return operator

53
es_index.py

@ -0,0 +1,53 @@
import logging
from typing import Union, List
from elasticsearch import Elasticsearch
import elasticsearch.helpers # type: ignore
from towhee.operator import PyOperator, SharedType # type: ignore
# Module-level logger; getLogger() called without a name returns the root logger.
logger = logging.getLogger()
class ESIndex(PyOperator):
    """
    Insert documents into an ElasticSearch index with the bulk helper.

    No `_id` is supplied with the documents, so ids are auto-generated.

    Args:
        host (`str`): host to connect ElasticSearch client
        port (`int`): port to connect ElasticSearch client
        index_name (`str`): index name to index input docs
        user (`str=None`): user name to connect ElasticSearch client, defaults to None
        password (`str=None`): user password to connect ElasticSearch client, defaults to None
        ca_certs (`str=None`): path to CA certificate, defaults to None
    """

    def __init__(self, host: str, port: int, index_name: str, user: str = None, password: str = None, ca_certs: str = None):
        super().__init__()
        self.index_name = index_name
        try:
            # NOTE(review): basic_auth is passed as (user, password) even when
            # both are None -- confirm the client accepts that for unauthenticated use.
            self.client = Elasticsearch(
                f'https://{host}:{port}',
                ca_certs=ca_certs,
                basic_auth=(user, password))
            logger.info('Successfully connected to ElasticSearch client.')
        except Exception as e:
            # The exception must go through a %s placeholder; passing it as an
            # extra argument without one makes logging fail to format the record.
            logger.error('Failed to connect ElasticSearch client:\n%s', e)
            raise

    def __call__(self, doc: Union[dict, List[dict]]):
        """
        Index one document or a list of documents into `self.index_name`.

        Args:
            doc (`Union[dict, List[dict]]`): a single document, or a list of documents

        Returns:
            The value returned by `elasticsearch.helpers.bulk` for the request.

        Raises:
            AssertionError: if any document is not a dictionary.
        """
        # Normalize to a list so that list input is handled as well as a
        # single dictionary (previously `docs` was only bound for dict input).
        docs = [doc] if isinstance(doc, dict) else doc
        for x in docs:
            assert isinstance(x, dict)
        actions = [
            {
                '_op_type': 'index',
                '_index': self.index_name,
                '_source': x
            }
            for x in docs
        ]
        res = elasticsearch.helpers.bulk(self.client, actions)
        return res
Loading…
Cancel
Save