osschat-index
3 changed files with 188 additions and 1 deletion
@@ -1,2 +1,78 @@

# osschat-index

# ElasticSearch Index

*author: Jael*

<br />

## Description

The index operator indexes the given documents in ElasticSearch so that they are ready for retrieval.
It accepts either a single document (a dictionary) or a list of documents (dictionaries) as input.
For each document, a unique id is generated automatically.
To use this operator, you need to [set up ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html) in advance.
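
Before running the example below, you can optionally check that your deployment is reachable with the official Python client. This is only a sketch; the URL, credentials, and certificate path are placeholders for your own setup:

```python
from elasticsearch import Elasticsearch

# Placeholder connection details -- replace with your own deployment's
# address, credentials, and CA certificate path.
client = Elasticsearch(
    'https://localhost:9200',
    basic_auth=('elastic', '<your-password>'),
    ca_certs='/path/to/http_ca.crt',
)
print(client.ping())  # True if the cluster is reachable
```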

<br />

## Code Example

Insert an example document into an ElasticSearch instance running at localhost:9200, under the index 'test_index'.

```python
from datetime import datetime
from towhee import pipe, ops, DataCollection

example_doc = {
    'title': 'Test Title',
    'author': 'Towhee',
    'content': 'This is an example.',
    'timestamp': datetime.now()
}

es_insert = (
    pipe.input('index_name', 'doc')
        .map(('index_name', 'doc'), 'res', ops.elasticsearch.osschat_index(
            host='localhost', port=9200
        ))
        .output('doc', 'res')
)

res = es_insert('test_index', example_doc)  # OR: es_insert('test_index', [example_doc])
DataCollection(res).show()  # Optional: display output data
```

<br />

## Factory Constructor

Create the operator via the following factory method:

***elasticsearch.osschat_index(host='localhost', port=9200, user=None, password=None, ca_certs=None)***

**Parameters:**

***host***: *str*

The host used to connect to the ElasticSearch client.

***port***: *int*

The port used to connect to the ElasticSearch client.

***user***: *str*

The username used to connect to the ElasticSearch client if authentication is required, defaults to None.

***password***: *str*

The password used to connect to the ElasticSearch client if authentication is required, defaults to None.

***ca_certs***: *str*

The path to CA certificates used to connect to the ElasticSearch client if needed, defaults to None.
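
As a sketch of how these parameters fit together, the pipeline from the code example can point at a secured deployment like this (the username, password, and certificate path below are placeholders, not values from this repository):

```python
from towhee import pipe, ops

# Placeholder credentials and CA path -- substitute your own.
es_insert_secure = (
    pipe.input('index_name', 'doc')
        .map(('index_name', 'doc'), 'res', ops.elasticsearch.osschat_index(
            host='localhost', port=9200,
            user='elastic', password='<your-password>',
            ca_certs='/path/to/http_ca.crt'
        ))
        .output('doc', 'res')
)
```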

<br />

**Returns:**

A tuple as returned by `elasticsearch.helpers.bulk`: the number of successfully indexed documents and a list of errors (empty when every document is indexed successfully).
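
For example, the value can be unpacked after running the pipeline. This sketch assumes the pipeline result exposes a `.get()` accessor returning the output columns, as in recent Towhee releases:

```python
# 'doc' and 'res' are the two output columns declared by .output('doc', 'res').
doc, bulk_res = es_insert('test_index', example_doc).get()
num_indexed, errors = bulk_res
print(num_indexed, errors)  # e.g. 1, []
```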

@@ -0,0 +1,4 @@

from .es_index import ESIndex

def osschat_index(*args, **kwargs):
    return ESIndex(*args, **kwargs)
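
For reference, a small sketch of what this factory wiring amounts to when used outside a pipeline. The import path and credentials below are placeholders (in practice the class is loaded through the Towhee hub rather than imported directly):

```python
# Hypothetical direct instantiation of ESIndex for a quick smoke test,
# bypassing the Towhee hub. Credentials and CA path are placeholders.
from es_index import ESIndex

op = ESIndex(host='localhost', port=9200,
             user='elastic', password='<your-password>',
             ca_certs='/path/to/http_ca.crt')
op('test_index', {'title': 'Test Title', 'content': 'smoke test'})
```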

@@ -0,0 +1,107 @@

import logging
from typing import Union, List

from elasticsearch import Elasticsearch
import elasticsearch.helpers  # type: ignore

from towhee.operator import PyOperator, SharedType  # type: ignore


logger = logging.getLogger()


class ESIndex(PyOperator):
    """
    Use bulk to insert docs into an ElasticSearch index, with ids generated automatically.

    Args:
        host (`str`): host used to connect to the ElasticSearch client
        port (`int`): port used to connect to the ElasticSearch client
        user (`str`): username used to connect to the ElasticSearch client if needed, defaults to None
        password (`str`): password used to connect to the ElasticSearch client if needed, defaults to None
        ca_certs (`str`): path to CA certificates, defaults to None
    """
    def __init__(self, host: str, port: int, user: str = None, password: str = None, ca_certs: str = None):
        super().__init__()
        try:
            # Connect over HTTPS with optional basic auth and CA certificates.
            self.client = Elasticsearch(
                f'https://{host}:{port}',
                ca_certs=ca_certs,
                basic_auth=(user, password))
            logger.info('Successfully connected to ElasticSearch client.')
        except Exception as e:
            logger.error('Failed to connect to ElasticSearch client: %s', e)
            raise e

    def __call__(self, index_name: str, doc: Union[dict, List[dict]]):
        # If the index does not exist, create it with a stop-words analyzer to improve search accuracy.
        if not self.is_index_exist(index_name):
            logger.info(f'Index {index_name} does not exist, creating it with the stop-words analyzer.')
            self.create_index_with_stopwords(index_name)

        # Accept either a single document or a list of documents.
        if isinstance(doc, dict):
            docs = [doc]
        else:
            docs = doc

        for x in docs:
            assert isinstance(x, dict)

        # Build one 'index' action per document; ids are auto-generated by ElasticSearch.
        actions = [
            {
                '_op_type': 'index',
                '_index': index_name,
                '_source': d
            }
            for d in docs
        ]
        res = elasticsearch.helpers.bulk(self.client, actions, refresh=True)
        return res

    def is_index_exist(self, index_name: str):
        return self.client.indices.exists(index=index_name)

    def create_index_with_stopwords(self, index_name: str):
        # Text fields use a custom stop-words analyzer; a 'keyword' sub-field is kept for exact matching.
        mappings = {
            "properties": {
                "milvus_id": {
                    "type": "long"
                },
                "paragraph": {
                    "type": "text",
                    "analyzer": "my_stop_analyzer",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "path": {
                    "type": "text",
                    "analyzer": "my_stop_analyzer",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                }
            }
        }
        settings = {
            "analysis": {
                "analyzer": {
                    "my_stop_analyzer": {
                        "type": "stop",
                        "stopwords_path": "stopwords/stopwords-en.txt"
                    }
                }
            },
            "number_of_shards": 3,
            "number_of_replicas": 0
        }
        self.client.indices.create(index=index_name, mappings=mappings, settings=settings)
        logger.info(f"Created index {index_name}.")
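
As a side note on the stop-words analyzer configured above, here is a minimal sketch (not part of this change) of how one could confirm the analyzer is active on a freshly created index. It assumes `stopwords/stopwords-en.txt` exists in the ElasticSearch config directory, as required by `stopwords_path`, and uses placeholder connection details:

```python
from elasticsearch import Elasticsearch

# Placeholder connection details -- adjust to your deployment.
client = Elasticsearch(
    'https://localhost:9200',
    basic_auth=('elastic', '<your-password>'),
    ca_certs='/path/to/http_ca.crt',
)

# Run sample text through the index's custom analyzer and check that
# English stop words are stripped from the resulting tokens.
resp = client.indices.analyze(
    index='test_index',
    analyzer='my_stop_analyzer',
    text='this is an example of the stop analyzer',
)
print([t['token'] for t in resp['tokens']])
```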