logo
Browse Source

Update README

main
shiyu22 2 years ago
parent
commit
67378ab4cb
  1. 19
      README.md
  2. 63
      es_index.py

19
README.md

@ -1,14 +1,8 @@
# ElasticSearch Index
*author: Jael*
<br />
## Description
The index operator indexes the given documents in ElasticSearch to get ready for retrieval.
It accepts a single document as a dictionary or a list of documents (dictionaries) as input.
For each document, the index automatically generates a unique id.
The index operator indexes the given documents in ElasticSearch to get ready for retrieval. It accepts a single document as a dictionary or a list of documents (dictionaries) as input.
To use this operator, you need to [set up ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html) in advance.
<br />
@ -18,15 +12,11 @@ To use this operator, you need to [set up ElasticSearch](https://www.elastic.co/
Insert an example document into ElasticSearch with address of localhost:9200 and index of 'test_index'.
```python
from datetime import datetime
from towhee import pipe, ops, DataCollection
example_doc = {
'title': 'Test Title',
'author': 'Towhee',
'content': 'This is an example.',
'timestamp': datetime.now()
'sentence': 'This is an example.',
}
es_insert = (
@ -34,11 +24,12 @@ es_insert = (
.map(('index_name', 'doc'), 'res', ops.elasticsearch.osschat_index(
host='localhost', port=9200
))
.map('doc', 'doc', lambda x: str(x))
.output('doc', 'res')
)
res = es_insert('test_index', example_doc) # OR: es_insert('test_index', [example_doc])
DataCollection(res).show() # Optional: display output data
res = es_insert('test_index_5', example_doc)
DataCollection(res).show()
```
<br />

63
es_index.py

@ -25,9 +25,9 @@ class ESIndex(PyOperator):
super().__init__()
try:
self.client = Elasticsearch(
f'https://{host}:{port}',
f'http://{host}:{port}' if not ca_certs else f'https://{host}:{port}',
ca_certs=ca_certs,
basic_auth=(user, password))
basic_auth=(user, password) if user and password else None)
logger.info('Successfully connected to ElasticSearch client.')
except Exception as e:
logger.error('Failed to connect ElasticSearch client:\n', e)
@ -37,9 +37,7 @@ class ESIndex(PyOperator):
def __call__(self, index_name: str, doc: Union[dict, List[dict]]):
# if index not exist, create with stop words analyzer to strengthen the search accuracy
if not self.is_index_exist(index_name):
logger.info(f'index{index_name} not exists, will create the index with stopwords analyzer')
self.create_index_with_stopwords(index_name)
self.create_index(index_name)
if isinstance(doc, dict):
docs = [doc]
else:
@ -52,9 +50,9 @@ class ESIndex(PyOperator):
{
'_op_type': 'index',
'_index': index_name,
'_source': docs[i]
'_source': d
}
for i in range(len(docs))
for d in docs
]
res = elasticsearch.helpers.bulk(self.client, actions, refresh=True)
return res
@ -62,46 +60,25 @@ class ESIndex(PyOperator):
def is_index_exist(self, index_name: str) -> bool:
    """Return True if an index named *index_name* already exists in the
    connected ElasticSearch cluster, False otherwise.

    Delegates directly to the client's indices-exists API; used by
    ``__call__`` to decide whether the index must be created before
    bulk-inserting documents.
    """
    return self.client.indices.exists(index=index_name)
def create_index_with_stopwords(self, index_name: str):
def create_index(self, index_name: str):
settings = {
"analysis": {"analyzer": {"default": {"type": "standard"}}},
"similarity": {
"custom_bm25": {
"type": "BM25",
"k1": 2.0,
"b": 0.75,
}
},
}
mappings = {
"properties": {
"milvus_id": {
"type": "long"
},
"paragraph": {
"type": "text",
"analyzer": "my_stop_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"path": {
"sentence": {
"type": "text",
"analyzer": "my_stop_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
"similarity": "custom_bm25", # Use the custom BM25 similarity
}
}
}
settings = {
"analysis": {
"analyzer": {
"my_stop_analyzer": {
"type": "stop",
"stopwords_path": "stopwords/stopwords-en.txt"
}
}
},
"number_of_shards": 3,
"number_of_replicas": 0
}
self.client.indices.create(index=index_name, mappings=mappings, settings=settings)
logger.info(f"created index{index_name}")
# Create the index with the specified settings and mappings
self.client.indices.create(index=index_name, mappings=mappings, settings=settings)

Loading…
Cancel
Save