diff --git a/README.md b/README.md
index 77a5257..d827214 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,8 @@
# ElasticSearch Index
-*author: Jael*
-
-
-
## Description
-The index operator index the given documents in ElasticSearch to get ready for retrieval.
-It accepts a single document in dictionary or a list of documents (dictionaries) as input.
-For each document, the index automatically generates a unique id.
+The index operator index the given documents in ElasticSearch to get ready for retrieval. It accepts a single document in dictionary or a list of documents (dictionaries) as input.
To use this operator, you need to [set up ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html) in advance.
@@ -18,15 +12,11 @@ To use this operator, you need to [set up ElasticSearch](https://www.elastic.co/
Insert an example document into ElasticSearch with address of localhost:9200 and index of 'test_index'.
```python
-from datetime import datetime
from towhee import pipe, ops, DataCollection
example_doc = {
- 'title': 'Test Title',
- 'author': 'Towhee',
- 'content': 'This is an example.',
- 'timestamp': datetime.now()
+ 'sentence': 'This is an example.',
}
es_insert = (
@@ -34,11 +24,12 @@ es_insert = (
.map(('index_name', 'doc'), 'res', ops.elasticsearch.osschat_index(
host='localhost', port=9200
))
+ .map('doc', 'doc', lambda x: str(x))
.output('doc', 'res')
)
-res = es_insert('test_index', example_doc) # OR: es_insert('test_index', [example_doc])
-DataCollection(res).show() # Optional: display output data
+res = es_insert('test_index_5', example_doc)
+DataCollection(res).show()
```
diff --git a/es_index.py b/es_index.py
index c7128f4..a18731c 100644
--- a/es_index.py
+++ b/es_index.py
@@ -25,9 +25,9 @@ class ESIndex(PyOperator):
super().__init__()
try:
self.client = Elasticsearch(
- f'https://{host}:{port}',
+ f'http://{host}:{port}' if not ca_certs else f'https://{host}:{port}',
ca_certs=ca_certs,
- basic_auth=(user, password))
+ basic_auth=(user, password) if user and password else None)
logger.info('Successfully connected to ElasticSearch client.')
except Exception as e:
logger.error('Failed to connect ElasticSearch client:\n', e)
@@ -37,9 +37,7 @@ class ESIndex(PyOperator):
def __call__(self, index_name: str, doc: Union[dict, List[dict]]):
# if index not exist, create with stop words analyzer to strengthen the search accuracy
if not self.is_index_exist(index_name):
- logger.info(f'index{index_name} not exists, will create the index with stopwords analyzer')
- self.create_index_with_stopwords(index_name)
-
+ self.create_index(index_name)
if isinstance(doc, dict):
docs = [doc]
else:
@@ -52,9 +50,9 @@ class ESIndex(PyOperator):
{
'_op_type': 'index',
'_index': index_name,
- '_source': docs[i]
+ '_source': d
}
- for i in range(len(docs))
+ for d in docs
]
res = elasticsearch.helpers.bulk(self.client, actions, refresh=True)
return res
@@ -62,46 +60,25 @@ class ESIndex(PyOperator):
def is_index_exist(self, index_name: str):
return self.client.indices.exists(index=index_name)
- def create_index_with_stopwords(self, index_name: str):
+ def create_index(self, index_name: str):
+ settings = {
+ "analysis": {"analyzer": {"default": {"type": "standard"}}},
+ "similarity": {
+ "custom_bm25": {
+ "type": "BM25",
+ "k1": 2.0,
+ "b": 0.75,
+ }
+ },
+ }
mappings = {
"properties": {
- "milvus_id": {
- "type": "long"
- },
- "paragraph": {
- "type": "text",
- "analyzer": "my_stop_analyzer",
- "fields": {
- "keyword": {
- "type": "keyword",
- "ignore_above": 256
- }
- }
- },
- "path": {
+ "sentence": {
"type": "text",
- "analyzer": "my_stop_analyzer",
- "fields": {
- "keyword": {
- "type": "keyword",
- "ignore_above": 256
- }
- }
+ "similarity": "custom_bm25", # Use the custom BM25 similarity
}
}
}
- settings = {
- "analysis": {
- "analyzer": {
- "my_stop_analyzer": {
- "type": "stop",
- "stopwords_path": "stopwords/stopwords-en.txt"
- }
- }
- },
- "number_of_shards": 3,
- "number_of_replicas": 0
- }
- self.client.indices.create(index=index_name, mappings=mappings, settings=settings)
- logger.info(f"created index{index_name}")
+ # Create the index with the specified settings and mappings
+ self.client.indices.create(index=index_name, mappings=mappings, settings=settings)