diff --git a/README.md b/README.md index 77a5257..d827214 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,8 @@ # ElasticSearch Index -*author: Jael* - -
- ## Description -The index operator index the given documents in ElasticSearch to get ready for retrieval. -It accepts a single document in dictionary or a list of documents (dictionaries) as input. -For each document, the index automatically generates a unique id. +The index operator index the given documents in ElasticSearch to get ready for retrieval. It accepts a single document in dictionary or a list of documents (dictionaries) as input. To use this operator, you need to [set up ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html) in advance.
@@ -18,15 +12,11 @@ To use this operator, you need to [set up ElasticSearch](https://www.elastic.co/ Insert an example document into ElasticSearch with address of localhost:9200 and index of 'test_index'. ```python -from datetime import datetime from towhee import pipe, ops, DataCollection example_doc = { - 'title': 'Test Title', - 'author': 'Towhee', - 'content': 'This is an example.', - 'timestamp': datetime.now() + 'sentence': 'This is an example.', } es_insert = ( @@ -34,11 +24,12 @@ es_insert = ( .map(('index_name', 'doc'), 'res', ops.elasticsearch.osschat_index( host='localhost', port=9200 )) + .map('doc', 'doc', lambda x: str(x)) .output('doc', 'res') ) -res = es_insert('test_index', example_doc) # OR: es_insert('test_index', [example_doc]) -DataCollection(res).show() # Optional: display output data +res = es_insert('test_index_5', example_doc) +DataCollection(res).show() ```
diff --git a/es_index.py b/es_index.py index c7128f4..a18731c 100644 --- a/es_index.py +++ b/es_index.py @@ -25,9 +25,9 @@ class ESIndex(PyOperator): super().__init__() try: self.client = Elasticsearch( - f'https://{host}:{port}', + f'http://{host}:{port}' if not ca_certs else f'https://{host}:{port}', ca_certs=ca_certs, - basic_auth=(user, password)) + basic_auth=(user, password) if user and password else None) logger.info('Successfully connected to ElasticSearch client.') except Exception as e: logger.error('Failed to connect ElasticSearch client:\n', e) @@ -37,9 +37,7 @@ class ESIndex(PyOperator): def __call__(self, index_name: str, doc: Union[dict, List[dict]]): # if index not exist, create with stop words analyzer to strengthen the search accuracy if not self.is_index_exist(index_name): - logger.info(f'index{index_name} not exists, will create the index with stopwords analyzer') - self.create_index_with_stopwords(index_name) - + self.create_index(index_name) if isinstance(doc, dict): docs = [doc] else: @@ -52,9 +50,9 @@ class ESIndex(PyOperator): { '_op_type': 'index', '_index': index_name, - '_source': docs[i] + '_source': d } - for i in range(len(docs)) + for d in docs ] res = elasticsearch.helpers.bulk(self.client, actions, refresh=True) return res @@ -62,46 +60,25 @@ class ESIndex(PyOperator): def is_index_exist(self, index_name: str): return self.client.indices.exists(index=index_name) - def create_index_with_stopwords(self, index_name: str): + def create_index(self, index_name: str): + settings = { + "analysis": {"analyzer": {"default": {"type": "standard"}}}, + "similarity": { + "custom_bm25": { + "type": "BM25", + "k1": 2.0, + "b": 0.75, + } + }, + } mappings = { "properties": { - "milvus_id": { - "type": "long" - }, - "paragraph": { - "type": "text", - "analyzer": "my_stop_analyzer", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } - }, - "path": { + "sentence": { "type": "text", - "analyzer": "my_stop_analyzer", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - } - } + "similarity": "custom_bm25", # Use the custom BM25 similarity } } } - settings = { - "analysis": { - "analyzer": { - "my_stop_analyzer": { - "type": "stop", - "stopwords_path": "stopwords/stopwords-en.txt" - } - } - }, - "number_of_shards": 3, - "number_of_replicas": 0 - } - self.client.indices.create(index=index_name, mappings=mappings, settings=settings) - logger.info(f"created index{index_name}") + # Create the index with the specified settings and mappings + self.client.indices.create(index=index_name, mappings=mappings, settings=settings)