From 496485dedd9308fff08c39fe7a97afeee5a30a62 Mon Sep 17 00:00:00 2001
From: shiyu22 <shiyu.chen@zilliz.com>
Date: Wed, 14 Jun 2023 14:58:51 +0800
Subject: [PATCH] Add osschat-insert

---
 README.md         | 154 +++++++++++++++++++++++++++++++++++++++++++++-
 osschat_insert.py | 114 ++++++++++++++++++++++++++++++++++
 2 files changed, 267 insertions(+), 1 deletion(-)
 create mode 100644 osschat_insert.py
diff --git a/README.md b/README.md
index 60e870a..032f7c8 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,154 @@
-# osschat-insert
+# OSSChat Insert
 
+## Description
+
+**[OSSChat](https://osschat.io/)** is an enhanced ChatGPT that uses documentation, issues, blog posts, and community Q&A as knowledge bases. Built for every community and developer. The osschat-insert pipeline inserts documents into these knowledge bases.
+
+<br />
+
+
+
+## Code Example
+
+### **Create Milvus collection**
+
+Before running the pipeline, please [create Milvus collection](https://milvus.io/docs/v2.0.x/create_collection.md) first.
+
+> The `dim` is the dimensionality of the feature vector generated by the configured `embedding_model` in the `osschat-insert` pipeline.
+
+```python
+from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
+
+collection_name = 'osschat'
+dim = 384
+
+connections.connect(host='127.0.0.1', port='19530')
+
+fields = [
+   FieldSchema(name='id', dtype=DataType.INT64, description='ids', is_primary=True, auto_id=True),
+   FieldSchema(name='text_id', dtype=DataType.VARCHAR, description='text', max_length=500),
+   FieldSchema(name='text', dtype=DataType.VARCHAR, description='text', max_length=1000),
+   FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='embedding vectors', dim=dim)
+]
+schema = CollectionSchema(fields=fields, description='enhanced qa')
+collection = Collection(name=collection_name, schema=schema)
+
+index_params = {
+    'metric_type':"IP",
+    'index_type':"IVF_FLAT",
+    'params':{"nlist":2048}
+}
+collection.create_index(field_name="embedding", index_params=index_params)
+```
+
+### **Create pipeline and set the configuration**
+
+> You also need to start [Elasticsearch](https://www.elastic.co/elasticsearch/).
+>
+> For more parameters, refer to the Configuration section.
+
+```python
+from towhee import AutoPipes, AutoConfig
+
+config = AutoConfig.load_config('osschat-insert')
+config.embedding_model = 'all-MiniLM-L6-v2'
+config.milvus_host = '127.0.0.1'
+config.milvus_port = '19530'
+config.es_host = '127.0.0.1'
+config.es_port = '9200'
+
+p = AutoPipes.pipeline('osschat-insert', config=config)
+res = p('https://github.com/towhee-io/towhee/blob/main/README.md', 'osschat', 'osschat')
+```
+
+Then you can run `collection.flush()` and `collection.num_entities` to check the number of entities stored in Milvus as a knowledge base.
+
+<br />
+
+
+
+
+## Configuration 
+
+### **OSSChatInsertConfig**
+
+#### **Configuration for [Text Splitter](https://towhee.io/towhee/text-splitter):**
+
+***type***: str
+
+The type of splitter, defaults to 'RecursiveCharacter'. You can set this parameter in ['[RecursiveCharacter](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/recursive_text_splitter.html)', '[Markdown](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/markdown.html)', '[PythonCode](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/python.html)', '[Character](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/character_text_splitter.html#)', '[NLTK](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/nltk.html)', '[Spacy](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/spacy.html)', '[Tiktoken](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/tiktoken_splitter.html)', '[HuggingFace](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/huggingface_length_function.html)'].
+
+***chunk_size***: int
+The size of each chunk, defaults to 300.
+
+***splitter_kwargs***: dict
+
+The kwargs for the splitter, defaults to {}.
+
+#### **Configuration for Sentence Embedding:**
+
+***embedding_model***: str
+The model name for sentence embedding, defaults to `'all-MiniLM-L6-v2'`.
+You can refer to the [Model(s) list](https://towhee.io/tasks/detail/operator?field_name=Natural-Language-Processing&task_name=Sentence-Embedding) to set the model; some of these models are from [HuggingFace](https://huggingface.co/) (open source), and some are from [OpenAI](https://openai.com/) (not open, an API key is required).
+
+***openai_api_key***: str
+The API key for OpenAI, defaults to `None`.
+This key is required if the model is from OpenAI; you can check the model provider in the [Model(s) list](https://towhee.io/sentence-embedding/openai).
+
+***embedding_device***: int
+The device number, defaults to `-1`, which means using the CPU.
+If the setting is not `-1`, the specified GPU device will be used.
+
+#### **Configuration for [Milvus](https://towhee.io/ann-insert/osschat-milvus):**
+
+***milvus_host***: str
+Host of Milvus vector database, default is `'127.0.0.1'`.
+
+***milvus_port***: str
+Port of Milvus vector database, default is `'19530'`. 
+
+***milvus_user***: str
+The user name for [Cloud user](https://zilliz.com/cloud), defaults to `None`.
+
+***milvus_password***: str
+The user password for [Cloud user](https://zilliz.com/cloud), defaults to `None`.
+
+#### **Configuration for [Elasticsearch](https://towhee.io/elasticsearch/osschat-index):**
+
+***es_host***: str
+Host of Elasticsearch, default is `'127.0.0.1'`.
+
+***es_port***: str
+Port of Elasticsearch, default is `'9200'`.
+
+***es_user***: str
+The user name for Elasticsearch, defaults to `None`.
+
+***es_password***: str
+The user password for Elasticsearch, defaults to `None`.
+
+<br />
+
+
+
+## Interface
+
+Insert documentation into Milvus as a knowledge base.
+
+**Parameters:**
+
+***doc***: str
+
+Path or url of the document to be loaded.
+
+***milvus_collection***: str
+The collection name for Milvus vector database, is required when inserting data into Milvus.
+
+***es_index***: str
+The index name of elasticsearch.
+
+
+
+**Returns:** MutationResult
+
+A MutationResult after inserting into Milvus.
diff --git a/osschat_insert.py b/osschat_insert.py
new file mode 100644
index 0000000..e021242
--- /dev/null
+++ b/osschat_insert.py
@@ -0,0 +1,114 @@
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Optional, Any
+from pydantic import BaseModel
+from datetime import datetime
+
+from towhee import ops, pipe, AutoPipes, AutoConfig
+
+
+@AutoConfig.register
+class OSSChatInsertConfig(BaseModel):
+    """
+    Configuration of the osschat-insert pipeline; every field is optional and has a default.
+    """
+    # config for text_splitter: splitter type, chunk size and extra keyword arguments
+    type: Optional[str] = 'RecursiveCharacter'
+    chunk_size: Optional[int] = 300
+    splitter_kwargs: Optional[Dict[str, Any]] = {}
+    # config for sentence_embedding
+    embedding_model: Optional[str] = 'all-MiniLM-L6-v2'
+    openai_api_key: Optional[str] = None  # only passed to the OpenAI embedding operator
+    embedding_device: Optional[int] = -1  # -1 selects 'cpu'; any other value is used as the device id
+    # config for insert_milvus
+    milvus_host: Optional[str] = '127.0.0.1'
+    milvus_port: Optional[str] = '19530'
+    milvus_user: Optional[str] = None
+    milvus_password: Optional[str] = None
+    # config for elasticsearch
+    es_host: Optional[str] = '127.0.0.1'
+    es_port: Optional[str] = '9200'
+    es_user: Optional[str] = None
+    es_password: Optional[str] = None
+    es_ca_certs: Optional[str] = None  # forwarded as `ca_certs` to the Elasticsearch index operator
+
+
+_hf_models = ops.sentence_embedding.transformers().get_op().supported_model_names()  # names routed to the transformers operator
+_sbert_models = ops.sentence_embedding.sbert().get_op().supported_model_names()  # names routed to the sbert operator
+_openai_models = ['text-embedding-ada-002', 'text-similarity-davinci-001',  # names routed to the openai operator
+                  'text-similarity-curie-001', 'text-similarity-babbage-001',
+                  'text-similarity-ada-001']
+
+
+def _get_embedding_op(config):
+    if config.embedding_device == -1:
+        device = 'cpu'
+    else:
+        device = config.embedding_device
+
+    if config.embedding_model in _hf_models:
+        return True, ops.sentence_embedding.transformers(model_name=config.embedding_model, device=device)
+    if config.embedding_model in _sbert_models:
+        return True, ops.sentence_embedding.sbert(model_name=config.embedding_model, device=device)
+    if config.embedding_model in _openai_models:
+        return False, ops.sentence_embedding.openai(model_name=config.embedding_model, api_key=config.openai_api_key)
+    raise RuntimeError('Unknown model: [%s], only support: %s' % (config.embedding_model, _hf_models + _sbert_models + _openai_models))
+
+
+
+@AutoPipes.register
+def osschat_insert_pipe(config):
+    text_split_op = ops.text_splitter(type=config.type, 
+                                      chunk_size=config.chunk_size, 
+                                      **config.splitter_kwargs)
+    
+    es_example_input = lambda x: {
+        'title': x,
+        'author': 'OSSChat',
+        'content': x,
+        'timestamp': datetime.now()
+        }
+    es_index_op = ops.elasticsearch.osschat_index(host=config.es_host, 
+                                                  port=config.es_port,
+                                                  user=config.es_user,
+                                                  password=config.es_password,
+                                                  ca_certs=config.es_ca_certs,
+                                                  )
+    
+    allow_triton, sentence_embedding_op = _get_embedding_op(config)
+    sentence_embedding_config = {}
+    if allow_triton:
+        if config.embedding_device >= 0:
+            sentence_embedding_config = AutoConfig.TritonGPUConfig(device_ids=[config.embedding_device], max_batch_size=128)
+        else:
+            sentence_embedding_config = AutoConfig.TritonCPUConfig()
+
+    insert_milvus_op = ops.ann_insert.osschat_milvus(host=config.milvus_host,
+                                                     port=config.milvus_port,
+                                                     user=config.milvus_user,
+                                                     password=config.milvus_password,
+                                                     )
+    
+    return (
+        pipe.input('doc', 'milvus_collection', 'es_index')
+            .map('doc', 'text', ops.text_loader())
+            .flat_map('text', 'sentence', text_split_op)
+            .map('sentence', 'es_sentence', es_example_input)
+            .map(('es_index', 'es_sentence'), es_index_op)
+            .map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config)
+            .map('embedding', 'embedding', ops.towhee.np_normalize())
+            .map(('milvus_collection', 'doc', 'sentence', 'embedding'), 'mr', insert_milvus_op)
+            .output('mr', 'es_res')
+    )