From 496485dedd9308fff08c39fe7a97afeee5a30a62 Mon Sep 17 00:00:00 2001 From: shiyu22 Date: Wed, 14 Jun 2023 14:58:51 +0800 Subject: [PATCH] Add osschat-insert --- README.md | 154 +++++++++++++++++++++++++++++++++++++++++++++- osschat_insert.py | 114 ++++++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 osschat_insert.py diff --git a/README.md b/README.md index 60e870a..032f7c8 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,154 @@ -# osschat-insert +# OSSChat Insert +## Description + +**[OSSChat](https://osschat.io/)** is enhanced ChatGPT with documentation, issues, blog posts, community Q&A as knowledge bases. Built for every community and developer. The osschat-insert pipeline is a pipeline to insert data. + +
+ + + +## Code Example + +### **Create Milvus collection** + +Before running the pipeline, please [create a Milvus collection](https://milvus.io/docs/v2.0.x/create_collection.md) first. + +> The `dim` is the dimensionality of the feature vector generated by the configured `embedding_model` in the `osschat-insert` pipeline. + +```python +from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility + +collection_name = 'osschat' +dim = 384 + +connections.connect(host='127.0.0.1', port='19530') + +fields = [ + FieldSchema(name='id', dtype=DataType.INT64, description='ids', is_primary=True, auto_id=True), + FieldSchema(name='text_id', dtype=DataType.VARCHAR, description='text', max_length=500), + FieldSchema(name='text', dtype=DataType.VARCHAR, description='text', max_length=1000), + FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='embedding vectors', dim=dim) +] +schema = CollectionSchema(fields=fields, description='enhanced qa') +collection = Collection(name=collection_name, schema=schema) + +index_params = { + 'metric_type':"IP", + 'index_type':"IVF_FLAT", + 'params':{"nlist":2048} +} +collection.create_index(field_name="embedding", index_params=index_params) +``` + +### **Create pipeline and set the configuration** + +> You also need to start [Elasticsearch](https://www.elastic.co/elasticsearch/). +> +> For more parameters, refer to the Configuration section. + +```python +from towhee import AutoPipes, AutoConfig + +config = AutoConfig.load_config('osschat-insert') +config.embedding_model = 'all-MiniLM-L6-v2' +config.milvus_host = '127.0.0.1' +config.milvus_port = '19530' +config.es_host = '127.0.0.1' +config.es_port = '9200' + +p = AutoPipes.pipeline('osschat-insert', config=config) +# Inputs: document path or URL, Milvus collection name, Elasticsearch index name +res = p('https://github.com/towhee-io/towhee/blob/main/README.md', 'osschat', 'osschat') +``` + +Then you can run `collection.flush()` and `collection.num_entities` to check the number of entities stored in Milvus as a knowledge base. + +
+ + + + +## Configuration + +### **OSSChatInsertConfig** + +#### **Configuration for [Text Splitter](https://towhee.io/towhee/text-splitter):** + +***type***: str + +The type of splitter, defaults to 'RecursiveCharacter'. You can set this parameter to one of ['[RecursiveCharacter](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/recursive_text_splitter.html)', '[Markdown](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/markdown.html)', '[PythonCode](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/python.html)', '[Character](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/character_text_splitter.html#)', '[NLTK](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/nltk.html)', '[Spacy](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/spacy.html)', '[Tiktoken](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/tiktoken_splitter.html)', '[HuggingFace](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/huggingface_length_function.html)']. + +***chunk_size***: int +The size of each chunk, defaults to 300. + +***splitter_kwargs***: dict + +The kwargs for the splitter, defaults to {}. + +#### **Configuration for Sentence Embedding:** + +***embedding_model***: str +The model name for sentence embedding, defaults to `'all-MiniLM-L6-v2'`. +You can refer to the [Model(s) list](https://towhee.io/tasks/detail/operator?field_name=Natural-Language-Processing&task_name=Sentence-Embedding) to set the model; some of these models are from [HuggingFace](https://huggingface.co/) (open source), and some are from [OpenAI](https://openai.com/) (not open source, API key required). + +***openai_api_key***: str +The API key for OpenAI, defaults to `None`. 
+This key is required if the model is from OpenAI; you can check the model provider in the [Model(s) list](https://towhee.io/sentence-embedding/openai). + +***embedding_device***: int +The device number, defaults to `-1`, which means using the CPU. +If the setting is not `-1`, the specified GPU device will be used. + +#### **Configuration for [Milvus](https://towhee.io/ann-insert/osschat-milvus):** + +***milvus_host***: str +Host of Milvus vector database, default is `'127.0.0.1'`. + +***milvus_port***: str +Port of Milvus vector database, default is `'19530'`. + +***milvus_user***: str +The user name for [Cloud user](https://zilliz.com/cloud), defaults to `None`. + +***milvus_password***: str +The user password for [Cloud user](https://zilliz.com/cloud), defaults to `None`. + +#### **Configuration for [Elasticsearch](https://towhee.io/elasticsearch/osschat-index):** + +***es_host***: str +Host of Elasticsearch, default is `'127.0.0.1'`. + +***es_port***: str +Port of Elasticsearch, default is `'9200'`. + +***es_user***: str +The user name for Elasticsearch, defaults to `None`. + +***es_password***: str +The user password for Elasticsearch, defaults to `None`. + +***es_ca_certs***: str +The `ca_certs` value passed to the Elasticsearch operator, defaults to `None`. + +
+ + + +## Interface + +Insert documentation into Milvus as a knowledge base. + +**Parameters:** + + ***doc***: str + +Path or URL of the document to be loaded. + +***milvus_collection***: str +The collection name for the Milvus vector database; required when inserting data into Milvus. + +***es_index***: str +The index name of Elasticsearch. + + + +**Returns:** MutationResult + +A MutationResult after inserting into Milvus. diff --git a/osschat_insert.py b/osschat_insert.py new file mode 100644 index 0000000..e021242 --- /dev/null +++ b/osschat_insert.py @@ -0,0 +1,114 @@ +# Copyright 2021 Zilliz. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Dict, Optional, Any
from pydantic import BaseModel
from datetime import datetime

from towhee import ops, pipe, AutoPipes, AutoConfig


@AutoConfig.register
class OSSChatInsertConfig(BaseModel):
    """
    Config of the osschat-insert pipeline.

    Every field has a default; the fields are grouped below by the pipeline
    stage that reads them.
    """
    # config for text_splitter
    type: Optional[str] = 'RecursiveCharacter'
    chunk_size: Optional[int] = 300
    splitter_kwargs: Optional[Dict[str, Any]] = {}
    # config for sentence_embedding
    embedding_model: Optional[str] = 'all-MiniLM-L6-v2'
    openai_api_key: Optional[str] = None
    # -1 selects the CPU; any other value is passed on as the device
    embedding_device: Optional[int] = -1
    # config for insert_milvus
    milvus_host: Optional[str] = '127.0.0.1'
    milvus_port: Optional[str] = '19530'
    milvus_user: Optional[str] = None
    milvus_password: Optional[str] = None
    # config for elasticsearch
    es_host: Optional[str] = '127.0.0.1'
    es_port: Optional[str] = '9200'
    es_user: Optional[str] = None
    es_password: Optional[str] = None
    es_ca_certs: Optional[str] = None


# Model names accepted by each sentence-embedding operator. The first two lists
# are queried from the operators at import time; `_get_embedding_op` uses all
# three to dispatch on `config.embedding_model`.
_hf_models = ops.sentence_embedding.transformers().get_op().supported_model_names()
_sbert_models = ops.sentence_embedding.sbert().get_op().supported_model_names()
_openai_models = ['text-embedding-ada-002', 'text-similarity-davinci-001',
                  'text-similarity-curie-001', 'text-similarity-babbage-001',
                  'text-similarity-ada-001']


def _get_embedding_op(config):
    """
    Select the sentence-embedding operator for ``config.embedding_model``.

    Args:
        config: object exposing ``embedding_model``, ``embedding_device`` and
            ``openai_api_key`` (e.g. an ``OSSChatInsertConfig``).

    Returns:
        A ``(allow_triton, op)`` tuple. ``allow_triton`` is ``True`` for the
        transformers and sbert operators and ``False`` for the OpenAI operator.

    Raises:
        RuntimeError: if the model name is in none of the supported lists.
    """
    if config.embedding_device == -1:
        device = 'cpu'
    else:
        device = config.embedding_device

    # The lists are checked in order, so a name present in both the
    # transformers and sbert lists resolves to the transformers operator.
    if config.embedding_model in _hf_models:
        return True, ops.sentence_embedding.transformers(model_name=config.embedding_model, device=device)
    if config.embedding_model in _sbert_models:
        return True, ops.sentence_embedding.sbert(model_name=config.embedding_model, device=device)
    if config.embedding_model in _openai_models:
        return False, ops.sentence_embedding.openai(model_name=config.embedding_model, api_key=config.openai_api_key)
    raise RuntimeError('Unknown model: [%s], only support: %s' % (config.embedding_model, _hf_models + _sbert_models + _openai_models))


@AutoPipes.register
def osschat_insert_pipe(config):
    """
    Build the osschat-insert pipeline.

    The pipeline takes three inputs -- ``doc``, ``milvus_collection`` and
    ``es_index`` -- loads and splits the document, indexes every chunk with the
    Elasticsearch operator, embeds and normalizes every chunk, and inserts it
    with the Milvus operator.

    Args:
        config: an ``OSSChatInsertConfig`` (or an object with the same fields).

    Returns:
        The pipeline, whose outputs are ``mr`` (result of the Milvus insert
        operator) and ``es_res`` (result of the Elasticsearch index operator).

    Raises:
        RuntimeError: propagated from ``_get_embedding_op`` when
            ``config.embedding_model`` is not supported.
    """
    text_split_op = ops.text_splitter(type=config.type,
                                      chunk_size=config.chunk_size,
                                      **config.splitter_kwargs)

    # Wrap a text chunk into the document dict handed to the Elasticsearch
    # operator; the chunk is used as both title and content.
    es_example_input = lambda x: {
        'title': x,
        'author': 'OSSChat',
        'content': x,
        'timestamp': datetime.now()
    }
    es_index_op = ops.elasticsearch.osschat_index(host=config.es_host,
                                                  port=config.es_port,
                                                  user=config.es_user,
                                                  password=config.es_password,
                                                  ca_certs=config.es_ca_certs,
                                                  )

    allow_triton, sentence_embedding_op = _get_embedding_op(config)
    sentence_embedding_config = {}
    if allow_triton:
        if config.embedding_device >= 0:
            sentence_embedding_config = AutoConfig.TritonGPUConfig(device_ids=[config.embedding_device], max_batch_size=128)
        else:
            sentence_embedding_config = AutoConfig.TritonCPUConfig()

    insert_milvus_op = ops.ann_insert.osschat_milvus(host=config.milvus_host,
                                                     port=config.milvus_port,
                                                     user=config.milvus_user,
                                                     password=config.milvus_password,
                                                     )

    return (
        pipe.input('doc', 'milvus_collection', 'es_index')
        .map('doc', 'text', ops.text_loader())
        .flat_map('text', 'sentence', text_split_op)
        .map('sentence', 'es_sentence', es_example_input)
        # The Elasticsearch step must name its output column 'es_res':
        # output() below requests it, and map() takes an output schema
        # between the input schema and the operator.
        .map(('es_index', 'es_sentence'), 'es_res', es_index_op)
        .map('sentence', 'embedding', sentence_embedding_op, config=sentence_embedding_config)
        .map('embedding', 'embedding', ops.towhee.np_normalize())
        .map(('milvus_collection', 'doc', 'sentence', 'embedding'), 'mr', insert_milvus_op)
        .output('mr', 'es_res')
    )