diff --git a/README.md b/README.md
index 3dba710..da6af99 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,140 @@
-# Llama-2
+# Llama-2 Chat
+
+*author: Jael*
+
+
+
+## Description
+
+An LLM operator generates an answer from the prompt in the given messages using a large language model or service.
+This operator uses a pretrained [Llama-2](https://ai.meta.com/llama) model to generate responses.
+By default, it downloads the model file from [HuggingFace](https://huggingface.co/TheBloke)
+and then runs the model with [llama.cpp](https://github.com/ggerganov/llama.cpp).
+
+This operator automatically installs llama-cpp-python and runs the model with it.
+If the automatic installation fails in your environment, please refer to [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) for manual installation instructions.
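+
+Under the hood, the operator downloads a GGML model file from the HuggingFace Hub and runs it with
+llama-cpp-python. The snippet below is a minimal sketch of that flow using the default
+`llama-2-7b-chat` entry from `supported_model_names()`; it is only an illustration, not part of the
+operator's interface.
+
+```python
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+
+# Fetch the GGML weights listed for the default model.
+model_path = hf_hub_download(repo_id='TheBloke/Llama-2-7B-GGML',
+                             filename='llama-2-7b.ggmlv3.q4_0.bin')
+
+# Load the model and run a single completion.
+llm = Llama(model_path=model_path)
+resp = llm('[INST] Who are you? [/INST]', max_tokens=64)
+print(resp['choices'][0]['text'])
+```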
+
+
+
+## Code Example
+
+Use the default model to continue the conversation from the given messages.
+
+*Write a pipeline with explicit input/output name specifications:*
+
+```python
+from towhee import pipe, ops
+
+p = (
+    pipe.input('question', 'docs', 'history')
+    .map(('question', 'docs', 'history'), 'prompt', ops.prompt.question_answer())
+    .map('prompt', 'answer', ops.LLM.Llama_2('llama-2-7b-chat'))
+    .output('answer')
+)
+
+history=[('Who won the world series in 2020?', 'The Los Angeles Dodgers won the World Series in 2020.')]
+question = 'Where was it played?'
+answer = p(question, [], history).get()[0]
+```
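+
+Here `p(question, [], history)` passes an empty document list together with the chat history, and
+`.get()[0]` extracts the generated answer string from the pipeline output.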
+
+*Write a [retrieval-augmented generation pipeline](https://towhee.io/tasks/detail/pipeline/retrieval-augmented-generation) with explicit input/output name specifications:*
+
+```python
+from towhee import pipe, ops
+
+
+temp = '''Use the following pieces of context to answer the question at the end.
+
+{context}
+
+Question: {question}
+'''
+
+system_msg = 'Your name is TowheeChat.'
+
+q1 = 'Who are you?'
+q2 = 'What is Towhee?'
+
+p = (
+    pipe.input('question', 'docs', 'history')
+    .map(('question', 'docs', 'history'),
+         'prompt',
+         ops.prompt.template(temp, ['question', 'context'], system_msg))
+    .map('prompt', 'answer',
+         ops.LLM.Llama_2(max_tokens=200))
+    .output('answer')
+)
+
+history = []
+docs = []
+ans1 = p(q1, docs, history).get()[0]
+print(q1, ans1)
+
+history.append((q1, ans1))
+docs.append('Towhee is a cutting-edge framework designed to streamline the processing of unstructured data through the use of Large Language Model (LLM) based pipeline orchestration.')
+ans2 = p(q2, docs, history).get()[0]
+
+print(q2, ans2)
+```
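+
+In this pipeline, `ops.prompt.template` fills `{context}` with the retrieved `docs` and `{question}`
+with the current question, and the accumulated chat `history` is passed along so follow-up questions
+keep their conversational context.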
+
+
+
+## Factory Constructor
+
+Create the operator via the following factory method:
+
+***LLM.Llama_2(model_name_or_file: str, \*\*kwargs)***
+
+**Parameters:**
+
+***model_name_or_file***: *str*
+
+The model name or the path to a local model file, defaults to 'llama-2-7b-chat'.
+If the model name is in `supported_model_names`, the corresponding model file will be downloaded from the HuggingFace Hub.
+You can also pass the local path of a model file that can be run by llama-cpp-python.
+
+***\*\*kwargs***
+
+Other model parameters such as `temperature` and `max_tokens`, which are passed through to the llama-cpp-python call.
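+
+For example, the extra keyword arguments are forwarded to the llama-cpp-python call, so generation can
+be tuned when the operator is created. A minimal sketch (the parameter values are arbitrary):
+
+```python
+from towhee import pipe, ops
+
+p = (
+    pipe.input('prompt')
+    .map('prompt', 'answer', ops.LLM.Llama_2('llama-2-7b-chat', temperature=0.8, max_tokens=200))
+    .output('answer')
+)
+```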
+
+
+
+## Interface
+
+The operator takes a list of chat messages as input and returns the generated answer as a string.
+
+***\_\_call\_\_(messages)***
+
+**Parameters:**
+
+***messages***: *list*
+
+A list of messages that sets up the chat.
+It must be a list of dictionaries whose keys are from "system", "question", "answer". For example, `[{"question": "a past question?", "answer": "a past answer."}, {"question": "current question?"}]`.
+
+**Returns**:
+
+*answer: str*
+
+The generated answer.
+
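+For example, the messages below carry a system message, one past question/answer pair, and the
+current question. This is a minimal sketch of feeding such a list straight into the operator through
+a pipeline:
+
+```python
+from towhee import pipe, ops
+
+p = (
+    pipe.input('messages')
+    .map('messages', 'answer', ops.LLM.Llama_2())
+    .output('answer')
+)
+
+messages = [
+    {'system': 'Your name is TowheeChat.'},
+    {'question': 'Who won the world series in 2020?',
+     'answer': 'The Los Angeles Dodgers won the World Series in 2020.'},
+    {'question': 'Where was it played?'}
+]
+answer = p(messages).get()[0]
+```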
+
+
+***supported_model_names()***
+
+**Returns**:
+
+A dictionary of supported models, with the model name as key and the HuggingFace Hub id and model filename as value.
+
+    {
+        'llama-2-7b-chat': {
+            'hf_id': 'TheBloke/Llama-2-7B-GGML',
+            'filename': 'llama-2-7b.ggmlv3.q4_0.bin'
+        },
+        'llama-2-13b-chat': {
+            'hf_id': 'TheBloke/Llama-2-13B-GGML',
+            'filename': 'llama-2-13b-chat.ggmlv3.q4_0.bin'
+        }
+    }
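+
+For example, if this operator's source is checked out locally as `llama2.py` (a hedged sketch under
+that assumption), the supported model names can be listed directly via the static method:
+
+```python
+from llama2 import LlamaCpp
+
+print(list(LlamaCpp.supported_model_names().keys()))
+```
+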
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..38e9485
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,5 @@
+from .llama2 import LlamaCpp
+
+
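+# Factory function called by Towhee to construct the operator.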
+def llama_2(*args, **kwargs):
+ return LlamaCpp(*args, **kwargs)
diff --git a/llama2.py b/llama2.py
new file mode 100644
index 0000000..86db3a0
--- /dev/null
+++ b/llama2.py
@@ -0,0 +1,90 @@
+# Copyright 2021 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import List
+
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+
+from towhee.operator.base import PyOperator, SharedType
+
+
+class LlamaCpp(PyOperator):
+    '''Wrapper of Llama-2 inference with llama.cpp.'''
+    def __init__(self,
+                 model_name_or_file: str = 'llama-2-7b-chat',
+                 **kwargs
+                 ):
+        super().__init__()
+        self.kwargs = kwargs
+        supported_model_names = self.supported_model_names()
+
+        if model_name_or_file in supported_model_names:
+            # A known model name: download the corresponding GGML file from the HuggingFace Hub.
+            model_info = supported_model_names[model_name_or_file]
+            hf_id = model_info['hf_id']
+            model_filename = model_info['filename']
+            self.model_path = hf_hub_download(repo_id=hf_id, filename=model_filename)
+        else:
+            # Otherwise treat the argument as a local path to a model file.
+            self.model_path = model_name_or_file
+            assert os.path.isfile(self.model_path), f'Invalid model path: {self.model_path}'
+
+        self.model = Llama(model_path=self.model_path)
+
+    def __call__(self, messages: List[dict]):
+        prompt = self.parse_inputs(messages)
+        resp = self.model(prompt, **self.kwargs)
+        answer = self.parse_outputs(resp)
+        return answer
+
+    def parse_inputs(self, messages: List[dict]):
+        '''Convert a list of message dicts into a Llama-2 chat prompt string.'''
+        assert isinstance(messages, list), \
+            'Inputs must be a list of dictionaries with keys from ["system", "question", "answer"].'
+        # The last message must contain only the current question.
+        question = messages[-1]
+        assert len(question) == 1 and 'question' in question, \
+            'The last message must be a dictionary with a single "question" key.'
+        question = question['question']
+        prompt = ''
+        for m in messages[:-1]:
+            for k, v in m.items():
+                if k == 'system':
+                    prompt += f'[INST] <<SYS>> {v} <</SYS>> [/INST]\n'
+                elif k == 'question':
+                    prompt += f'[INST] {v} [/INST]\n'
+                elif k == 'answer':
+                    prompt += f'{v}\n'
+                else:
+                    raise KeyError(f'Invalid key of message: {k}')
+        # Append the current question as the final instruction block.
+        prompt = prompt + f'[INST] {question} [/INST]'
+        return prompt
+
+    def parse_outputs(self, response):
+        # llama-cpp-python returns a completion dict; keep only the generated text.
+        return response['choices'][0]['text']
+
+    @staticmethod
+    def supported_model_names():
+        models = {
+            'llama-2-7b-chat': {
+                'hf_id': 'TheBloke/Llama-2-7B-GGML',
+                'filename': 'llama-2-7b.ggmlv3.q4_0.bin'
+            },
+            'llama-2-13b-chat': {
+                'hf_id': 'TheBloke/Llama-2-13B-GGML',
+                'filename': 'llama-2-13b-chat.ggmlv3.q4_0.bin'
+            }
+        }
+        return models
+
+    @property
+    def shared_type(self):
+        return SharedType.Shareable
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..de714be
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+llama-cpp-python
+huggingface-hub