Commit 20bec8e

docs: create doc page for llm ops
and include the LLMs in the use cases
muhammed-shihebi committed Mar 27, 2024
1 parent f9da97c commit 20bec8e
Showing 6 changed files with 235 additions and 93 deletions.
102 changes: 51 additions & 51 deletions docs/docusaurus.config.js
@@ -24,8 +24,8 @@ const config = {
sidebarPath: require.resolve('./sidebars.js'),
// Please change this to your repo.
editUrl: 'https://github.com/UKP-SQuARE/square-core/tree/master/docs',
path: 'home',
routeBasePath: 'home',
},
blog: {
showReadingTime: true,
@@ -38,42 +38,42 @@
},
}),
],
  [
    "redocusaurus",
    {
      specs: [
        {
          route: "/api/datastores/",
          spec: "https://square.ukp-lab.de/api/datastores/openapi.json",
        },
        {
route: "/api/skills/",
spec: "https://square.ukp-lab.de/api/skill-manager/openapi.json",
spec: "https://square.ukp-lab.de/api/skill-manager/openapi.json",
},
{
{
route: "/api/models-inference/",
spec: "https://square.ukp-lab.de/api/main/openapi.json",
spec: "https://square.ukp-lab.de/api/main/openapi.json",
},
{
{
route: "/api/models-management/",
spec: "https://square.ukp-lab.de/api/models/openapi.json",
spec: "https://square.ukp-lab.de/api/models/openapi.json",
},
],
},
],
],
},
],
],

themeConfig:
/** @type {import('@docusaurus/preset-classic').ThemeConfig} */
({
      announcementBar: {
        id: 'beta',
        content:
          'The <b>model management</b> service is currently in beta. The stable version will be released soon.',
        backgroundColor: '#fafbfc',
        textColor: '#091E42',
        isCloseable: true,
      },
navbar: {
title: 'UKP-SQuARE',
logo: {
@@ -87,35 +87,35 @@ const config = {
position: 'left',
label: 'Overview',
},
        {
type: 'doc',
docId: 'components/datastores',
position: 'left',
label: 'Components',
},
        {
          label: 'API',
          position: 'left',
          items: [
            {
              label: 'Datastores',
              to: '/api/datastores/',
            },
            {
              label: 'Models-Inference',
              to: '/api/models-inference/',
            },
            {
              label: 'Models-Management',
              to: '/api/models-management/',
            },
            {
              label: 'Skills',
              to: '/api/skills/',
            },
          ],
        },
        {
type: 'doc',
docId: 'versioning/changelog',
position: 'left',
@@ -148,7 +148,7 @@ const config = {
label: 'Twitter',
href: 'https://twitter.com/UKPLab',
},
        {
label: 'Linkedin',
href: 'https://www.linkedin.com/company/tu-darmstadt/',
},
133 changes: 133 additions & 0 deletions docs/home/components/llms.md
@@ -0,0 +1,133 @@
---
sidebar_position: 5
---

# LLM Ops

LLM Ops is a service that facilitates deploying LLMs on GPUs. It provides an API designed to support both chat and completion requests, accommodating streaming and non-streaming requests alike. The service is built on the [FastChat](https://github.com/lm-sys/FastChat) platform and uses the [vLLM](https://github.com/vllm-project/vllm) serving engine to run models on GPUs.

## Model Deployment

### Requirements
- Docker

### Setup


The service is dockerized, enabling straightforward deployment through a single Docker command. To start the service, navigate to the `llm-ops` directory and run:

```bash
docker-compose up -d
```
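
To confirm that the service came up correctly, a quick status and log check is usually enough (a minimal sketch; the service name `llm_chat` matches the example compose entry in the next section):

```bash
# List the compose services and their current state
docker-compose ps

# Follow the logs of the chat worker while the model loads
docker-compose logs -f llm_chat
```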

### Deploying a Model
Currently, deploying a new model requires adding it explicitly to the docker-compose file. The example below deploys the LLaMA-2 7B chat model.

```yaml
llm_chat:
build:
context: .
container_name: llm_chat
volumes:
- /home/hf_models:/root/.cache/huggingface # replace "/home/hf_models" with your own huggingface models directory
deploy:
resources:
reservations:
devices: # adjust this based on the specification of your machine
- driver: nvidia
count: 1
capabilities: [gpu]
entrypoint:
- /bin/bash
- ./start_chat.sh
command:
- --model-path
- ../root/.cache/huggingface/Llama-2-7b-chat # falcon-7b-instruct # Llama-2-7b-chat # vicuna-7b-v1.3
labels:
- "traefik.enable=true"
- "traefik.http.routers.llm-chat.rule=PathPrefix(`/api/Llama-2-7b-chat`)" # API path of you model. Adjust it base on the model you deploy
- "traefik.http.routers.llm-chat.entrypoints=websecure"
- "traefik.http.routers.llm-chat.tls=true"
- "traefik.http.routers.llm-chat.tls.certresolver=le"
- "traefik.http.routers.llm-chat.middlewares=llm-chat-stripprefix,llm-chat-addprefix"
- "traefik.http.middlewares.llm-chat-stripprefix.stripPrefixRegex.regex=/api/[a-zA-Z0-9_-]+"
- "traefik.http.middlewares.llm-chat-addprefix.addPrefix.prefix=/api"
```
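
After adding or editing a service entry like the one above, you can rebuild and start just that container (assuming the service name `llm_chat` from the example):

```bash
docker-compose up -d --build llm_chat
```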
### Supported Models
Currently, the following models are supported by default:
- LLaMA-2
- Vicuna v1.1
- Dolly V2
- Falcon 180B
- Falcon
- Mistral-instruct-v0.1

If you want to add support for a new model, consider the following:
- The model has to be supported by [vllm](https://github.com/vllm-project/vllm). See: [Supported Models — vLLM](https://docs.vllm.ai/en/latest/models/supported_models.html)
- If you want to support a chat model, you also have to add a new `conv_template` in `llm-ops/llm_ops/prompts/conversation.py`. For example, here is how to add the `conv_template` for the LLaMA-2 chat model:

```python
register_conv_template(
Conversation(
name="llama-2",
system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
roles=("[INST]", "[/INST]"),
sep_style=SeparatorStyle.LLAMA2,
sep=" ",
sep2=" </s><s>",
)
)
```
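
Once registered, a template is typically fetched and used to assemble the final prompt string. Below is a minimal sketch, assuming `llm-ops/llm_ops/prompts/conversation.py` mirrors FastChat's conversation API:

```python
# Build a LLaMA-2 prompt from the registered template (a sketch; assumes
# the module mirrors FastChat's conversation API).
from llm_ops.prompts.conversation import get_conv_template

conv = get_conv_template("llama-2")       # fetch a copy of the registered template
conv.set_system_message("You are a helpful assistant.")
conv.append_message(conv.roles[0], "Tell me a short funny joke.")  # user turn
conv.append_message(conv.roles[1], None)  # leave the model's turn open for generation
prompt = conv.get_prompt()                # final string passed to the serving engine
```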


## API
After starting the service with your model deployed, you can make non-streaming or streaming requests. The following examples show how to query the deployed Llama-2-7b-chat model.

### Non-Streaming Request
```bash
curl -k -X POST https://localhost:8443/api/Llama-2-7b-chat/worker_generate \
-H "Content-Type: application/json" \
-d '{
"model_identifier": "Llama-2-7b-chat",
"messages": [
{
"role": "user",
"text": "Hellow!"
},
{
"role": "ai",
"text": "Hey! How can I help you today?"
},
{
"role": "user",
"text": "Tell me a short funny joke."
}
],
"system_message": "The following is a friendly conversation between a human and an AI.",
"temperature": 0.7,
"top_p": 0.9,
"echo": false,
"generation_mode": "chat"
}'
```
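
For programmatic access, the same request can be issued from Python. This is a sketch based on the curl example above; `verify=False` mirrors curl's `-k` flag for the self-signed certificate:

```python
import requests

payload = {
    "model_identifier": "Llama-2-7b-chat",
    "messages": [
        {"role": "user", "text": "Hello!"},
        {"role": "ai", "text": "Hey! How can I help you today?"},
        {"role": "user", "text": "Tell me a short funny joke."},
    ],
    "system_message": "The following is a friendly conversation between a human and an AI.",
    "temperature": 0.7,
    "top_p": 0.9,
    "echo": False,
    "generation_mode": "chat",
}

response = requests.post(
    "https://localhost:8443/api/Llama-2-7b-chat/worker_generate",
    json=payload,
    verify=False,  # self-signed certificate, as with curl -k
)
print(response.json())
```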
`generation_mode` can be either `chat` or `completion`, depending on the type of request. For a completion request, set `generation_mode` to `completion` and provide a string `prompt` instead of `messages`.


### Streaming Request
The streaming request is very similar to the non-streaming one, except that it uses the endpoint `/api/Llama-2-7b-chat/worker_generate_stream` instead of `/api/Llama-2-7b-chat/worker_generate`. The example below also replaces the `messages` field with a `prompt` field to demonstrate a completion request.
```bash
curl -k -X POST https://localhost:8443/api/Llama-2-7b-chat/worker_generate_stream \
-H "Content-Type: application/json" \
-d '{
"model_identifier": "Llama-2-7b-chat",
"prompt": "Hellow! Can you tell me a joke?",
"system_message": "The following is a friendly conversation between a human and an AI.",
"temperature": 0.7,
"top_p": 0.9,
"echo": false,
"generation_mode": "completion"
}'
```
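
The streamed response can also be consumed from Python. The sketch below assumes the FastChat convention of JSON chunks delimited by null bytes; adjust the parsing if your deployment streams a different format:

```python
import json
import requests

payload = {
    "model_identifier": "Llama-2-7b-chat",
    "prompt": "Hello! Can you tell me a joke?",
    "generation_mode": "completion",
}

with requests.post(
    "https://localhost:8443/api/Llama-2-7b-chat/worker_generate_stream",
    json=payload,
    stream=True,
    verify=False,  # self-signed certificate, as with curl -k
) as response:
    # Each null-byte-delimited chunk is a JSON object (FastChat convention).
    for chunk in response.iter_lines(delimiter=b"\0"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            print(data.get("text", ""))  # text generated so far
```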

Note that in both non-streaming and streaming requests, you must provide `model_identifier`, either `prompt` or `messages`, and `generation_mode`; all other fields are optional. The `echo` field determines whether the response includes the initial prompt/messages.
7 changes: 7 additions & 0 deletions docs/home/overview/use_cases.md
@@ -45,4 +45,11 @@ Combining Skills can make systems more modular and multi-domain. [Puerto et al.,



## 9. Analyzing LLMs
Recent studies have demonstrated that contemporary Large Language Models (LLMs) are highly sensitive to variations in their prompts. To enable a systematic examination of model behavior under diverse prompting conditions and generation parameters (e.g., temperature or system prompts), we have developed an interactive chat interface that lets users engage with LLMs much as they would in popular chat applications. Furthermore, we can deploy locally hosted LLMs, giving users unrestricted access to these models.

Beyond standard chat functionality, which allows direct interaction between a user and an LLM, we have introduced a mode in which the model acts as an Agent to address user queries. As in normal chat, users submit their queries; the model, informed by a predefined set of tools specified in the system prompt, then identifies and applies the appropriate tools to resolve them. These tools are versatile functions that process textual input and generate textual output; a sketch of one is shown below. For instance, a calculator tool might accept a mathematical expression in textual form, execute the calculation, and return the result. The system is highly adaptable: beyond the internal tools, users can integrate external functionality via AWS Lambda functions, and the model can perform complex operations by applying multiple tools in sequence based on intermediate results.
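
To make the tool abstraction concrete, here is a minimal sketch of a calculator tool as a plain text-in/text-out function; the name and signature are illustrative, not the platform's actual tool interface:

```python
import ast
import operator

def calculator(expression: str) -> str:
    """Evaluate a basic arithmetic expression given as text; return the result as text."""
    ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
    }

    def evaluate(node):
        if isinstance(node, ast.BinOp) and type(node.op) in ops:
            return ops[type(node.op)](evaluate(node.left), evaluate(node.right))
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        raise ValueError("unsupported expression")

    return str(evaluate(ast.parse(expression, mode="eval").body))

print(calculator("3 * (4 + 5)"))  # -> "27"
```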

Moreover, following the research of [Lu et al. (2023)](https://arxiv.org/abs/2203.00001), our platform includes a sensitivity-testing interface. Users input sentences and generate modified versions of them; the altered sentences, along with selected examples, are then scored by a language model for acceptability. Through this process, users can assess the sensitivity of LLMs to prompt variations, gaining insight into their behavior.


22 changes: 13 additions & 9 deletions docs/sidebars.js
@@ -23,14 +23,14 @@ const sidebars = {
type: 'category',
label: 'Overview',
items: ['overview/introduction',
        'overview/get_started',
        'overview/use_cases',
        'overview/tutorials',
        'overview/architecture',
        'overview/local_installation',
        'overview/faq'],
},
    {
type: 'category',
label: 'Components',
items: [
@@ -42,14 +42,18 @@ const sidebars = {
type: 'doc',
id: 'components/models',
},
      {
type: 'doc',
id: 'components/skills',
},
      {
type: 'doc',
id: 'components/explainability',
},
{
type: 'doc',
        id: 'components/llms',
      },
],
},
],
6 changes: 3 additions & 3 deletions llm-ops/api.http
@@ -56,12 +56,12 @@ Content-Type: application/json
"messages": [
{
"role": "user",
"text": "yo!"
"text": "yo!"
},
{
"role": "ai",
"text": "Hey! How can I help you today?"
},
{
"role": "user",
"text": "I want you to tell me a short funny joke."
@@ -70,7 +70,7 @@ Content-Type: application/json
"system_message": "The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know. You only need to complete the ai's sentences. \n",
"temperature": 0.7,
"top_p": 0.9,
"echo": false, // Returns the initial messages in the response
"echo": false,
"generation_mode": "chat"
}
