Merge pull request #3028 from redpanda-data/llama-guard
ollama: add support for moderation model
rockwotj authored Nov 25, 2024
2 parents 9c5ce84 + e9e9bd9 commit 1c54126
Showing 9 changed files with 606 additions and 42 deletions.
11 changes: 11 additions & 0 deletions docs/modules/components/pages/processors/ollama_chat.adoc
@@ -44,6 +44,7 @@ ollama_chat:
  response_format: text
  max_tokens: 0 # No default (optional)
  temperature: 0 # No default (optional)
  save_prompt_metadata: false
  runner:
    context_size: 0 # No default (optional)
    batch_size: 0 # No default (optional)
@@ -74,6 +75,7 @@ ollama_chat:
  presence_penalty: 0 # No default (optional)
  frequency_penalty: 0 # No default (optional)
  stop: [] # No default (optional)
  save_prompt_metadata: false
  runner:
    context_size: 0 # No default (optional)
    batch_size: 0 # No default (optional)
@@ -282,6 +284,15 @@ Sets the stop sequences to use. When this pattern is encountered the LLM stops g
*Type*: `array`
=== `save_prompt_metadata`
If enabled, the prompt is saved as `@prompt` metadata on the output message. If `system_prompt` is used, it's also saved as `@system_prompt` metadata.
*Type*: `bool`
*Default*: `false`
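As a minimal sketch, the saved metadata could be read back in a later processor; the `mapping` step and model name here are illustrative:
```yml
# Illustrative: save the prompt, then reference it downstream
pipeline:
  processors:
    - ollama_chat:
        model: llama3.1 # illustrative model name
        prompt: "${!content().string()}"
        save_prompt_metadata: true
    - mapping: |
        root.answer = content().string()
        root.prompt = @prompt # metadata written by save_prompt_metadata
```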
=== `runner`
Options for the model runner that are used when the model is first loaded into memory.
257 changes: 257 additions & 0 deletions docs/modules/components/pages/processors/ollama_moderation.adoc
@@ -0,0 +1,257 @@
= ollama_moderation
:type: processor
:status: experimental
:categories: ["AI"]



////
THIS FILE IS AUTOGENERATED!

To make changes, edit the corresponding source file under:

https://github.com/redpanda-data/connect/tree/main/internal/impl/<provider>.

And:

https://github.com/redpanda-data/connect/tree/main/cmd/tools/docs_gen/templates/plugin.adoc.tmpl
////
// © 2024 Redpanda Data Inc.
component_type_dropdown::[]
Checks the safety of LLM responses, using the Ollama API.
Introduced in version 4.42.0.
[tabs]
======
Common::
+
--
```yml
# Common config fields, showing default values
label: ""
ollama_moderation:
  model: llama-guard3 # No default (required)
  prompt: "" # No default (required)
  response: "" # No default (required)
  runner:
    context_size: 0 # No default (optional)
    batch_size: 0 # No default (optional)
  server_address: http://127.0.0.1:11434 # No default (optional)
```
--
Advanced::
+
--
```yml
# All config fields, showing default values
label: ""
ollama_moderation:
  model: llama-guard3 # No default (required)
  prompt: "" # No default (required)
  response: "" # No default (required)
  runner:
    context_size: 0 # No default (optional)
    batch_size: 0 # No default (optional)
    gpu_layers: 0 # No default (optional)
    threads: 0 # No default (optional)
    use_mmap: false # No default (optional)
    use_mlock: false # No default (optional)
  server_address: http://127.0.0.1:11434 # No default (optional)
  cache_directory: /opt/cache/connect/ollama # No default (optional)
  download_url: "" # No default (optional)
```
--
======
This processor checks LLM response safety using either `llama-guard3` or `shieldgemma`. To check whether a given prompt is safe, use the `ollama_chat` processor; this processor classifies responses only.
By default, the processor starts and runs a locally installed Ollama server. Alternatively, to use an already running Ollama server, add your server details to the `server_address` field. You can https://ollama.com/download[download and install Ollama from the Ollama website^].
For more information, see the https://github.com/ollama/ollama/tree/main/docs[Ollama documentation^].
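For example, a minimal sketch of pointing the processor at an existing server (the address shown is illustrative):
```yml
# Illustrative: use an Ollama server that is already running
ollama_moderation:
  model: shieldgemma
  prompt: "${!@prompt}"
  response: "${!content().string()}"
  server_address: http://127.0.0.1:11434 # replace with your server's address
```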
== Examples
[tabs]
======
Use Llama Guard 3 to classify an LLM response::
+
--
This example uses Llama Guard 3 to check whether another model responded with safe or unsafe content.
```yaml
input:
  stdin:
    scanner:
      lines: {}
pipeline:
  processors:
    - ollama_chat:
        model: llava
        prompt: "${!content().string()}"
        save_prompt_metadata: true
    - ollama_moderation:
        model: llama-guard3
        prompt: "${!@prompt}"
        response: "${!content().string()}"
    - mapping: |
        root.response = content().string()
        root.is_safe = @safe
output:
  stdout:
    codec: lines
```
--
======
== Fields
=== `model`
The name of the Ollama LLM to use.
*Type*: `string`
|===
| Option | Summary
| `llama-guard3`
| When using llama-guard3, two pieces of metadata are added: `@safe` with a value of `yes` or `no`, and `@category` with the safety category of any violation. For more information, see the https://ollama.com/library/llama-guard3[Llama Guard 3 Model Card^].
| `shieldgemma`
| When using shieldgemma, the model adds a single piece of metadata, `@safe`, with a value of `yes` or `no` indicating whether the response violates its defined safety policies.
|===
```yml
# Examples
model: llama-guard3
model: shieldgemma
```
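As a sketch of how this metadata might be consumed downstream (assuming `llama-guard3`, which sets both `@safe` and `@category`; the output field names are illustrative):
```yml
# Illustrative mapping that acts on the moderation metadata
pipeline:
  processors:
    - ollama_moderation:
        model: llama-guard3
        prompt: "${!@prompt}"
        response: "${!content().string()}"
    - mapping: |
        root.response = content().string()
        root.is_safe = @safe == "yes"
        root.violation = @category.or("") # only present for unsafe responses
```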
=== `prompt`
The input prompt that was used with the LLM. If using `ollama_chat`, you can set `save_prompt_metadata` to save the prompt as metadata.
This field supports xref:configuration:interpolation.adoc#bloblang-queries[interpolation functions].
*Type*: `string`
=== `response`
The LLM's response to classify as containing safe or unsafe content.
This field supports xref:configuration:interpolation.adoc#bloblang-queries[interpolation functions].
*Type*: `string`
=== `runner`
Options for the model runner that are used when the model is first loaded into memory.
*Type*: `object`
=== `runner.context_size`
Sets the size of the context window used to generate the next token. Using a larger context window uses more memory and takes longer to process.
*Type*: `int`
=== `runner.batch_size`
The maximum number of requests to process in parallel.
*Type*: `int`
=== `runner.gpu_layers`
This option allows offloading some layers to the GPU for computation. This generally results in increased performance. By default, the runtime decides the number of layers dynamically.
*Type*: `int`
=== `runner.threads`
Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has. By default, the runtime decides the optimal number of threads.
*Type*: `int`
=== `runner.use_mmap`
Map the model into memory. This is only supported on Unix systems and allows loading only the necessary parts of the model as needed.
*Type*: `bool`
=== `runner.use_mlock`
Lock the model in memory, preventing it from being swapped out when memory-mapped. This option can improve performance but reduces some of the advantages of memory-mapping because it uses more RAM to run and can slow down load times as the model loads into RAM.
*Type*: `bool`
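As a sketch, a tuned runner block might look like the following; all values here are illustrative, not recommendations:
```yml
# Illustrative runner tuning
ollama_moderation:
  model: llama-guard3
  prompt: "${!@prompt}"
  response: "${!content().string()}"
  runner:
    context_size: 4096 # tokens of context per request
    batch_size: 32 # maximum parallel requests
    gpu_layers: 20 # offload some layers to the GPU
    threads: 8 # roughly the number of physical CPU cores
```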
=== `server_address`
The address of the Ollama server to use. Leave the field blank to have the processor start and run a local Ollama server, or specify the address of your own local or remote server.
*Type*: `string`
```yml
# Examples
server_address: http://127.0.0.1:11434
```
=== `cache_directory`
If `server_address` is not set, the directory to download the Ollama binary to and use as a model cache.
*Type*: `string`
```yml
# Examples
cache_directory: /opt/cache/connect/ollama
```
=== `download_url`
If `server_address` is not set, the URL to download the Ollama binary from. Defaults to the official Ollama GitHub release for this platform.
*Type*: `string`
2 changes: 1 addition & 1 deletion go.mod
@@ -374,7 +374,7 @@ require (
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/spaolacci/murmur3 v1.1.0 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/testcontainers/testcontainers-go v0.33.0 // indirect
github.com/testcontainers/testcontainers-go v0.33.0
github.com/tilinna/z85 v1.0.0 // indirect
github.com/tklauser/go-sysconf v0.3.13 // indirect
github.com/tklauser/numcpus v0.7.0 // indirect
4 changes: 2 additions & 2 deletions internal/asyncroutine/periodic_test.go
@@ -40,7 +40,7 @@ func TestWorks(t *testing.T) {
counter.Add(1)
})
p.Start()
require.Eventually(t, func() bool { return counter.Load() > 5 }, time.Second, time.Millisecond)
require.Eventually(t, func() bool { return counter.Load() > 5 }, time.Second, 2*time.Millisecond)
p.Stop()
snapshot := counter.Load()
time.Sleep(time.Millisecond * 250)
@@ -56,7 +56,7 @@ func TestWorksWithContext(t *testing.T) {
active.Store(false)
})
p.Start()
require.Eventually(t, func() bool { return active.Load() }, 10*time.Millisecond, time.Millisecond)
require.Eventually(t, func() bool { return active.Load() }, time.Second, 5*time.Millisecond)
p.Stop()
require.False(t, active.Load())
}
35 changes: 25 additions & 10 deletions internal/impl/ollama/chat_processor.go
@@ -25,16 +25,17 @@ const (
ocpFieldResponseFormat = "response_format"
ocpFieldImage = "image"
// Prediction options
ocpFieldMaxTokens = "max_tokens"
ocpFieldNumKeep = "num_keep"
ocpFieldSeed = "seed"
ocpFieldTopK = "top_k"
ocpFieldTopP = "top_p"
ocpFieldTemp = "temperature"
ocpFieldRepeatPenalty = "repeat_penalty"
ocpFieldPresencePenalty = "presence_penalty"
ocpFieldFrequencyPenalty = "frequency_penalty"
ocpFieldStop = "stop"
ocpFieldMaxTokens = "max_tokens"
ocpFieldNumKeep = "num_keep"
ocpFieldSeed = "seed"
ocpFieldTopK = "top_k"
ocpFieldTopP = "top_p"
ocpFieldTemp = "temperature"
ocpFieldRepeatPenalty = "repeat_penalty"
ocpFieldPresencePenalty = "presence_penalty"
ocpFieldFrequencyPenalty = "frequency_penalty"
ocpFieldStop = "stop"
ocpFieldEmitPromptMetadata = "save_prompt_metadata"
)

func init() {
@@ -121,6 +122,9 @@ For more information, see the https://github.com/ollama/ollama/tree/main/docs[Ol
Optional().
Advanced().
Description(`Sets the stop sequences to use. When this pattern is encountered the LLM stops generating text and returns the final response.`),
service.NewBoolField(ocpFieldEmitPromptMetadata).
Default(false).
Description(`If enabled, the prompt is saved as @prompt metadata on the output message. If system_prompt is used, it's also saved as @system_prompt metadata.`),
).Fields(commonFields()...).
Example(
"Use Llava to analyze an image",
@@ -180,6 +184,10 @@ func makeOllamaCompletionProcessor(conf *service.ParsedConfig, mgr *service.Reso
} else {
return nil, fmt.Errorf("invalid %s: %q", ocpFieldResponseFormat, format)
}
p.savePrompt, err = conf.FieldBool(ocpFieldEmitPromptMetadata)
if err != nil {
return nil, err
}
b, err := newBaseProcessor(conf, mgr)
if err != nil {
return nil, err
Expand All @@ -195,6 +203,7 @@ type ollamaCompletionProcessor struct {
userPrompt *service.InterpolatedString
systemPrompt *service.InterpolatedString
image *bloblang.Executor
savePrompt bool
}

func (o *ollamaCompletionProcessor) Process(ctx context.Context, msg *service.Message) (service.MessageBatch, error) {
@@ -227,6 +236,12 @@ func (o *ollamaCompletionProcessor) Process(ctx context.Context, msg *service.Me
}
m := msg.Copy()
m.SetBytes([]byte(g))
if o.savePrompt {
if sp != "" {
m.MetaSet("system_prompt", sp)
}
m.MetaSet("prompt", up)
}
return service.MessageBatch{m}, nil
}

