From: mkroemer <redacted>
Date: Fri, 9 Jan 2026 19:58:56 +0000 (+0100)
Subject: fix: Add support for multimodal content (text + images)
X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=d0f18d93f8bd55fed7a2f73cb16d5b4eff28476c;p=sgasser-llm-shield.git

fix: Add support for multimodal content (text + images)

Fixes crashes when processing OpenAI Chat Completion requests with
multimodal content (array format). Previously, the code assumed that message
content was always a string, causing Presidio errors and 502/503 responses
when LibreChat Agents sent vision requests.

Changes:
- Add extractTextContent() utility to safely extract text from both
  string and array content formats
- Update PII detection to handle multimodal messages
- Update secrets detection to extract text from array content
- Update message redaction to preserve images while redacting text
- Fix dashboard display to show readable content instead of [object Object]

This enables full support for OpenAI's multimodal API format while
maintaining PII/secrets protection on text portions.

Resolves issues with the LibreChat Agents feature returning 502 errors.
---

diff --git a/src/routes/proxy.ts b/src/routes/proxy.ts
index cd948a9..d683405 100644
--- a/src/routes/proxy.ts
+++ b/src/routes/proxy.ts
@@ -18,6 +18,7 @@ import type {
   ChatMessage,
   LLMResult,
 } from "../services/llm-client";
+import { extractTextContent, type ContentPart } from "../utils/content";
 import { logRequest, type RequestLogData } from "../services/logger";
 import { unmaskResponse } from "../services/masking";
 import { createUnmaskingStream } from "../services/stream-transformer";
@@ -28,9 +29,9 @@ const ChatCompletionSchema = z
     messages: z
       .array(
         z.object({
-          role: z.enum(["system", "user", "assistant"]),
-          content: z.string(),
-        }),
+          role: z.enum(["system", "user", "assistant", "tool"]),
+          content: z.union([z.string(), z.array(z.any()), z.null()]).optional(),
+        }).passthrough(), // Allow additional fields like name, tool_calls, etc.
       )
       .min(1, "At least one message is required"),
   })
@@ -185,7 +186,8 @@ function redactMessagesWithSecrets(
   const messagePositions: { start: number; end: number }[] = [];
 
   for (const msg of messages) {
-    const length = typeof msg.content === "string" ? msg.content.length : 0;
+    const text = extractTextContent(msg.content);
+    const length = text.length;
     messagePositions.push({ start: currentOffset, end: currentOffset + length });
     currentOffset += length + 1; // +1 for \n separator
   }
@@ -199,7 +201,48 @@ function redactMessagesWithSecrets(
 
   // Apply redactions to each message
   const redactedMessages = messages.map((msg, i) => {
-    if (typeof msg.content !== "string" || !msg.content) {
+    // Handle null/undefined content
+    if (!msg.content) {
+      return msg;
+    }
+
+    // Handle array content (multimodal messages)
+    if (Array.isArray(msg.content)) {
+      const msgPos = messagePositions[i];
+
+      // Filter redactions for this message
+      const messageRedactions = (secretsResult.redactions || [])
+        .filter((r) => r.start >= msgPos.start && r.end <= msgPos.end)
+        .map((r) => ({
+          ...r,
+          start: r.start - msgPos.start,
+          end: r.end - msgPos.start,
+        }));
+
+      if (messageRedactions.length === 0) {
+        return msg;
+      }
+
+      // Redact only text parts of array content
+      const redactedContent = msg.content.map((part: ContentPart) => {
+        if (part.type === "text" && typeof part.text === "string") {
+          const { redacted, context: updatedContext } = redactSecrets(
+            part.text,
+            messageRedactions,
+            config,
+            context,
+          );
+          context = updatedContext;
+          return { ...part, text: redacted };
+        }
+        return part;
+      });
+
+      return { ...msg, content: redactedContent };
+    }
+
+    // Handle string content (text-only messages)
+    if (typeof msg.content !== "string") {
       return msg;
     }
 
@@ -454,5 +497,11 @@ function createLogData(
  * Format messages for logging
  */
 function formatMessagesForLog(messages: ChatMessage[]): string {
-  return messages.map((m) => `[${m.role}] ${m.content}`).join("\n");
+  // Array content marks a multimodal message; only its text parts are logged
+  const lines = messages.map((message) => {
+    const tag = Array.isArray(message.content) ? " multimodal" : "";
+    const body = extractTextContent(message.content);
+    return `[${message.role}${tag}] ${body}`;
+  });
+  return lines.join("\n");
 }
diff --git a/src/secrets/detect.ts b/src/secrets/detect.ts
index 59c0390..85e8f7f 100644
--- a/src/secrets/detect.ts
+++ b/src/secrets/detect.ts
@@ -1,5 +1,6 @@
 import type { SecretsDetectionConfig } from "../config";
 import type { ChatCompletionRequest } from "../services/llm-client";
+import { extractTextContent } from "../utils/content";
 
 /**
  * All supported secret entity types
@@ -34,14 +35,14 @@ export interface SecretsDetectionResult {
  * Extracts all text content from an OpenAI chat completion request
  *
  * Concatenates content from all messages (system, user, assistant) for secrets scanning.
- * The proxy validation ensures content is always a string, so we can safely access it directly.
+ * Handles both string content (text-only) and array content (multimodal messages).
  *
  * Returns concatenated text for secrets scanning.
  */
 export function extractTextFromRequest(body: ChatCompletionRequest): string {
   return body.messages
-    .map((message) => message.content)
-    .filter((content): content is string => typeof content === "string" && content.length > 0)
+    .map((message) => extractTextContent(message.content)) // string as-is; array content reduced to its text parts
+    .filter((text) => text.length > 0) // NOTE(review): skipping empty messages shifts offsets relative to the per-message "+1" bookkeeping in proxy.ts — confirm
     .join("\n");
 }
 
diff --git a/src/services/pii-detector.ts b/src/services/pii-detector.ts
index cff9772..5cc9ad3 100644
--- a/src/services/pii-detector.ts
+++ b/src/services/pii-detector.ts
@@ -4,6 +4,7 @@ import {
   type LanguageDetectionResult,
   type SupportedLanguage,
 } from "./language-detector";
+import { extractTextContent, type MessageContent } from "../utils/content";
 
 export interface PIIEntity {
   entity_type: string;
@@ -82,7 +83,7 @@ export class PIIDetector {
   }
 
   async analyzeMessages(
-    messages: Array<{ role: string; content: string }>,
+    messages: Array<{ role: string; content: MessageContent }>,
   ): Promise<PIIDetectionResult> {
     const startTime = Date.now();
 
@@ -100,7 +101,7 @@ export class PIIDetector {
       };
     }
 
-    const text = messages[lastUserIndex].content;
+    const text = extractTextContent(messages[lastUserIndex].content);
     const langResult = getLanguageDetector().detect(text);
     const newEntities = await this.detectPII(text, langResult.language);
 
@@ -118,17 +119,18 @@ export class PIIDetector {
   }
 
   async analyzeAllMessages(
-    messages: Array<{ role: string; content: string }>,
+    messages: Array<{ role: string; content: MessageContent }>,
     langResult: LanguageDetectionResult,
   ): Promise<PIIDetectionResult> {
     const startTime = Date.now();
 
     const entitiesByMessage = await Promise.all(
-      messages.map((message) =>
-        message.content && (message.role === "user" || message.role === "assistant")
-          ? this.detectPII(message.content, langResult.language)
-          : Promise.resolve([]),
-      ),
+      messages.map((message) => {
+        const text = extractTextContent(message.content);
+        return text && (message.role === "user" || message.role === "assistant")
+          ? this.detectPII(text, langResult.language)
+          : Promise.resolve([]);
+      }),
     );
 
     return {
diff --git a/src/utils/content.ts b/src/utils/content.ts
new file mode 100644
index 0000000..5e52983
--- /dev/null
+++ b/src/utils/content.ts
@@ -0,0 +1,79 @@
+/**
+ * Utility functions for handling OpenAI message content
+ *
+ * OpenAI's Chat Completions API supports two content formats:
+ * 1. String content (text-only messages)
+ * 2. Array content (multimodal messages with text and images)
+ */
+
+/**
+ * One element of an array-form (multimodal) message content, e.g. a text or image part
+ */
+export interface ContentPart {
+  type: string; // part discriminator, e.g. "text" or "image_url"
+  text?: string; // text payload; read only when type is "text"
+  image_url?: { // image reference, as in { type: "image_url", image_url: { url } }
+    url: string;
+    detail?: string; // NOTE(review): presumably OpenAI's image detail hint — confirm
+  };
+}
+
+/**
+ * Message content: string (text-only), ContentPart[] (multimodal), or null/undefined (absent)
+ */
+export type MessageContent = string | ContentPart[] | null | undefined;
+
+/**
+ * Safely extracts text content from a message
+ *
+ * Handles both string content and array content (multimodal messages).
+ * For array content, joins all text parts with "\n"; non-object entries are skipped.
+ *
+ * @param content - The message content (string, array, null, or undefined)
+ * @returns Extracted text content, or empty string if no text found
+ *
+ * @example
+ * // Text-only message
+ * extractTextContent("Hello world") // => "Hello world"
+ *
+ * // Multimodal message
+ * extractTextContent([
+ *   { type: "text", text: "What's in this image?" },
+ *   { type: "image_url", image_url: { url: "..." } }
+ * ]) // => "What's in this image?"
+ *
+ * // Null/undefined
+ * extractTextContent(null) // => ""
+ */
+export function extractTextContent(content: MessageContent): string {
+  // Plain string content (text-only message) is returned unchanged
+  if (typeof content === "string") {
+    return content;
+  }
+
+  // null, undefined, or any other non-array value carries no text
+  if (!Array.isArray(content)) {
+    return "";
+  }
+
+  // Array content (multimodal): keep text parts only. The request schema accepts
+  // arbitrary array entries, so skip null/primitive entries before reading fields.
+  const texts: string[] = [];
+  for (const part of content) {
+    if (part === null || typeof part !== "object") continue;
+    if (part.type === "text" && typeof part.text === "string") {
+      texts.push(part.text);
+    }
+  }
+  return texts.join("\n");
+}
+
+/**
+ * Reports whether a message carries any extractable text
+ *
+ * @param content - Message content in any supported form
+ * @returns true when extractTextContent(content) yields a non-empty string
+ */
+export function hasTextContent(content: MessageContent): boolean {
+  return extractTextContent(content) !== "";
+}