From: mkroemer Date: Fri, 9 Jan 2026 19:58:56 +0000 (+0100) Subject: fix: Add support for multimodal content (text + images) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=d0f18d93f8bd55fed7a2f73cb16d5b4eff28476c;p=sgasser-llm-shield.git fix: Add support for multimodal content (text + images) Fixes crashes when processing OpenAI Chat Completion requests with multimodal content (array format). Previously, the code assumed message content is always a string, causing Presidio errors and 502/503 responses when LibreChat Agents sent vision requests. Changes: - Add extractTextContent() utility to safely extract text from both string and array content formats - Update PII detection to handle multimodal messages - Update secrets detection to extract text from array content - Update message redaction to preserve images while redacting text - Fix dashboard display to show readable content instead of [object Object] This enables full support for OpenAI's multimodal API format while maintaining PII/secrets protection on text portions. Resolves issues with LibreChat Agents feature returning 502 errors. --- diff --git a/src/routes/proxy.ts b/src/routes/proxy.ts index cd948a9..d683405 100644 --- a/src/routes/proxy.ts +++ b/src/routes/proxy.ts @@ -18,6 +18,7 @@ import type { ChatMessage, LLMResult, } from "../services/llm-client"; +import { extractTextContent, type ContentPart } from "../utils/content"; import { logRequest, type RequestLogData } from "../services/logger"; import { unmaskResponse } from "../services/masking"; import { createUnmaskingStream } from "../services/stream-transformer"; @@ -28,9 +29,9 @@ const ChatCompletionSchema = z messages: z .array( z.object({ - role: z.enum(["system", "user", "assistant"]), - content: z.string(), - }), + role: z.enum(["system", "user", "assistant", "tool"]), + content: z.union([z.string(), z.array(z.any()), z.null()]).optional(), + }).passthrough(), // Allow additional fields like name, tool_calls, etc. ) .min(1, "At least one message is required"), }) @@ -185,7 +186,8 @@ function redactMessagesWithSecrets( const messagePositions: { start: number; end: number }[] = []; for (const msg of messages) { - const length = typeof msg.content === "string" ? msg.content.length : 0; + const text = extractTextContent(msg.content); + const length = text.length; messagePositions.push({ start: currentOffset, end: currentOffset + length }); currentOffset += length + 1; // +1 for \n separator } @@ -199,7 +201,48 @@ function redactMessagesWithSecrets( // Apply redactions to each message const redactedMessages = messages.map((msg, i) => { - if (typeof msg.content !== "string" || !msg.content) { + // Handle null/undefined content + if (!msg.content) { + return msg; + } + + // Handle array content (multimodal messages) + if (Array.isArray(msg.content)) { + const msgPos = messagePositions[i]; + + // Filter redactions for this message + const messageRedactions = (secretsResult.redactions || []) + .filter((r) => r.start >= msgPos.start && r.end <= msgPos.end) + .map((r) => ({ + ...r, + start: r.start - msgPos.start, + end: r.end - msgPos.start, + })); + + if (messageRedactions.length === 0) { + return msg; + } + + // Redact only text parts of array content + const redactedContent = msg.content.map((part: ContentPart) => { + if (part.type === "text" && typeof part.text === "string") { + const { redacted, context: updatedContext } = redactSecrets( + part.text, + messageRedactions, + config, + context, + ); + context = updatedContext; + return { ...part, text: redacted }; + } + return part; + }); + + return { ...msg, content: redactedContent }; + } + + // Handle string content (text-only messages) + if (typeof msg.content !== "string") { return msg; } @@ -454,5 +497,11 @@ function createLogData( * Format messages for logging */ function formatMessagesForLog(messages: ChatMessage[]): string { - return messages.map((m) => `[${m.role}] ${m.content}`).join("\n"); + return messages + .map((m) => { + const text = extractTextContent(m.content); + const isMultimodal = Array.isArray(m.content); + return `[${m.role}${isMultimodal ? " multimodal" : ""}] ${text}`; + }) + .join("\n"); } diff --git a/src/secrets/detect.ts b/src/secrets/detect.ts index 59c0390..85e8f7f 100644 --- a/src/secrets/detect.ts +++ b/src/secrets/detect.ts @@ -1,5 +1,6 @@ import type { SecretsDetectionConfig } from "../config"; import type { ChatCompletionRequest } from "../services/llm-client"; +import { extractTextContent } from "../utils/content"; /** * All supported secret entity types @@ -34,14 +35,14 @@ export interface SecretsDetectionResult { * Extracts all text content from an OpenAI chat completion request * * Concatenates content from all messages (system, user, assistant) for secrets scanning. - * The proxy validation ensures content is always a string, so we can safely access it directly. + * Handles both string content (text-only) and array content (multimodal messages). * * Returns concatenated text for secrets scanning. */ export function extractTextFromRequest(body: ChatCompletionRequest): string { return body.messages - .map((message) => message.content) - .filter((content): content is string => typeof content === "string" && content.length > 0) + .map((message) => extractTextContent(message.content)) + .filter((text) => text.length > 0) .join("\n"); } diff --git a/src/services/pii-detector.ts b/src/services/pii-detector.ts index cff9772..5cc9ad3 100644 --- a/src/services/pii-detector.ts +++ b/src/services/pii-detector.ts @@ -4,6 +4,7 @@ import { type LanguageDetectionResult, type SupportedLanguage, } from "./language-detector"; +import { extractTextContent, type MessageContent } from "../utils/content"; export interface PIIEntity { entity_type: string; @@ -82,7 +83,7 @@ export class PIIDetector { } async analyzeMessages( - messages: Array<{ role: string; content: string }>, + messages: Array<{ role: string; content: MessageContent }>, ): Promise { const startTime = Date.now(); @@ -100,7 +101,7 @@ export class PIIDetector { }; } - const text = messages[lastUserIndex].content; + const text = extractTextContent(messages[lastUserIndex].content); const langResult = getLanguageDetector().detect(text); const newEntities = await this.detectPII(text, langResult.language); @@ -118,17 +119,18 @@ export class PIIDetector { } async analyzeAllMessages( - messages: Array<{ role: string; content: string }>, + messages: Array<{ role: string; content: MessageContent }>, langResult: LanguageDetectionResult, ): Promise { const startTime = Date.now(); const entitiesByMessage = await Promise.all( - messages.map((message) => - message.content && (message.role === "user" || message.role === "assistant") - ? this.detectPII(message.content, langResult.language) - : Promise.resolve([]), - ), + messages.map((message) => { + const text = extractTextContent(message.content); + return text && (message.role === "user" || message.role === "assistant") + ? this.detectPII(text, langResult.language) + : Promise.resolve([]); + }), ); return { diff --git a/src/utils/content.ts b/src/utils/content.ts new file mode 100644 index 0000000..5e52983 --- /dev/null +++ b/src/utils/content.ts @@ -0,0 +1,79 @@ +/** + * Utility functions for handling OpenAI message content + * + * OpenAI's Chat Completions API supports two content formats: + * 1. String content (text-only messages) + * 2. Array content (multimodal messages with text and images) + */ + +/** + * Content part for multimodal messages + */ +export interface ContentPart { + type: string; + text?: string; + image_url?: { + url: string; + detail?: string; + }; +} + +/** + * Message content can be a string (text-only) or array (multimodal) + */ +export type MessageContent = string | ContentPart[] | null | undefined; + +/** + * Safely extracts text content from a message + * + * Handles both string content and array content (multimodal messages). + * For array content, extracts and concatenates all text parts. + * + * @param content - The message content (string, array, null, or undefined) + * @returns Extracted text content, or empty string if no text found + * + * @example + * // Text-only message + * extractTextContent("Hello world") // => "Hello world" + * + * // Multimodal message + * extractTextContent([ + * { type: "text", text: "What's in this image?" }, + * { type: "image_url", image_url: { url: "..." } } + * ]) // => "What's in this image?" + * + * // Null/undefined + * extractTextContent(null) // => "" + */ +export function extractTextContent(content: MessageContent): string { + // Handle null/undefined + if (!content) { + return ""; + } + + // Handle string content (simple case) + if (typeof content === "string") { + return content; + } + + // Handle array content (multimodal messages) + if (Array.isArray(content)) { + return content + .filter((part) => part.type === "text" && typeof part.text === "string") + .map((part) => part.text!) + .join("\n"); + } + + // Unexpected type - return empty string + return ""; +} + +/** + * Checks if content has any text + * + * @param content - The message content to check + * @returns true if content contains text, false otherwise + */ +export function hasTextContent(content: MessageContent): boolean { + return extractTextContent(content).length > 0; +}