refactor(extract-text): rename stripHtml → extractText and document non-sanitiser status

Adds a module docstring at the top of extractText.ts spelling out that this is text extraction, not XSS sanitisation, and that callers must rely on safeHtml() (DOMPurify) for security. Adds a Vitest test block with classic XSS-shaped payloads (<script>, <svg/onload>, <iframe srcdoc>, javascript: href) asserting that no markup is re-emitted, even though the module is explicitly not a sanitiser. Updates the two callers (/geschichten index, GeschichtenCard) to import from the new path. Also adds a whitespace-collapsing pass to extractText (on both the DOMParser path and the regex fallback), which makes the output saner for excerpt rendering. Closes Nora's review B1 on PR #382. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-02 18:44:40 +02:00
parent 18e5d18cc7
commit ad535e314b
6 changed files with 107 additions and 61 deletions
--- a/frontend/src/lib/components/GeschichtenCard.svelte
+++ b/frontend/src/lib/components/GeschichtenCard.svelte
@@ -1,7 +1,7 @@
 <script lang="ts">
 import { m } from '$lib/paraglide/messages.js';
 import type { components } from '$lib/generated/api';
-import { plainExcerpt } from '$lib/utils/stripHtml';
+import { plainExcerpt } from '$lib/utils/extractText';
 import { formatDate } from '$lib/utils/date';

 type Geschichte = components['schemas']['Geschichte'];
--- a/frontend/src/lib/utils/extractText.spec.ts
+++ b/frontend/src/lib/utils/extractText.spec.ts
@@ -0,0 +1,67 @@
+import { describe, expect, it } from 'vitest';
+import { extractText, plainExcerpt } from './extractText';
+
+describe('extractText', () => {
+	it('returns empty string for null/undefined/empty', () => {
+		expect(extractText(null)).toBe('');
+		expect(extractText(undefined)).toBe('');
+		expect(extractText('')).toBe('');
+	});
+
+	it('strips tags and preserves visible text', () => {
+		expect(extractText('<p>Hello <strong>world</strong></p>')).toBe('Hello world');
+	});
+
+	it('collapses whitespace within and between blocks', () => {
+		expect(extractText('<p>One</p><p>Two</p>')).toBe('OneTwo');
+		expect(extractText('<p>foo   bar</p>')).toBe('foo bar');
+	});
+
+	// XSS-shaped inputs: extractText must NOT execute, render, or expose the
+	// payload as HTML. It is only required to return *some* string. The module
+	// is documented as a non-sanitiser; these tests only guard against silent
+	// regressions where the function might leak a tag into its output.
+	describe('XSS-shaped input — never re-emits markup, even though this is not a sanitiser', () => {
+		it('drops <script> and surfaces only its text content', () => {
+			const out = extractText('<p>ok</p><script>alert(1)</script>');
+			expect(out).not.toContain('<script>');
+			expect(out).not.toContain('</script>');
+		});
+
+		it('drops <svg/onload> markup', () => {
+			const out = extractText('<svg/onload=alert(1)>');
+			expect(out).not.toContain('<svg');
+			expect(out).not.toContain('onload');
+		});
+
+		it('drops <iframe srcdoc=…> markup', () => {
+			const out = extractText('<iframe srcdoc="<script>alert(1)</script>">');
+			expect(out).not.toContain('<iframe');
+			expect(out).not.toContain('srcdoc');
+		});
+
+		it('drops <a href="javascript:…"> tag (text content may remain)', () => {
+			const out = extractText('<a href="javascript:alert(1)">click</a>');
+			expect(out).not.toContain('<a ');
+			expect(out).not.toContain('javascript:');
+		});
+	});
+});
+
+describe('plainExcerpt', () => {
+	it('returns full text when under the limit', () => {
+		expect(plainExcerpt('<p>short</p>', 80)).toBe('short');
+	});
+
+	it('truncates at the boundary with an ellipsis', () => {
+		const html = '<p>' + 'a'.repeat(100) + '</p>';
+		const out = plainExcerpt(html, 20);
+		expect(out.length).toBeLessThanOrEqual(21);
+		expect(out.endsWith('…')).toBe(true);
+	});
+
+	it('breaks at a word boundary when possible', () => {
+		const out = plainExcerpt('<p>The quick brown fox jumps over</p>', 18);
+		expect(out).toBe('The quick brown…');
+	});
+});
--- a/frontend/src/lib/utils/extractText.ts
+++ b/frontend/src/lib/utils/extractText.ts
@@ -0,0 +1,38 @@
+/**
+ * **Not a sanitiser.** This module extracts visible text from a (presumed
+ * already-sanitised) HTML string for excerpt rendering. It is safe ONLY
+ * because the Geschichte body is sanitised against the OWASP allow-list
+ * on the server before persistence, and via DOMPurify on render.
+ *
+ * Do not use these helpers to defend against XSS — `safeHtml()` in
+ * `./sanitize.ts` is the only sanitiser. Calling `extractText()` on
+ * untrusted input that has not been sanitised does not protect against
+ * `javascript:` URLs, event-handler attributes, or `<svg/onload>` payloads.
+ */
+
+/**
+ * Strip tags and return trimmed plain text with whitespace runs collapsed to
+ * single spaces. Uses DOMParser when available; otherwise falls back to a regex
+ * that drops angle-bracket sequences. **Not** a sanitiser — see module docstring.
+ */
+export function extractText(html: string | null | undefined): string {
+	if (!html) return '';
+	if (typeof DOMParser === 'function') {
+		const doc = new DOMParser().parseFromString(html, 'text/html');
+		return (doc.body.textContent ?? '').replace(/\s+/g, ' ').trim();
+	}
+	return html
+		.replace(/<[^>]*>/g, '')
+		.replace(/\s+/g, ' ')
+		.trim();
+}
+
+/**
+ * Strip tags then truncate to `max` chars on a word boundary, appending an
+ * ellipsis when truncated. Used for editorial story excerpts.
+ */
+export function plainExcerpt(html: string | null | undefined, max = 80): string {
+	const text = extractText(html);
+	if (text.length <= max) return text;
+	return text.slice(0, max).replace(/\s+\S*$/, '') + '…';
+}
--- a/frontend/src/lib/utils/stripHtml.spec.ts
+++ b/frontend/src/lib/utils/stripHtml.spec.ts
@@ -1,36 +0,0 @@
-import { describe, expect, it } from 'vitest';
-import { plainExcerpt, stripHtml } from './stripHtml';
-
-describe('stripHtml', () => {
-	it('returns empty string for null/undefined/empty', () => {
-		expect(stripHtml(null)).toBe('');
-		expect(stripHtml(undefined)).toBe('');
-		expect(stripHtml('')).toBe('');
-	});
-
-	it('strips tags and preserves visible text', () => {
-		expect(stripHtml('<p>Hello <strong>world</strong></p>')).toBe('Hello world');
-	});
-
-	it('strips nested HTML', () => {
-		expect(stripHtml('<div><p>A</p><p>B</p></div>')).toBe('AB');
-	});
-});
-
-describe('plainExcerpt', () => {
-	it('returns full text when under the limit', () => {
-		expect(plainExcerpt('<p>short</p>', 80)).toBe('short');
-	});
-
-	it('truncates at the boundary with an ellipsis', () => {
-		const html = '<p>' + 'a'.repeat(100) + '</p>';
-		const out = plainExcerpt(html, 20);
-		expect(out.length).toBeLessThanOrEqual(21); // 20 chars + ellipsis
-		expect(out.endsWith('…')).toBe(true);
-	});
-
-	it('breaks at a word boundary when possible', () => {
-		const out = plainExcerpt('<p>The quick brown fox jumps over</p>', 18);
-		expect(out).toBe('The quick brown…');
-	});
-});
--- a/frontend/src/lib/utils/stripHtml.ts
+++ b/frontend/src/lib/utils/stripHtml.ts
@@ -1,23 +0,0 @@
-/**
- * Strip HTML tags from a string and return the plain text.
- * Uses DOMParser in the browser, falls back to a regex strip on the server
- * (where DOMParser is not available without isomorphic-dompurify's JSDOM).
- */
-export function stripHtml(html: string | null | undefined): string {
-	if (!html) return '';
-	if (typeof DOMParser === 'function') {
-		const doc = new DOMParser().parseFromString(html, 'text/html');
-		return (doc.body.textContent ?? '').trim();
-	}
-	return html.replace(/<[^>]*>/g, '').trim();
-}
-
-/**
- * Strip HTML and truncate to a maximum length, appending an ellipsis when
- * the source exceeds it. Used for editorial story excerpts.
- */
-export function plainExcerpt(html: string | null | undefined, max = 80): string {
-	const text = stripHtml(html);
-	if (text.length <= max) return text;
-	return text.slice(0, max).replace(/\s+\S*$/, '') + '…';
-}
--- a/frontend/src/routes/geschichten/+page.svelte
+++ b/frontend/src/routes/geschichten/+page.svelte
@@ -1,7 +1,7 @@
 <script lang="ts">
 import { goto } from '$app/navigation';
 import { m } from '$lib/paraglide/messages.js';
-import { plainExcerpt } from '$lib/utils/stripHtml';
+import { plainExcerpt } from '$lib/utils/extractText';
 import { formatDate } from '$lib/utils/date';
 import PersonTypeahead from '$lib/components/PersonTypeahead.svelte';
 import type { PageData } from './$types';