refactor(extract-text): rename stripHtml → extractText and document non-sanitiser status

Adds a module docstring at the top of extractText.ts spelling out that this
is text extraction, not XSS sanitisation, and that callers must rely on
safeHtml() (DOMPurify) for security. Adds a Vitest test block with classic
XSS-shaped payloads (<script>, <svg/onload>, <iframe srcdoc>, javascript:
href) asserting that no markup is re-emitted, even though the module is
explicitly not a sanitiser.

Updates the two callers (/geschichten index, GeschichtenCard) to import
from the new path. The collapse-whitespace pass also makes the regex
fallback's output saner for excerpt rendering.

Closes Nora's review B1 on PR #382.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-02 18:44:40 +02:00
parent 18e5d18cc7
commit ad535e314b
6 changed files with 107 additions and 61 deletions

View File

@@ -1,7 +1,7 @@
<script lang="ts">
import { m } from '$lib/paraglide/messages.js';
import type { components } from '$lib/generated/api';
import { plainExcerpt } from '$lib/utils/stripHtml';
import { plainExcerpt } from '$lib/utils/extractText';
import { formatDate } from '$lib/utils/date';
type Geschichte = components['schemas']['Geschichte'];

View File

@@ -0,0 +1,67 @@
import { describe, expect, it } from 'vitest';
import { extractText, plainExcerpt } from './extractText';
describe('extractText', () => {
it('returns empty string for null/undefined/empty', () => {
expect(extractText(null)).toBe('');
expect(extractText(undefined)).toBe('');
expect(extractText('')).toBe('');
});
it('strips tags and preserves visible text', () => {
expect(extractText('<p>Hello <strong>world</strong></p>')).toBe('Hello world');
});
it('collapses whitespace within and between blocks', () => {
expect(extractText('<p>One</p><p>Two</p>')).toBe('OneTwo');
expect(extractText('<p>foo bar</p>')).toBe('foo bar');
});
// XSS-shaped inputs: extractText must NOT execute, render, or expose the
// payload as HTML. It is only required to return *some* string. The fact
// that it exists is documented as a non-sanitiser; these tests prevent
// silent regressions where the function might somehow leak a tag.
describe('XSS-shaped input — never re-emits markup, even though this is not a sanitiser', () => {
it('drops <script> and surfaces only its text content', () => {
const out = extractText('<p>ok</p><script>alert(1)</script>');
expect(out).not.toContain('<script>');
expect(out).not.toContain('</script>');
});
it('drops <svg/onload> markup', () => {
const out = extractText('<svg/onload=alert(1)>');
expect(out).not.toContain('<svg');
expect(out).not.toContain('onload');
});
it('drops <iframe srcdoc=…> markup', () => {
const out = extractText('<iframe srcdoc="<script>alert(1)</script>">');
expect(out).not.toContain('<iframe');
expect(out).not.toContain('srcdoc');
});
it('drops <a href="javascript:…"> tag (text content may remain)', () => {
const out = extractText('<a href="javascript:alert(1)">click</a>');
expect(out).not.toContain('<a ');
expect(out).not.toContain('javascript:');
});
});
});
describe('plainExcerpt', () => {
it('returns full text when under the limit', () => {
expect(plainExcerpt('<p>short</p>', 80)).toBe('short');
});
it('truncates at the boundary with an ellipsis', () => {
const html = '<p>' + 'a'.repeat(100) + '</p>';
const out = plainExcerpt(html, 20);
expect(out.length).toBeLessThanOrEqual(21);
expect(out.endsWith('…')).toBe(true);
});
it('breaks at a word boundary when possible', () => {
const out = plainExcerpt('<p>The quick brown fox jumps over</p>', 18);
expect(out).toBe('The quick brown…');
});
});

View File

@@ -0,0 +1,38 @@
/**
* **Not a sanitizer.** This module extracts visible text from a (presumed
* already-sanitised) HTML string for excerpt rendering. It is safe ONLY
* because the Geschichte body is sanitised against the OWASP allow-list
* on the server before persistence, and via DOMPurify on render.
*
* Do not use these helpers to defend against XSS — `safeHtml()` in
* `./sanitize.ts` is the only sanitiser. Calling `extractText()` on
* untrusted input that has not been sanitised does not protect against
* `javascript:` URLs, event-handler attributes, or `<svg/onload>` payloads.
*/
/**
* Strip tags and return plain text. Uses DOMParser in the browser; on the
* server it falls back to a regex that drops angle-bracket sequences.
* The fallback is **not** a sanitiser — see module docstring.
*/
export function extractText(html: string | null | undefined): string {
if (!html) return '';
if (typeof DOMParser === 'function') {
const doc = new DOMParser().parseFromString(html, 'text/html');
return (doc.body.textContent ?? '').replace(/\s+/g, ' ').trim();
}
return html
.replace(/<[^>]*>/g, '')
.replace(/\s+/g, ' ')
.trim();
}
/**
* Strip tags then truncate to `max` chars on a word boundary, appending an
* ellipsis when truncated. Used for editorial story excerpts.
*/
export function plainExcerpt(html: string | null | undefined, max = 80): string {
const text = extractText(html);
if (text.length <= max) return text;
return text.slice(0, max).replace(/\s+\S*$/, '') + '…';
}

View File

@@ -1,36 +0,0 @@
import { describe, expect, it } from 'vitest';
import { plainExcerpt, stripHtml } from './stripHtml';
describe('stripHtml', () => {
it('returns empty string for null/undefined/empty', () => {
expect(stripHtml(null)).toBe('');
expect(stripHtml(undefined)).toBe('');
expect(stripHtml('')).toBe('');
});
it('strips tags and preserves visible text', () => {
expect(stripHtml('<p>Hello <strong>world</strong></p>')).toBe('Hello world');
});
it('strips nested HTML', () => {
expect(stripHtml('<div><p>A</p><p>B</p></div>')).toBe('AB');
});
});
describe('plainExcerpt', () => {
it('returns full text when under the limit', () => {
expect(plainExcerpt('<p>short</p>', 80)).toBe('short');
});
it('truncates at the boundary with an ellipsis', () => {
const html = '<p>' + 'a'.repeat(100) + '</p>';
const out = plainExcerpt(html, 20);
expect(out.length).toBeLessThanOrEqual(21); // 20 chars + ellipsis
expect(out.endsWith('…')).toBe(true);
});
it('breaks at a word boundary when possible', () => {
const out = plainExcerpt('<p>The quick brown fox jumps over</p>', 18);
expect(out).toBe('The quick brown…');
});
});

View File

@@ -1,23 +0,0 @@
/**
* Strip HTML tags from a string and return the plain text.
* Uses DOMParser in the browser, falls back to a regex strip on the server
* (where DOMParser is not available without isomorphic-dompurify's JSDOM).
*/
export function stripHtml(html: string | null | undefined): string {
if (!html) return '';
if (typeof DOMParser === 'function') {
const doc = new DOMParser().parseFromString(html, 'text/html');
return (doc.body.textContent ?? '').trim();
}
return html.replace(/<[^>]*>/g, '').trim();
}
/**
* Strip HTML and truncate to a maximum length, appending an ellipsis when
* the source exceeds it. Used for editorial story excerpts.
*/
export function plainExcerpt(html: string | null | undefined, max = 80): string {
const text = stripHtml(html);
if (text.length <= max) return text;
return text.slice(0, max).replace(/\s+\S*$/, '') + '…';
}

View File

@@ -1,7 +1,7 @@
<script lang="ts">
import { goto } from '$app/navigation';
import { m } from '$lib/paraglide/messages.js';
import { plainExcerpt } from '$lib/utils/stripHtml';
import { plainExcerpt } from '$lib/utils/extractText';
import { formatDate } from '$lib/utils/date';
import PersonTypeahead from '$lib/components/PersonTypeahead.svelte';
import type { PageData } from './$types';