refactor(extract-text): rename stripHtml → extractText and document non-sanitiser status
Adds a module docstring at the top of extractText.ts spelling out that this is text extraction, not XSS sanitisation, and that callers must rely on safeHtml() (DOMPurify) for security. Adds a Vitest test block with classic XSS-shaped payloads (<script>, <svg/onload>, <iframe srcdoc>, javascript: href) asserting that no markup is re-emitted, even though the module is explicitly not a sanitiser. Updates the two callers (/geschichten index, GeschichtenCard) to import from the new path. The collapse-whitespace pass also makes the regex fallback's output saner for excerpt rendering. Closes Nora's review B1 on PR #382. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
<script lang="ts">
|
||||
import { m } from '$lib/paraglide/messages.js';
|
||||
import type { components } from '$lib/generated/api';
|
||||
import { plainExcerpt } from '$lib/utils/stripHtml';
|
||||
import { plainExcerpt } from '$lib/utils/extractText';
|
||||
import { formatDate } from '$lib/utils/date';
|
||||
|
||||
type Geschichte = components['schemas']['Geschichte'];
|
||||
|
||||
67
frontend/src/lib/utils/extractText.spec.ts
Normal file
67
frontend/src/lib/utils/extractText.spec.ts
Normal file
@@ -0,0 +1,67 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { extractText, plainExcerpt } from './extractText';
|
||||
|
||||
describe('extractText', () => {
|
||||
it('returns empty string for null/undefined/empty', () => {
|
||||
expect(extractText(null)).toBe('');
|
||||
expect(extractText(undefined)).toBe('');
|
||||
expect(extractText('')).toBe('');
|
||||
});
|
||||
|
||||
it('strips tags and preserves visible text', () => {
|
||||
expect(extractText('<p>Hello <strong>world</strong></p>')).toBe('Hello world');
|
||||
});
|
||||
|
||||
it('collapses whitespace within and between blocks', () => {
|
||||
expect(extractText('<p>One</p><p>Two</p>')).toBe('OneTwo');
|
||||
expect(extractText('<p>foo bar</p>')).toBe('foo bar');
|
||||
});
|
||||
|
||||
// XSS-shaped inputs: extractText must NOT execute, render, or expose the
|
||||
// payload as HTML. It is only required to return *some* string. The module
|
||||
// is explicitly documented as a non-sanitiser; these tests only prevent
|
||||
// silent regressions where the function might somehow leak a tag.
|
||||
describe('XSS-shaped input — never re-emits markup, even though this is not a sanitiser', () => {
|
||||
it('drops <script> and surfaces only its text content', () => {
|
||||
const out = extractText('<p>ok</p><script>alert(1)</script>');
|
||||
expect(out).not.toContain('<script>');
|
||||
expect(out).not.toContain('</script>');
|
||||
});
|
||||
|
||||
it('drops <svg/onload> markup', () => {
|
||||
const out = extractText('<svg/onload=alert(1)>');
|
||||
expect(out).not.toContain('<svg');
|
||||
expect(out).not.toContain('onload');
|
||||
});
|
||||
|
||||
it('drops <iframe srcdoc=…> markup', () => {
|
||||
const out = extractText('<iframe srcdoc="<script>alert(1)</script>">');
|
||||
expect(out).not.toContain('<iframe');
|
||||
expect(out).not.toContain('srcdoc');
|
||||
});
|
||||
|
||||
it('drops <a href="javascript:…"> tag (text content may remain)', () => {
|
||||
const out = extractText('<a href="javascript:alert(1)">click</a>');
|
||||
expect(out).not.toContain('<a ');
|
||||
expect(out).not.toContain('javascript:');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('plainExcerpt', () => {
|
||||
it('returns full text when under the limit', () => {
|
||||
expect(plainExcerpt('<p>short</p>', 80)).toBe('short');
|
||||
});
|
||||
|
||||
it('truncates at the boundary with an ellipsis', () => {
|
||||
const html = '<p>' + 'a'.repeat(100) + '</p>';
|
||||
const out = plainExcerpt(html, 20);
|
||||
expect(out.length).toBeLessThanOrEqual(21);
|
||||
expect(out.endsWith('…')).toBe(true);
|
||||
});
|
||||
|
||||
it('breaks at a word boundary when possible', () => {
|
||||
const out = plainExcerpt('<p>The quick brown fox jumps over</p>', 18);
|
||||
expect(out).toBe('The quick brown…');
|
||||
});
|
||||
});
|
||||
38
frontend/src/lib/utils/extractText.ts
Normal file
38
frontend/src/lib/utils/extractText.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
/**
|
||||
* **Not a sanitiser.** This module extracts visible text from a (presumed
|
||||
* already-sanitised) HTML string for excerpt rendering. It is safe ONLY
|
||||
* because the Geschichte body is sanitised against the OWASP allow-list
|
||||
* on the server before persistence, and via DOMPurify on render.
|
||||
*
|
||||
* Do not use these helpers to defend against XSS — `safeHtml()` in
|
||||
* `./sanitize.ts` is the only sanitiser. Calling `extractText()` on
|
||||
* untrusted input that has not been sanitised does not protect against
|
||||
* `javascript:` URLs, event-handler attributes, or `<svg/onload>` payloads.
|
||||
*/
|
||||
|
||||
/**
 * Named character references decoded by the server-side fallback of
 * `extractText`. Deliberately small: only the references that HTML
 * serialisers commonly emit. Unknown names are left untouched.
 */
const FALLBACK_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
	['amp', '&'],
	['lt', '<'],
	['gt', '>'],
	['quot', '"'],
	['apos', "'"],
	['nbsp', '\u00a0']
]);

/**
 * Decode numeric (`&#228;`, `&#xE4;`) and a handful of named character
 * references. Runs as a single pass so that `&amp;lt;` yields the literal
 * text `&lt;` instead of being decoded twice.
 */
function decodeCharacterReferences(text: string): string {
	return text.replace(
		/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]*);/g,
		(match: string, ref: string): string => {
			if (!ref.startsWith('#')) return FALLBACK_NAMED_ENTITIES.get(ref) ?? match;
			const isHex = ref.charAt(1).toLowerCase() === 'x';
			const codePoint = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
			// NUL, surrogates and out-of-range values become U+FFFD, as in HTML parsing.
			const isInvalid =
				codePoint === 0 || codePoint > 0x10ffff || (codePoint >= 0xd800 && codePoint <= 0xdfff);
			return isInvalid ? '\ufffd' : String.fromCodePoint(codePoint);
		}
	);
}

/**
 * Strip tags and return plain text with runs of whitespace collapsed to a
 * single space and the ends trimmed.
 *
 * When a global `DOMParser` is available the markup is parsed and the body's
 * `textContent` is used. Otherwise a regex fallback drops angle-bracket
 * sequences and then decodes character references, so that both branches
 * agree on inputs such as `Tom &amp; Jerry` (previously the fallback returned
 * the entity verbatim).
 *
 * The result is plain text, **not** sanitised HTML: it may legitimately
 * contain `<` or `>` characters and must be rendered with escaping. Neither
 * branch is a sanitiser — see the module docstring.
 *
 * @param html - HTML string; `null`, `undefined` and `''` yield `''`.
 * @returns The visible text content, whitespace-normalised.
 */
export function extractText(html: string | null | undefined): string {
	if (!html) return '';
	if (typeof DOMParser === 'function') {
		const doc = new DOMParser().parseFromString(html, 'text/html');
		return (doc.body.textContent ?? '').replace(/\s+/g, ' ').trim();
	}
	// Strip tags first, then decode: an escaped `&lt;b&gt;` is text, not markup.
	const withoutTags = html.replace(/<[^>]*>/g, '');
	return decodeCharacterReferences(withoutTags).replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/**
 * Strip tags, then truncate to at most `max` characters on a word boundary,
 * appending an ellipsis (`…`) when the text was truncated. Used for
 * editorial story excerpts.
 *
 * If the cut falls inside a word, that partial word is dropped. If the cut
 * falls exactly at the end of a word (the next character is whitespace), the
 * word is kept — previously a complete trailing word was discarded as well.
 * Text without any whitespace in the first `max` characters is cut hard.
 *
 * @param html - HTML string to excerpt; nullish or empty input yields `''`.
 * @param max - Maximum number of characters before the ellipsis (default 80).
 * @returns The full text when it fits, otherwise the truncated text plus `…`.
 */
export function plainExcerpt(html: string | null | undefined, max = 80): string {
	const text = extractText(html);
	if (text.length <= max) return text;
	const head = text.slice(0, max);
	// Whitespace right after the cut means `head` already ends on a whole word.
	const endsOnWholeWord = /\s/.test(text.charAt(max));
	const body = endsOnWholeWord ? head.trimEnd() : head.replace(/\s+\S*$/, '');
	return body + '…';
}
|
||||
@@ -1,36 +0,0 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { plainExcerpt, stripHtml } from './stripHtml';
|
||||
|
||||
describe('stripHtml', () => {
|
||||
it('returns empty string for null/undefined/empty', () => {
|
||||
expect(stripHtml(null)).toBe('');
|
||||
expect(stripHtml(undefined)).toBe('');
|
||||
expect(stripHtml('')).toBe('');
|
||||
});
|
||||
|
||||
it('strips tags and preserves visible text', () => {
|
||||
expect(stripHtml('<p>Hello <strong>world</strong></p>')).toBe('Hello world');
|
||||
});
|
||||
|
||||
it('strips nested HTML', () => {
|
||||
expect(stripHtml('<div><p>A</p><p>B</p></div>')).toBe('AB');
|
||||
});
|
||||
});
|
||||
|
||||
describe('plainExcerpt', () => {
|
||||
it('returns full text when under the limit', () => {
|
||||
expect(plainExcerpt('<p>short</p>', 80)).toBe('short');
|
||||
});
|
||||
|
||||
it('truncates at the boundary with an ellipsis', () => {
|
||||
const html = '<p>' + 'a'.repeat(100) + '</p>';
|
||||
const out = plainExcerpt(html, 20);
|
||||
expect(out.length).toBeLessThanOrEqual(21); // 20 chars + ellipsis
|
||||
expect(out.endsWith('…')).toBe(true);
|
||||
});
|
||||
|
||||
it('breaks at a word boundary when possible', () => {
|
||||
const out = plainExcerpt('<p>The quick brown fox jumps over</p>', 18);
|
||||
expect(out).toBe('The quick brown…');
|
||||
});
|
||||
});
|
||||
@@ -1,23 +0,0 @@
|
||||
/**
 * Remove HTML markup from a string and return the trimmed plain text.
 *
 * When a global `DOMParser` exists, the input is parsed as `text/html` and
 * the body's text content is returned. Otherwise anything that looks like
 * `<...>` is removed with a regular expression.
 *
 * @param html - HTML string; `null`, `undefined` and `''` yield `''`.
 * @returns The text content with leading and trailing whitespace removed.
 */
export function stripHtml(html: string | null | undefined): string {
	if (!html) {
		return '';
	}

	const hasDomParser = typeof DOMParser === 'function';
	if (!hasDomParser) {
		const withoutTags = html.replace(/<[^>]*>/g, '');
		return withoutTags.trim();
	}

	const parsed = new DOMParser().parseFromString(html, 'text/html');
	const visibleText = parsed.body.textContent ?? '';
	return visibleText.trim();
}
|
||||
|
||||
/**
 * Produce a plain-text excerpt of an HTML string for editorial story cards.
 *
 * The markup is stripped first. Text no longer than `max` characters is
 * returned unchanged; longer text is cut at `max`, any trailing whitespace
 * plus partial word is removed, and an ellipsis (`…`) is appended.
 *
 * @param html - HTML string to excerpt; nullish or empty input yields `''`.
 * @param max - Maximum number of characters before the ellipsis (default 80).
 */
export function plainExcerpt(html: string | null | undefined, max = 80): string {
	const plain = stripHtml(html);
	const fits = plain.length <= max;
	if (fits) {
		return plain;
	}

	const clipped = plain.slice(0, max);
	const atWordBoundary = clipped.replace(/\s+\S*$/, '');
	return `${atWordBoundary}…`;
}
|
||||
@@ -1,7 +1,7 @@
|
||||
<script lang="ts">
|
||||
import { goto } from '$app/navigation';
|
||||
import { m } from '$lib/paraglide/messages.js';
|
||||
import { plainExcerpt } from '$lib/utils/stripHtml';
|
||||
import { plainExcerpt } from '$lib/utils/extractText';
|
||||
import { formatDate } from '$lib/utils/date';
|
||||
import PersonTypeahead from '$lib/components/PersonTypeahead.svelte';
|
||||
import type { PageData } from './$types';
|
||||
|
||||
Reference in New Issue
Block a user