familienarchiv/frontend/src/lib/shared/utils/extractText.ts

/**
 * **Not a sanitizer.** This module extracts visible text from a (presumed
 * already-sanitised) HTML string for excerpt rendering. It is safe ONLY
 * because the Geschichte body is sanitised against the OWASP allow-list
 * on the server before persistence, and via DOMPurify on render.
 *
 * Do not use these helpers to defend against XSS — `safeHtml()` in
 * `./sanitize.ts` is the only sanitiser. Calling `extractText()` on
 * untrusted input that has not been sanitised does not protect against
 * `javascript:` URLs, event-handler attributes, or `<svg/onload>` payloads.
 */

/**
 * Named character references decoded by the non-DOM fallback of
 * `extractText()`. Deliberately small: the XML/HTML basics plus the German
 * and typographic entities most likely to appear in editorial text.
 * Unknown names are left untouched.
 */
const FALLBACK_ENTITIES: ReadonlyMap<string, string> = new Map([
	['amp', '&'],
	['lt', '<'],
	['gt', '>'],
	['quot', '"'],
	['apos', "'"],
	['nbsp', '\u00A0'],
	['auml', '\u00E4'],
	['ouml', '\u00F6'],
	['uuml', '\u00FC'],
	['Auml', '\u00C4'],
	['Ouml', '\u00D6'],
	['Uuml', '\u00DC'],
	['szlig', '\u00DF'],
	['euro', '\u20AC'],
	['hellip', '\u2026'],
	['ndash', '\u2013'],
	['mdash', '\u2014'],
	['laquo', '\u00AB'],
	['raquo', '\u00BB'],
	['lsquo', '\u2018'],
	['rsquo', '\u2019'],
	['sbquo', '\u201A'],
	['ldquo', '\u201C'],
	['rdquo', '\u201D'],
	['bdquo', '\u201E']
]);

/**
 * Replacement callback for a single `&…;` character reference.
 *
 * @param match The full reference, e.g. `&amp;` or `&#x26;`.
 * @param body  The part between `&` and `;`.
 * @returns The decoded character, or `match` unchanged for unknown names.
 */
function decodeFallbackEntity(match: string, body: string): string {
	if (!body.startsWith('#')) {
		// Map lookup (not a plain object) so names like "constructor" cannot
		// resolve to inherited Object.prototype members.
		return FALLBACK_ENTITIES.get(body) ?? match;
	}
	const isHex = body[1] === 'x' || body[1] === 'X';
	const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
	// NUL, surrogates and out-of-range values are not valid characters;
	// emit U+FFFD instead of letting String.fromCodePoint throw.
	const invalid =
		codePoint === 0 || codePoint > 0x10ffff || (codePoint >= 0xd800 && codePoint <= 0xdfff);
	return invalid ? '\uFFFD' : String.fromCodePoint(codePoint);
}

/**
 * Strip tags and return whitespace-normalised plain text.
 *
 * Uses DOMParser when it is available (the browser). Where it is not (for
 * example on the server) it falls back to a regex that drops angle-bracket
 * sequences and then decodes numeric and common named character references,
 * so both paths agree on inputs such as `Tom &amp; Jerry`.
 *
 * The result is plain text and may contain `<`, `>` or `&` (decoded from
 * entities); it must be rendered as text, never injected as HTML.
 * This function is **not** a sanitiser; it must only be fed HTML that has
 * already been sanitised.
 *
 * @param html HTML string; `null`, `undefined` and `''` yield `''`.
 * @returns The text content with runs of whitespace collapsed to a single
 *          space and leading/trailing whitespace removed.
 */
export function extractText(html: string | null | undefined): string {
	if (!html) return '';
	if (typeof DOMParser === 'function') {
		const doc = new DOMParser().parseFromString(html, 'text/html');
		return (doc.body.textContent ?? '').replace(/\s+/g, ' ').trim();
	}
	return (
		html
			// Tags go first so that an escaped "&lt;b&gt;" survives as literal text.
			.replace(/<[^>]*>/g, '')
			// Single pass: "&amp;lt;" becomes "&lt;", not "<".
			.replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]*);/g, decodeFallbackEntity)
			// After decoding, so that &nbsp; (U+00A0) collapses like other whitespace.
			.replace(/\s+/g, ' ')
			.trim()
	);
}

/**
 * Strip tags, then truncate to at most `max` UTF-16 code units on a word
 * boundary, appending an ellipsis (`…`) when truncated. Used for editorial
 * story excerpts.
 *
 * - Text that already fits within `max` is returned unchanged, without an
 *   ellipsis.
 * - If the cut would fall inside a word, the partial word is dropped; if the
 *   cut already falls between two words, the last complete word is kept.
 * - If the truncated part contains no whitespace at all (one very long
 *   word), it is hard-cut at `max`.
 * - The ellipsis is appended after truncation, so the result may be up to
 *   `max + 1` code units long.
 *
 * @param html HTML string passed through `extractText()`.
 * @param max  Maximum length of the text part; negative values are treated
 *             as 0. Defaults to 80.
 * @returns The (possibly truncated) plain-text excerpt.
 */
export function plainExcerpt(html: string | null | undefined, max = 80): string {
	const text = extractText(html);
	// A negative limit would make slice() count from the end of the string.
	const limit = Math.max(0, max);
	if (text.length <= limit) return text;

	let head = text.slice(0, limit);
	// Never leave half of a surrogate pair (e.g. a split emoji) at the cut.
	if (/[\uD800-\uDBFF]$/.test(head)) head = head.slice(0, -1);

	// Only back off to the previous whitespace when the cut landed inside a
	// word; a cut that is already between words keeps the last full word.
	const cutsMidWord = /\S/.test(text.charAt(head.length));
	if (cutsMidWord) head = head.replace(/\s+\S*$/, '');

	return head.trimEnd() + '…';
}