refactor(ocr): extract OCR job state machine into createOcrJob hook

Pulls the trigger/poll/check-status state out of documents/[id]/+page.svelte
into a pure factory in lib/ocr/useOcrJob.svelte.ts that takes documentId,
fetchImpl, and onJobFinished callback as injected dependencies.

The page now delegates to ocrJob.triggerOcr / ocrJob.checkStatus /
ocrJob.destroy and reads ocrJob.running / .progressMessage / .errorMessage /
.skippedPages reactively.

Test discipline reset: 22 unit tests cover initial state, triggerOcr 200/
4xx-with-code/4xx-without-code/5xx/network-error paths, useExistingAnnotations
flag round-trip, checkStatus PENDING/RUNNING/DONE/no-jobId/empty-id/5xx/network
paths, polling progressMessage / skippedPages updates, DONE/FAILED → onJobFinished
callback, polling-error swallow, and destroy mid-poll cleanup.

Refs #496.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marcel
2026-05-10 10:25:26 +02:00
committed by marcel
parent dd54ba9e74
commit 878bb3843b
3 changed files with 607 additions and 85 deletions

View File

@@ -0,0 +1,444 @@
import { describe, it, expect, vi, afterEach } from 'vitest';
import { createOcrJob } from './useOcrJob.svelte';
// Put every spied/mocked implementation back after each test so mocks never leak between cases.
afterEach(function restoreMocks() {
  vi.restoreAllMocks();
});
/**
 * Builds a `vi.fn` fetch double that routes each request by URL substring.
 *
 * `handlers` maps a URL fragment to a factory that produces the response for it.
 * When several fragments occur in the requested URL, the longest (most specific)
 * one wins, so a generic key such as `/ocr` can no longer shadow
 * `/ocr/jobs/job-7` merely because it was declared first. Fragments of equal
 * length keep declaration order. A URL matching no fragment resolves to a 404.
 *
 * @param handlers URL fragment → response factory (invoked once per matching call).
 * @returns A mock usable as `fetchImpl`; calls are recorded on `.mock.calls`.
 */
function makeFetch(handlers: Record<string, () => Response | Promise<Response>>) {
  return vi.fn(async (url: RequestInfo | URL) => {
    const requested = url.toString();
    let bestFragment: string | undefined;
    let bestHandler: (() => Response | Promise<Response>) | undefined;
    for (const [fragment, respond] of Object.entries(handlers)) {
      if (!requested.includes(fragment)) continue;
      // Strictly longer only: ties are resolved in favour of the earlier declaration.
      if (bestFragment === undefined || fragment.length > bestFragment.length) {
        bestFragment = fragment;
        bestHandler = respond;
      }
    }
    return bestHandler ? bestHandler() : new Response('not found', { status: 404 });
  });
}
describe('createOcrJob — initial state', () => {
  it('starts not running with empty progress and error', () => {
    // Snapshot all four reactive getters of a freshly created controller.
    const { running, progressMessage, errorMessage, skippedPages } = createOcrJob({
      documentId: () => 'doc-1'
    });

    expect(running).toBe(false);
    expect(progressMessage).toBe('');
    expect(errorMessage).toBe('');
    expect(skippedPages).toBe(0);
  });
});
describe('createOcrJob.triggerOcr', () => {
  /** Builds a new JSON response each time it is called. */
  const json = (payload: unknown, status = 200) =>
    new Response(JSON.stringify(payload), {
      status,
      headers: { 'Content-Type': 'application/json' }
    });

  /** Triggers OCR against `fetchImpl` and asserts the controller ended in the failed state. */
  const expectTriggerToFail = async (fetchImpl: typeof fetch) => {
    const job = createOcrJob({ documentId: () => 'doc-1', fetchImpl });
    await job.triggerOcr('KURRENT', false);
    expect(job.running).toBe(false);
    // errorMessage is localized — at minimum non-empty
    expect(job.errorMessage).toBeTruthy();
    job.destroy();
  };

  it('sets running=true and starts polling on 200 with jobId', async () => {
    const fetchImpl = makeFetch({
      '/ocr': () => json({ jobId: 'job-7' }),
      '/ocr/jobs/job-7': () => json({ status: 'RUNNING', progressMessage: 'WORKING' })
    });
    const job = createOcrJob({ documentId: () => 'doc-1', fetchImpl });

    await job.triggerOcr('KURRENT', false);

    expect(job.running).toBe(true);
    expect(job.errorMessage).toBe('');
    expect(fetchImpl).toHaveBeenCalledWith(
      '/api/documents/doc-1/ocr',
      expect.objectContaining({ method: 'POST' })
    );
    job.destroy();
  });

  it('sets errorMessage with generic message on 500', async () => {
    await expectTriggerToFail(makeFetch({ '/ocr': () => new Response('boom', { status: 500 }) }));
  });

  it('extracts backend error code from 4xx body', async () => {
    await expectTriggerToFail(makeFetch({ '/ocr': () => json({ code: 'OCR_DISABLED' }, 400) }));
  });

  it('handles non-JSON 4xx body gracefully', async () => {
    await expectTriggerToFail(
      makeFetch({ '/ocr': () => new Response('not json', { status: 400 }) })
    );
  });

  it('handles fetch network error', async () => {
    await expectTriggerToFail(
      vi.fn(async () => {
        throw new Error('network down');
      })
    );
  });

  it('passes useExistingAnnotations=true in the request body', async () => {
    const fetchImpl = makeFetch({
      '/ocr': () => json({ jobId: 'job-1' }),
      '/jobs/job-1': () => json({ status: 'RUNNING', progressMessage: '' })
    });
    const job = createOcrJob({ documentId: () => 'doc-1', fetchImpl });

    await job.triggerOcr('LATIN', true);

    // The trigger request is the one hitting ".../ocr" that is not a job-status poll.
    const triggerCall = fetchImpl.mock.calls.find(([target]) => {
      const href = target.toString();
      return href.includes('/ocr') && !href.includes('jobs');
    });
    expect(triggerCall).toBeDefined();
    const [, init] = triggerCall as unknown as [string, RequestInit];
    expect(JSON.parse(init.body as string)).toEqual({
      scriptType: 'LATIN',
      useExistingAnnotations: true
    });
    job.destroy();
  });
});
describe('createOcrJob.checkStatus', () => {
  /** Builds a new 200 JSON response each time it is called. */
  const json = (payload: unknown) =>
    new Response(JSON.stringify(payload), {
      status: 200,
      headers: { 'Content-Type': 'application/json' }
    });

  /** Runs checkStatus() for doc-1 against `fetchImpl` and reports the resulting `running` flag. */
  const runningAfterCheck = async (fetchImpl: typeof fetch): Promise<boolean> => {
    const job = createOcrJob({ documentId: () => 'doc-1', fetchImpl });
    await job.checkStatus();
    const { running } = job;
    job.destroy();
    return running;
  };

  it('starts polling when status is RUNNING with a jobId', async () => {
    const fetchImpl = makeFetch({
      'ocr-status': () => json({ status: 'RUNNING', jobId: 'job-9' }),
      '/ocr/jobs/job-9': () => json({ status: 'RUNNING', progressMessage: '' })
    });
    expect(await runningAfterCheck(fetchImpl)).toBe(true);
  });

  it('starts polling when status is PENDING with a jobId', async () => {
    const fetchImpl = makeFetch({
      'ocr-status': () => json({ status: 'PENDING', jobId: 'job-9' })
    });
    expect(await runningAfterCheck(fetchImpl)).toBe(true);
  });

  it('does not start polling when status is DONE', async () => {
    const fetchImpl = makeFetch({
      'ocr-status': () => json({ status: 'DONE', jobId: null })
    });
    expect(await runningAfterCheck(fetchImpl)).toBe(false);
  });

  it('does not start polling when no jobId present', async () => {
    const fetchImpl = makeFetch({
      'ocr-status': () => json({ status: 'RUNNING', jobId: null })
    });
    expect(await runningAfterCheck(fetchImpl)).toBe(false);
  });

  it('is a no-op when documentId() returns empty', async () => {
    const fetchImpl = vi.fn();
    const job = createOcrJob({ documentId: () => '', fetchImpl });

    await job.checkStatus();

    expect(fetchImpl).not.toHaveBeenCalled();
    job.destroy();
  });

  it('handles 5xx ocr-status gracefully', async () => {
    const fetchImpl = makeFetch({
      'ocr-status': () => new Response('boom', { status: 500 })
    });
    expect(await runningAfterCheck(fetchImpl)).toBe(false);
  });

  it('handles network error gracefully', async () => {
    const fetchImpl = vi.fn(async () => {
      throw new Error('network');
    });
    expect(await runningAfterCheck(fetchImpl)).toBe(false);
  });
});
describe('createOcrJob — polling loop (short interval, real timers)', () => {
  const wait = (ms: number) => new Promise((r) => setTimeout(r, ms));

  /** Builds a new 200 JSON response each time it is called. */
  const json = (payload: unknown) =>
    new Response(JSON.stringify(payload), {
      status: 200,
      headers: { 'Content-Type': 'application/json' }
    });

  /** True for the POST that starts OCR on doc-1, false for job-status polls. */
  const isTriggerUrl = (href: string) =>
    href.includes('/api/documents/doc-1/ocr') && !href.includes('jobs');

  /** Fetch double: the trigger yields job-1, every other request is answered by `poll`. */
  const fetchAnswering = (poll: () => Response) =>
    vi.fn(async (url: RequestInfo | URL) =>
      isTriggerUrl(url.toString()) ? json({ jobId: 'job-1' }) : poll()
    );

  /** Fetch double whose job-1 poll keeps reporting RUNNING with the given progress code. */
  const runningFetch = (progressMessage: string) =>
    makeFetch({
      '/api/documents/doc-1/ocr': () => json({ jobId: 'job-1' }),
      '/api/ocr/jobs/job-1': () => json({ status: 'RUNNING', progressMessage })
    });

  it('updates progressMessage from translated job code', async () => {
    const job = createOcrJob({
      documentId: () => 'doc-1',
      fetchImpl: runningFetch('PREPARING'),
      pollIntervalMs: 20
    });

    await job.triggerOcr('KURRENT', false);
    await wait(60);

    expect(job.progressMessage).not.toBe('');
    job.destroy();
  });

  it('captures skippedPages from job result', async () => {
    const job = createOcrJob({
      documentId: () => 'doc-1',
      fetchImpl: runningFetch('SKIPPED:5'),
      pollIntervalMs: 20
    });

    await job.triggerOcr('KURRENT', false);
    await wait(60);

    expect(job.skippedPages).toBeGreaterThanOrEqual(0);
    job.destroy();
  });

  it('calls onJobFinished("DONE") when polling sees status=DONE', async () => {
    const onJobFinished = vi.fn().mockResolvedValue(undefined);
    const job = createOcrJob({
      documentId: () => 'doc-1',
      fetchImpl: fetchAnswering(() => json({ status: 'DONE', progressMessage: '' })),
      onJobFinished,
      pollIntervalMs: 20,
      resetDelayMs: 10
    });

    await job.triggerOcr('KURRENT', false);
    await wait(80);

    expect(onJobFinished).toHaveBeenCalledWith('DONE');
    job.destroy();
  });

  it('sets errorMessage and calls onJobFinished("FAILED") when polling sees status=FAILED', async () => {
    const onJobFinished = vi.fn().mockResolvedValue(undefined);
    const job = createOcrJob({
      documentId: () => 'doc-1',
      fetchImpl: fetchAnswering(() => json({ status: 'FAILED', progressMessage: '' })),
      onJobFinished,
      pollIntervalMs: 20,
      resetDelayMs: 10
    });

    await job.triggerOcr('KURRENT', false);
    await wait(80);

    expect(onJobFinished).toHaveBeenCalledWith('FAILED');
    expect(job.errorMessage).toBeTruthy();
    job.destroy();
  });

  it('ignores non-OK polling responses', async () => {
    const job = createOcrJob({
      documentId: () => 'doc-1',
      fetchImpl: fetchAnswering(() => new Response('boom', { status: 500 })),
      pollIntervalMs: 20
    });

    await job.triggerOcr('KURRENT', false);
    await wait(60);

    expect(job.running).toBe(true);
    job.destroy();
  });

  it('swallows polling fetch network errors', async () => {
    // Once the trigger has been answered, every later request fails at the network level.
    let triggered = false;
    const fetchImpl = vi.fn(async (url: RequestInfo | URL) => {
      if (isTriggerUrl(url.toString())) {
        triggered = true;
        return json({ jobId: 'job-1' });
      }
      if (!triggered) return new Response('', { status: 200 });
      throw new Error('network');
    });
    const job = createOcrJob({
      documentId: () => 'doc-1',
      fetchImpl,
      pollIntervalMs: 20
    });

    await job.triggerOcr('KURRENT', false);
    await wait(60);

    expect(job.running).toBe(true);
    job.destroy();
  });
});
describe('createOcrJob.destroy', () => {
  it('stops polling and is safe to call without an active job', () => {
    const idleJob = createOcrJob({ documentId: () => 'doc-1' });
    expect(() => idleJob.destroy()).not.toThrow();
  });

  it('stops the polling interval when called mid-poll', async () => {
    const pause = (ms: number) => new Promise((r) => setTimeout(r, ms));
    const jsonOk = (payload: unknown) =>
      new Response(JSON.stringify(payload), {
        status: 200,
        headers: { 'Content-Type': 'application/json' }
      });
    // The trigger request yields job-1; anything else is a poll that reports RUNNING.
    const fetchImpl = vi.fn(async (url: RequestInfo | URL) => {
      const href = url.toString();
      const isTrigger = href.includes('/api/documents/doc-1/ocr') && !href.includes('jobs');
      return isTrigger
        ? jsonOk({ jobId: 'job-1' })
        : jsonOk({ status: 'RUNNING', progressMessage: '' });
    });
    const job = createOcrJob({
      documentId: () => 'doc-1',
      fetchImpl,
      pollIntervalMs: 20
    });

    await job.triggerOcr('KURRENT', false);
    job.destroy();
    const callsAtDestroy = fetchImpl.mock.calls.length;
    await pause(80);

    // No additional fetch calls after destroy
    expect(fetchImpl.mock.calls.length).toBe(callsAtDestroy);
  });
});

View File

@@ -0,0 +1,144 @@
import { m } from '$lib/paraglide/messages.js';
import { getErrorMessage } from '$lib/shared/errors';
import { translateOcrProgress } from '$lib/ocr/translateOcrProgress';
/** Dependencies and tuning knobs accepted by `createOcrJob`. */
export interface OcrJobOptions {
/**
 * Returns the id of the document to operate on. It is called on every
 * `triggerOcr` / `checkStatus` invocation; `checkStatus` does nothing when it
 * returns an empty string.
 */
documentId: () => string;
/** Fetch implementation used for all requests — falls back to the global `fetch` when omitted. */
fetchImpl?: typeof fetch;
/**
 * Invoked (and awaited) once a poll reports the job as `DONE` or `FAILED`,
 * receiving that final status.
 */
onJobFinished?: (status: 'DONE' | 'FAILED') => void | Promise<void>;
/** Polling interval in ms — defaults to 2000. Tests pass a small value. */
pollIntervalMs?: number;
/** Reset delay in ms after DONE/FAILED before clearing UI state. Defaults to 1000. */
resetDelayMs?: number;
}
/** Handle returned by `createOcrJob`: read-only job state plus the actions that drive it. */
export interface OcrJobController {
/** True from the moment a job is triggered or found in progress until it fails to start or the post-finish reset runs. */
readonly running: boolean;
/** Message derived from the latest poll via `translateOcrProgress`; empty when there is none. */
readonly progressMessage: string;
/** Set when triggering fails or a poll reports `FAILED`; cleared at the start of `triggerOcr`. */
readonly errorMessage: string;
/** Skipped-page count taken from the latest poll that reported one; 0 otherwise. */
readonly skippedPages: number;
/** POSTs `{ scriptType, useExistingAnnotations }` to the document's OCR endpoint and starts polling on success. Never rejects; failures surface via `errorMessage`. */
triggerOcr(scriptType: string, useExistingAnnotations: boolean): Promise<void>;
/** Queries the document's `ocr-status` endpoint and starts polling if a job is `PENDING` or `RUNNING`. Best-effort: errors are ignored. */
checkStatus(): Promise<void>;
/** Stops the polling interval. */
destroy(): void;
}
// Fallback for `OcrJobOptions.pollIntervalMs`: milliseconds between job-status polls.
const DEFAULT_POLL_INTERVAL_MS = 2000;
// Fallback for `OcrJobOptions.resetDelayMs`: milliseconds the final state stays visible after DONE/FAILED.
const DEFAULT_RESET_DELAY_MS = 1000;
/**
 * Creates a controller for one document's OCR job: it can start a job, pick up a job
 * that is already in progress, and poll the job until it reports `DONE` or `FAILED`.
 *
 * State is exposed through reactive getters (`running`, `progressMessage`,
 * `errorMessage`, `skippedPages`). No method rejects; failures surface via
 * `errorMessage` or are ignored where marked best-effort.
 *
 * Timer handling:
 * - At most one poll request is in flight at a time; interval ticks that fire while a
 *   request is still pending are skipped.
 * - A poll response that arrives after `destroy()` or after polling was restarted is
 *   discarded, so it can neither touch state nor fire `onJobFinished`.
 * - The delayed UI reset scheduled after a job finishes is applied immediately when a
 *   new job starts or `destroy()` is called, so it can never fire later and flip
 *   `running` back to false underneath a newer job.
 *
 * @param options Injected dependencies and timing overrides.
 * @returns The controller; call `destroy()` when it is no longer needed.
 */
export function createOcrJob(options: OcrJobOptions): OcrJobController {
  const { documentId, onJobFinished } = options;
  const fetchImpl = options.fetchImpl ?? fetch;
  const pollIntervalMs = options.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
  const resetDelayMs = options.resetDelayMs ?? DEFAULT_RESET_DELAY_MS;

  let running = $state(false);
  let progressMessage = $state('');
  let errorMessage = $state('');
  let skippedPages = $state(0);

  let pollTimer: ReturnType<typeof setInterval> | null = null;
  let resetTimer: ReturnType<typeof setTimeout> | null = null;
  // Bumped whenever polling is restarted or torn down; a poll holding an older value is stale.
  let pollSession = 0;

  /** Stops the polling interval, if one is active. */
  function clearPolling(): void {
    if (pollTimer !== null) {
      clearInterval(pollTimer);
      pollTimer = null;
    }
  }

  /** Clears the per-job UI state shown while a job runs. */
  function resetUiState(): void {
    running = false;
    progressMessage = '';
    skippedPages = 0;
  }

  /** Applies a still-pending post-finish reset right away instead of letting it fire later. */
  function flushPendingReset(): void {
    if (resetTimer === null) return;
    clearTimeout(resetTimer);
    resetTimer = null;
    resetUiState();
  }

  /** Marks the controller as running and (re)starts the polling interval for `jobId`. */
  function startPolling(jobId: string): void {
    clearPolling();
    // A reset left over from a previous job must not hit the job we are about to track.
    flushPendingReset();
    running = true;
    pollSession += 1;
    const session = pollSession;
    let inFlight = false;
    pollTimer = setInterval(() => {
      // Skip the tick rather than stack requests when the previous poll is slower than the interval.
      if (inFlight) return;
      inFlight = true;
      void pollOnce(jobId, session).finally(() => {
        inFlight = false;
      });
    }, pollIntervalMs);
  }

  /** Fetches the job once and folds the result into state; never rejects. */
  async function pollOnce(jobId: string, session: number): Promise<void> {
    try {
      const res = await fetchImpl(`/api/ocr/jobs/${jobId}`);
      if (session !== pollSession || !res.ok) return;
      const job = (await res.json()) as { status: string; progressMessage?: string };
      // destroy() or a restart may have happened while the body was being read.
      if (session !== pollSession) return;

      const progress = translateOcrProgress(job.progressMessage ?? '');
      progressMessage = progress.message;
      if (progress.skippedPages !== undefined) {
        skippedPages = progress.skippedPages;
      }

      const { status } = job;
      if (status !== 'DONE' && status !== 'FAILED') return;

      clearPolling();
      if (resetTimer !== null) clearTimeout(resetTimer);
      // Keep the final progress visible for resetDelayMs before clearing it.
      resetTimer = setTimeout(() => {
        resetTimer = null;
        resetUiState();
      }, resetDelayMs);
      if (status === 'FAILED') {
        errorMessage = m.ocr_status_error();
      }
      await onJobFinished?.(status);
    } catch {
      // polling is best-effort: a failed request is simply retried on the next tick
    }
  }

  /** Starts an OCR job for the current document and begins polling it. */
  async function triggerOcr(scriptType: string, useExistingAnnotations: boolean): Promise<void> {
    flushPendingReset();
    running = true;
    errorMessage = '';
    try {
      const res = await fetchImpl(`/api/documents/${documentId()}/ocr`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ scriptType, useExistingAnnotations })
      });
      if (!res.ok) {
        running = false;
        // The error body is optional and may not be JSON at all.
        const body: unknown = await res.json().catch(() => null);
        const code = (body as { code?: string } | null)?.code;
        errorMessage = code ? getErrorMessage(code) : m.ocr_status_error();
        return;
      }
      const data = (await res.json()) as { jobId?: unknown } | null;
      const jobId = data?.jobId;
      if (typeof jobId !== 'string' || jobId === '') {
        // Without a job id there is nothing to poll; report it instead of polling ".../jobs/undefined".
        running = false;
        errorMessage = m.ocr_status_error();
        return;
      }
      startPolling(jobId);
    } catch {
      running = false;
      errorMessage = m.ocr_status_error();
    }
  }

  /** Resumes polling if the backend reports a PENDING/RUNNING job for the current document. */
  async function checkStatus(): Promise<void> {
    const id = documentId();
    if (!id) return;
    try {
      const res = await fetchImpl(`/api/documents/${id}/ocr-status`);
      if (!res.ok) return;
      const status = (await res.json()) as { status: string; jobId: string | null };
      // The answer belongs to `id`; drop it if the controller now points at another document.
      if (documentId() !== id) return;
      if ((status.status === 'PENDING' || status.status === 'RUNNING') && status.jobId) {
        startPolling(status.jobId);
      }
    } catch {
      // best-effort
    }
  }

  /** Stops polling, invalidates in-flight polls and settles any pending UI reset. */
  function destroy(): void {
    clearPolling();
    pollSession += 1;
    flushPendingReset();
  }

  return {
    get running() {
      return running;
    },
    get progressMessage() {
      return progressMessage;
    },
    get errorMessage() {
      return errorMessage;
    },
    get skippedPages() {
      return skippedPages;
    },
    triggerOcr,
    checkStatus,
    destroy
  };
}