refactor(document): move document domain core to document/ package

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-05 12:39:20 +02:00
parent bb7d872a61
commit e85057bed2
2371 changed files with 385726 additions and 1971 deletions
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/init.py
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/init.py
@@ -0,0 +1,15 @@
+"""SpellChecker Module"""
+
+from spellchecker.info import (  # noqa: F401
+    __author__,
+    __bugtrack_url__,
+    __credits__,
+    __email__,
+    __license__,  # noqa: F401
+    __maintainer__,
+    __url__,
+    __version__,
+)
+from spellchecker.spellchecker import SpellChecker, WordFrequency
+
+__all__ = ["SpellChecker", "WordFrequency"]
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/pycache/init.cpython-312.pyc
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/pycache/init.cpython-312.pyc
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/pycache/info.cpython-312.pyc
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/pycache/info.cpython-312.pyc
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/pycache/spellchecker.cpython-312.pyc
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/pycache/spellchecker.cpython-312.pyc
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/pycache/utils.cpython-312.pyc
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/pycache/utils.cpython-312.pyc
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/info.py
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/info.py
@@ -0,0 +1,10 @@
+"""SpellChecker Information"""
+
+# Package metadata; these names are re-exported by the package's top-level module.
+__author__ = "Tyler Barrus"
+__maintainer__ = "Tyler Barrus"
+__email__ = "barrust@gmail.com"
+__license__ = "MIT"
+# Dotted numeric version string.
+__version__ = "0.9.0"
+__credits__ = ["Peter Norvig"]
+__url__ = "https://github.com/barrust/pyspellchecker"
+# Derived from __url__ so the two cannot drift apart.
+__bugtrack_url__ = f"{__url__}/issues"
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/py.typed
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/py.typed
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/ar.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/ar.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/de.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/de.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/en.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/en.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/es.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/es.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/eu.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/eu.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/fa.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/fa.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/fr.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/fr.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/it.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/it.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/lv.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/lv.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/nl.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/nl.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/pt.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/pt.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/ru.json.gz
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/resources/ru.json.gz
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/spellchecker.py
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/spellchecker.py
@@ -0,0 +1,547 @@
+"""SpellChecker Module; simple, intuitive spell checker based on the post by
+Peter Norvig. See: https://norvig.com/spell-correct.html"""
+
+import gzip
+import json
+import pkgutil
+import string
+import typing
+import unicodedata
+from collections import Counter
+from collections.abc import Iterable
+
+from spellchecker.utils import KeyT, PathOrStr, _parse_into_words, ensure_unicode, load_file, write_file
+
+
+class SpellChecker:
+    """The SpellChecker class encapsulates the basics needed to accomplish a
+    simple spell checking algorithm. It is based on the work by
+    Peter Norvig (https://norvig.com/spell-correct.html)
+
+    Args:
+        language (str): The language of the dictionary to load or None for no dictionary. Supported languages are \
+            `en`, `es`, `it`, `de`, `fr`, `pt`, `ru`, `lv`, `eu`, `nl` and `fa`. Defaults to `en`. A list of \
+            languages may be provided and all languages will be loaded.
+        local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
+            will be loaded
+        distance (int): The edit distance to use. Defaults to 2.
+        case_sensitive (bool): Flag to use a case sensitive dictionary or not, only available when not using a \
+            language dictionary.
+    Note:
+        Using a case sensitive dictionary can be slow to correct words.
+    Raises:
+        ValueError: If the provided language dictionary does not exist, if case_sensitive is True with a language \
+            dictionary, or if both language and local_dictionary are specified."""
+
+    __slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]
+
+    def __init__(
+        self,
+        language: str | typing.Iterable[str] | None = "en",
+        local_dictionary: PathOrStr | None = None,
+        distance: int = 2,
+        tokenizer: typing.Callable[[str], typing.Iterable[str]] | None = None,
+        case_sensitive: bool = False,
+    ) -> None:
+        self._distance = 2  # default
+        self.distance = distance  # use the setter value check
+
+        if tokenizer:
+            self._tokenizer = tokenizer
+        else:
+            self._tokenizer = _parse_into_words
+
+        if language is not None and local_dictionary is not None:
+            raise ValueError("Cannot specify both 'language' and 'local_dictionary'. Choose one.")
+
+        if language is not None and case_sensitive:
+            raise ValueError("case_sensitive can only be True when not using a language dictionary.")
+
+        self._case_sensitive = case_sensitive if not language else False
+        self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)
+
+        if local_dictionary:
+            self._word_frequency.load_dictionary(local_dictionary)
+        elif language:
+            if not isinstance(language, Iterable) or isinstance(language, KeyT):
+                language = [language]
+            for lang in language:
+                filename = f"resources/{lang.lower()}.json.gz"
+                try:
+                    json_open = pkgutil.get_data("spellchecker", filename)
+                except FileNotFoundError as exc:
+                    msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
+                    raise ValueError(msg) from exc
+                if json_open:
+                    lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
+                self._word_frequency.load_json(lang_dict)
+
+    def __contains__(self, key: KeyT) -> bool:
+        """setup easier known checks"""
+        key = ensure_unicode(key)
+        return key in self._word_frequency
+
+    def __getitem__(self, key: KeyT) -> int:
+        """setup easier frequency checks"""
+        key = ensure_unicode(key)
+        return self._word_frequency[key]
+
+    def __iter__(self) -> typing.Generator[str, None, None]:
+        """setup iter support"""
+        yield from self._word_frequency.dictionary
+
+    @classmethod
+    def languages(cls) -> typing.Iterable[str]:
+        """list: A list of all official languages supported by the library"""
+        return ["en", "es", "fr", "it", "pt", "de", "ru", "ar", "lv", "eu", "nl", "fa"]
+
+    @property
+    def word_frequency(self) -> "WordFrequency":
+        """WordFrequency: An encapsulation of the word frequency `dictionary`
+
+        Note:
+            Not settable"""
+        return self._word_frequency
+
+    @property
+    def distance(self) -> int:
+        """int: The maximum edit distance to calculate
+
+        Note:
+            Valid values are 1 or 2; if an invalid value is passed, defaults to 2"""
+        return self._distance
+
+    @distance.setter
+    def distance(self, val: int) -> None:
+        """set the distance parameter"""
+        tmp = 2
+        try:
+            if 0 < int(val) <= 2:
+                tmp = val
+        except (ValueError, TypeError):
+            pass
+        self._distance = tmp
+
+    def split_words(self, text: KeyT) -> typing.Iterable[str]:
+        """Split text into individual `words` using either a simple whitespace
+        regex or the passed in tokenizer
+
+        Args:
+            text (str): The text to split into individual words
+        Returns:
+            list(str): A listing of all words in the provided text"""
+        text = ensure_unicode(text)
+        return self._tokenizer(text)
+
+    def export(self, filepath: PathOrStr, encoding: str = "utf-8", gzipped: bool = True) -> None:
+        """Export the word frequency list for import in the future
+
+        Args:
+           filepath (str): The filepath to the exported dictionary
+           encoding (str): The encoding of the resulting output
+           gzipped (bool): Whether to gzip the dictionary or not"""
+        data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
+        write_file(filepath, encoding, gzipped, data)
+
+    def word_usage_frequency(self, word: KeyT, total_words: int | None = None) -> float:
+        """Calculate the frequency to the `word` provided as seen across the
+        entire dictionary
+
+        Args:
+            word (str): The word for which the word probability is calculated
+            total_words (int): The total number of words to use in the calculation; \
+                use the default for using the whole word frequency
+        Returns:
+            float: The probability that the word is the correct word"""
+        if not total_words:
+            total_words = self._word_frequency.total_words
+        word = ensure_unicode(word)
+        return self._word_frequency.dictionary[word] / total_words
+
+    def correction(self, word: KeyT) -> str | None:
+        """The most probable correct spelling for the word
+
+        Args:
+            word (str): The word to correct
+        Returns:
+            str: The most likely candidate or None if no correction is present"""
+        word = ensure_unicode(word)
+        candidates = self.candidates(word)
+        if not candidates:
+            return None
+        # Prefer exact matches with incorrect diacritics
+        word_no_accents = self._remove_diacritics(word)
+        diacritics_candidates = [c for c in candidates if self._remove_diacritics(c) == word_no_accents]
+        if diacritics_candidates:
+            return max(diacritics_candidates, key=self.__getitem__)
+        return max(candidates, key=self.__getitem__)
+
+    def candidates(self, word: KeyT) -> set[str] | None:
+        """Generate possible spelling corrections for the provided word up to
+        an edit distance of two, if and only when needed
+
+        Args:
+            word (str): The word for which to calculate candidate spellings
+        Returns:
+            set: The set of words that are possible candidates or None if there are no candidates"""
+        word = ensure_unicode(word)
+        if self.known([word]):  # short-cut if word is correct already
+            return {word}
+
+        if not self._check_if_should_check(word):
+            return {word}
+
+        # get edit distance 1...
+        res = list(self.edit_distance_1(word))
+        tmp = self.known(res)
+        if tmp:
+            return tmp
+        # if still not found, use the edit distance 1 to calc edit distance 2
+        if self._distance == 2:
+            tmp = self.known(list(self.__edit_distance_alt(res)))
+            if tmp:
+                return tmp
+        return None
+
+    def known(self, words: typing.Iterable[KeyT]) -> set[str]:
+        """The subset of `words` that appear in the dictionary of words
+
+        Args:
+            words (list): List of words to determine which are in the corpus
+        Returns:
+            set: The set of those words from the input that are in the corpus"""
+        tmp_words = [ensure_unicode(w) for w in words]
+        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words]
+        return {w for w in tmp if w in self._word_frequency.dictionary and self._check_if_should_check(w)}
+
+    def unknown(self, words: typing.Iterable[KeyT]) -> set[str]:
+        """The subset of `words` that do not appear in the dictionary
+
+        Args:
+            words (list): List of words to determine which are not in the corpus
+        Returns:
+            set: The set of those words from the input that are not in the corpus"""
+        tmp_words = [ensure_unicode(w) for w in words]
+        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
+        return {w for w in tmp if w not in self._word_frequency.dictionary}
+
+    def edit_distance_1(self, word: KeyT) -> set[str]:
+        """Compute all strings that are one edit away from `word` using only
+        the letters in the corpus
+
+        Args:
+            word (str): The word for which to calculate the edit distance
+        Returns:
+            set: The set of strings that are edit distance one from the provided word"""
+        tmp_word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
+        if self._check_if_should_check(tmp_word) is False:
+            return {tmp_word}
+        letters = self._word_frequency.letters
+        splits = [(tmp_word[:i], tmp_word[i:]) for i in range(len(tmp_word) + 1)]
+        deletes = [L + R[1:] for L, R in splits if R]
+        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+        inserts = [L + c + R for L, R in splits for c in letters]
+        return set(deletes + transposes + replaces + inserts)
+
+    def edit_distance_2(self, word: KeyT) -> list[str]:
+        """Compute all strings that are two edits away from `word` using only
+        the letters in the corpus
+
+        Args:
+            word (str): The word for which to calculate the edit distance
+        Returns:
+            set: The set of strings that are edit distance two from the provided word"""
+        word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
+        return [e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)]
+
+    def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> list[str]:
+        """Compute all strings that are 1 edits away from all the words using
+        only the letters in the corpus
+
+        Args:
+            words (list): The words for which to calculate the edit distance
+        Returns:
+            set: The set of strings that are edit distance two from the provided words"""
+        tmp_words = [ensure_unicode(w) for w in words]
+        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
+        return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]
+
+    def _remove_diacritics(self, input_str: KeyT) -> str:
+        """Remove diacritics from the input string
+
+        Args:
+            input_str (str): The string from which to remove diacritics
+        Returns:
+            str: The string with diacritics removed"""
+        nfkd_form = unicodedata.normalize("NFKD", ensure_unicode(input_str))
+        return "".join([c for c in nfkd_form if not unicodedata.combining(c)])
+
+    def _check_if_should_check(self, word: str) -> bool:
+        if len(word) == 1 and word in string.punctuation:
+            return False
+        if len(word) > self._word_frequency.longest_word_length + 3:  # allow removal of up to 2 letters
+            return False
+        if word.lower() == "nan":  # nan passes the float(word) so this will bypass that issue (#125)
+            return True
+        try:  # check if it is a number (int, float, etc)
+            float(word)
+            return False
+        except ValueError:
+            pass
+
+        return True
+
+
+class WordFrequency:
+    """Store the `dictionary` as a word frequency list while allowing for
+    different methods to load the data and update over time"""
+
+    __slots__ = [
+        "_dictionary",
+        "_total_words",
+        "_unique_words",
+        "_letters",
+        "_tokenizer",
+        "_case_sensitive",
+        "_longest_word_length",
+    ]
+
+    def __init__(
+        self,
+        tokenizer: typing.Callable[[str], typing.Iterable[str]] | None = None,
+        case_sensitive: bool = False,
+    ) -> None:
+        self._dictionary: typing.Counter = Counter()
+        self._total_words = 0
+        self._unique_words = 0
+        self._letters: set[str] = set()
+        self._case_sensitive = case_sensitive
+        self._longest_word_length = 0
+
+        self._tokenizer = _parse_into_words
+        if tokenizer is not None:
+            self._tokenizer = tokenizer  # type:  ignore
+
+    def __contains__(self, key: KeyT) -> bool:
+        """turn on contains"""
+        key = ensure_unicode(key)
+        key = key if self._case_sensitive else key.lower()
+        return key in self._dictionary
+
+    def __getitem__(self, key: KeyT) -> int:
+        """turn on getitem"""
+        key = ensure_unicode(key)
+        key = key if self._case_sensitive else key.lower()
+        return self._dictionary[key]
+
+    def __iter__(self) -> typing.Generator[str, None, None]:
+        """turn on iter support"""
+        yield from self._dictionary
+
+    def pop(self, key: KeyT, default: int | None = None) -> int | None:
+        """Remove the key and return the associated value or default if not
+        found
+
+        Args:
+            key (str): The key to remove
+            default (obj): The value to return if key is not present
+        Returns:
+            int | None: Returns the number of instances of key, or None if not in the dictionary"""
+        key = ensure_unicode(key)
+        return self._dictionary.pop(key if self._case_sensitive else key.lower(), default)
+
+    @property
+    def dictionary(self) -> dict[str, int]:
+        """Counter: A counting dictionary of all words in the corpus and the number
+        of times each has been seen
+
+        Note:
+            Not settable"""
+        return self._dictionary
+
+    @property
+    def total_words(self) -> int:
+        """int: The sum of all word occurrences in the word frequency dictionary
+
+        Note:
+            Not settable"""
+        return self._total_words
+
+    @property
+    def unique_words(self) -> int:
+        """int: The total number of unique words in the word frequency list
+
+        Note:
+            Not settable"""
+        return self._unique_words
+
+    @property
+    def letters(self) -> set[str]:
+        """set: The listing of all letters found within the corpus
+
+        Note:
+            Not settable"""
+        return self._letters
+
+    @property
+    def longest_word_length(self) -> int:
+        """int: The longest word length in the dictionary
+
+        Note:
+            Not settable"""
+        return self._longest_word_length
+
+    def tokenize(self, text: KeyT) -> typing.Iterator[str]:
+        """Tokenize the provided string object into individual words
+
+        Args:
+            text (str): The string object to tokenize
+        Yields:
+            str: The next `word` in the tokenized string
+        Note:
+            This is the same as the `spellchecker.split_words()` unless a tokenizer function was provided."""
+        tmp_text = ensure_unicode(text)
+        for word in self._tokenizer(tmp_text):
+            yield word if self._case_sensitive else word.lower()
+
+    def keys(self) -> typing.Iterator[str]:
+        """Iterator over the key of the dictionary
+
+        Yields:
+            str: The next key in the dictionary
+        Note:
+            This is the same as `spellchecker.words()`"""
+        yield from self._dictionary.keys()
+
+    def words(self) -> typing.Iterator[str]:
+        """Iterator over the words in the dictionary
+
+        Yields:
+            str: The next word in the dictionary
+        Note:
+            This is the same as `spellchecker.keys()`"""
+        yield from self._dictionary.keys()
+
+    def items(self) -> typing.Generator[tuple[str, int], None, None]:
+        """Iterator over the words in the dictionary
+
+        Yields:
+            str: The next word in the dictionary
+            int: The number of instances in the dictionary
+        Note:
+            This is the same as `dict.items()`"""
+        yield from self._dictionary.items()
+
+    def load_dictionary(self, filename: PathOrStr, encoding: str = "utf-8") -> None:
+        """Load in a pre-built word frequency list
+
+        Args:
+            filename (str): The filepath to the json (optionally gzipped) file to be loaded
+            encoding (str): The encoding of the dictionary"""
+        with load_file(filename, encoding) as data:
+            data = data if self._case_sensitive else data.lower()
+            self._dictionary.update(json.loads(data))
+            self._update_dictionary()
+
+    def load_json(self, data: dict[str, int]) -> None:
+        """Load in a pre-built word frequency list
+
+        Args:
+            data (dict): The dictionary to be loaded"""
+        self._dictionary.update(data)
+        self._update_dictionary()
+
+    def load_text_file(
+        self,
+        filename: PathOrStr,
+        encoding: str = "utf-8",
+        tokenizer: typing.Callable[[str], typing.Iterable[str]] | None = None,
+    ) -> None:
+        """Load in a text file from which to generate a word frequency list
+
+        Args:
+            filename (str): The filepath to the text file to be loaded
+            encoding (str): The encoding of the text file
+            tokenizer (function): The function to use to tokenize a string
+        """
+        with load_file(filename, encoding=encoding) as data:
+            self.load_text(data, tokenizer)
+
+    def load_text(
+        self,
+        text: KeyT,
+        tokenizer: typing.Callable[[str], typing.Iterable[str]] | None = None,
+    ) -> None:
+        """Load text from which to generate a word frequency list
+
+        Args:
+            text (str): The text to be loaded
+            tokenizer (function): The function to use to tokenize a string
+        """
+        text = ensure_unicode(text)
+        if tokenizer:
+            words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]
+        else:
+            words = self.tokenize(text)  # type: ignore[assignment]
+
+        self._dictionary.update(words)
+        self._update_dictionary()
+
+    def load_words(self, words: typing.Iterable[KeyT]) -> None:
+        """Load a list of words from which to generate a word frequency list
+
+        Args:
+            words (list): The list of words to be loaded"""
+        words = [ensure_unicode(w) for w in words]
+        self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
+        self._update_dictionary()
+
+    def add(self, word: KeyT, val: int = 1) -> None:
+        """Add a word to the word frequency list
+
+        Args:
+            word (str): The word to add
+            val (int): The number of times to insert the word"""
+        word = ensure_unicode(word)
+        self.load_json({word if self._case_sensitive else word.lower(): val})
+
+    def remove_words(self, words: typing.Iterable[KeyT]) -> None:
+        """Remove a list of words from the word frequency list
+
+        Args:
+            words (list): The list of words to remove"""
+        words = [ensure_unicode(w) for w in words]
+        for word in words:
+            self.pop(word)
+        self._update_dictionary()
+
+    def remove(self, word: KeyT) -> None:
+        """Remove a word from the word frequency list
+
+        Args:
+            word (str): The word to remove"""
+        self.pop(word)
+        self._update_dictionary()
+
+    def remove_by_threshold(self, threshold: int = 5) -> None:
+        """Remove all words at, or below, the provided threshold
+
+        Args:
+            threshold (int): The threshold at which a word is to be removed"""
+        to_remove = [k for k, v in self._dictionary.items() if v <= threshold]
+        self.remove_words(to_remove)
+
+    def _update_dictionary(self) -> None:
+        """Update the word frequency object"""
+        if not self._dictionary:
+            self._longest_word_length = 0
+            self._total_words = 0
+            self._unique_words = 0
+            self._letters = set()
+            return
+        keys = self._dictionary.keys()
+        self._longest_word_length = max(map(len, keys))
+        self._total_words = sum(self._dictionary.values())
+        self._unique_words = len(keys)
+        self._letters = set().union(*keys)
--- a/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/utils.py
+++ b/ocr-service/.venv/lib/python3.12/site-packages/spellchecker/utils.py
@@ -0,0 +1,147 @@
+"""Additional utility functions"""
+
+import contextlib
+import functools
+import gzip
+import re
+import typing
+import warnings
+from pathlib import Path
+
+from spellchecker.info import __version__
+
+# Text input accepted by the public API; bytes are decoded by `ensure_unicode`.
+KeyT = str | bytes
+# A filesystem location given either as a `pathlib.Path` or as a plain string.
+PathOrStr = Path | str
+
+
+def fail_after(version: str) -> typing.Callable:
+    """Decorator to add to tests to ensure that they fail if a deprecated
+    feature is not removed before the specified version
+
+    Args:
+        version (str): The version to check against"""
+
+    def decorator_wrapper(func):
+        @functools.wraps(func)
+        def test_inner(*args, **kwargs):
+            if [int(x) for x in version.split(".")] <= [int(x) for x in __version__.split(".")]:
+                msg = (
+                    f"The function {func.__name__} must be fully removed as it is deprecated"
+                    f" and must be removed by version {version}"
+                )
+                raise AssertionError(msg)
+            return func(*args, **kwargs)
+
+        return test_inner
+
+    return decorator_wrapper
+
+
+def deprecated(message: str = "") -> typing.Callable:
+    """A simplistic decorator to mark functions as deprecated. The function
+    will pass a message to the user on the first use of the function
+
+    Args:
+        message (str): The message to display if the function is deprecated
+    """
+
+    def decorator_wrapper(func):
+        @functools.wraps(func)
+        def function_wrapper(*args, **kwargs):
+            func_name = func.__name__
+            if func_name not in function_wrapper.deprecated_items:
+                msg = f"Function {func.__name__} is now deprecated! {message}"
+                warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
+                function_wrapper.deprecated_items.add(func_name)
+
+            return func(*args, **kwargs)
+
+        # set this up the first time the decorator is called
+        function_wrapper.deprecated_items = set()
+
+        return function_wrapper
+
+    return decorator_wrapper
+
+
+def ensure_unicode(value: KeyT, encoding: str = "utf-8") -> str:
+    """Simplify checking if passed in data are bytes or a string and decode
+    bytes into unicode.
+
+    Args:
+        value (str): The input string (possibly bytes)
+        encoding (str): The encoding to use if input is bytes
+    Returns:
+        str: The encoded string
+    """
+    if isinstance(value, bytes):
+        return value.decode(encoding)
+    elif isinstance(value, list):
+        raise TypeError(f"The provided value {value} is not of type str or bytes")
+    return value
+
+
+@contextlib.contextmanager
+def __gzip_read(filename: PathOrStr, mode: str = "rb", encoding: str = "UTF-8") -> typing.Generator[KeyT, None, None]:
+    """Context manager to correctly handle the decoding of the output of the gzip file
+
+    Args:
+        filename (str): The filename to open
+        mode (str): The mode to read the data
+        encoding (str): The file encoding to use
+    Yields:
+        str: The string data from the gzip file read
+    """
+    with gzip.open(filename, mode=mode, encoding=encoding) as fobj:
+        yield fobj.read()
+
+
+@contextlib.contextmanager
+def load_file(filename: PathOrStr, encoding: str) -> typing.Generator[KeyT, None, None]:
+    """Context manager to handle opening a gzip or text file correctly and
+    reading all the data
+
+    Args:
+        filename (str): The filename to open
+        encoding (str): The file encoding to use
+    Yields:
+        str: The string data from the file read
+    """
+    if isinstance(filename, Path):
+        filename = str(filename)
+
+    if filename[-3:].lower() == ".gz":
+        with __gzip_read(filename, mode="rt", encoding=encoding) as data:
+            yield data
+    else:
+        with open(filename, encoding=encoding) as fobj:
+            yield fobj.read()
+
+
+def write_file(filepath: PathOrStr, encoding: str, gzipped: bool, data: str) -> None:
+    """Write the data to file either as a gzip file or text based on the
+    gzipped parameter
+
+    Args:
+        filepath (str): The filename to open
+        encoding (str): The file encoding to use
+        gzipped (bool): Whether the file should be gzipped or not
+        data (str): The data to be written out
+    """
+    if gzipped:
+        with gzip.open(filepath, "wt") as fobj:
+            fobj.write(data)
+    else:
+        with open(filepath, "w", encoding=encoding) as fobj:
+            fobj.write(data)
+
+
+def _parse_into_words(text: str) -> typing.Iterable[str]:
+    """Parse the text into words; currently removes punctuation except for
+    apostrophizes.
+
+    Args:
+        text (str): The text to split into words
+    """
+    # see: https://stackoverflow.com/a/12705513
+    return re.findall(r"(\w[\w']*\w|\w)", text)