refactor(document): move document domain core to document/ package
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
"""SpellChecker Module"""
|
||||
|
||||
from spellchecker.info import ( # noqa: F401
|
||||
__author__,
|
||||
__bugtrack_url__,
|
||||
__credits__,
|
||||
__email__,
|
||||
__license__, # noqa: F401
|
||||
__maintainer__,
|
||||
__url__,
|
||||
__version__,
|
||||
)
|
||||
from spellchecker.spellchecker import SpellChecker, WordFrequency
|
||||
|
||||
# Public names re-exported at package level (both are imported from spellchecker.spellchecker above).
__all__ = ["SpellChecker", "WordFrequency"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
|
||||
"""SpellChecker Information"""
|
||||
|
||||
# --- Release identity ---
__version__ = "0.9.0"
__license__ = "MIT"

# --- People ---
__author__ = "Tyler Barrus"
__maintainer__ = "Tyler Barrus"
__email__ = "barrust@gmail.com"
__credits__ = ["Peter Norvig"]

# --- Project links; the bug tracker URL is derived from the project URL ---
__url__ = "https://github.com/barrust/pyspellchecker"
__bugtrack_url__ = f"{__url__}/issues"
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,547 @@
|
||||
"""SpellChecker Module; simple, intuitive spell checker based on the post by
|
||||
Peter Norvig. See: https://norvig.com/spell-correct.html"""
|
||||
|
||||
import gzip
|
||||
import json
|
||||
import pkgutil
|
||||
import string
|
||||
import typing
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from collections.abc import Iterable
|
||||
|
||||
from spellchecker.utils import KeyT, PathOrStr, _parse_into_words, ensure_unicode, load_file, write_file
|
||||
|
||||
|
||||
class SpellChecker:
    """The SpellChecker class encapsulates the basics needed to accomplish a
    simple spell checking algorithm. It is based on the work by
    Peter Norvig (https://norvig.com/spell-correct.html)

    Args:
        language (str): The language of the dictionary to load or None for no dictionary. Supported languages
            are `en`, `es`, `fr`, `it`, `pt`, `de`, `ru`, `ar`, `lv`, `eu`, `nl` and `fa`. Defaults to `en`.
            A list of languages may be provided and all languages will be loaded.
        local_dictionary (str): The path to a locally stored word frequency dictionary; `language` must be
            explicitly set to None when this is provided
        distance (int): The edit distance to use. Defaults to 2.
        tokenizer (function): Optional callable used to split text into words; when omitted the module's
            default word parser is used
        case_sensitive (bool): Flag to use a case sensitive dictionary or not, only available when not using a
            language dictionary.
    Note:
        Using a case sensitive dictionary can be slow to correct words.
    Raises:
        ValueError: If the provided language dictionary does not exist, if case_sensitive is True with a language
            dictionary, or if both language and local_dictionary are specified."""

    __slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]

    def __init__(
        self,
        language: str | typing.Iterable[str] | None = "en",
        local_dictionary: PathOrStr | None = None,
        distance: int = 2,
        tokenizer: typing.Callable[[str], typing.Iterable[str]] | None = None,
        case_sensitive: bool = False,
    ) -> None:
        self._distance = 2  # default
        self.distance = distance  # use the setter value check

        if tokenizer:
            self._tokenizer = tokenizer
        else:
            self._tokenizer = _parse_into_words

        if language is not None and local_dictionary is not None:
            raise ValueError("Cannot specify both 'language' and 'local_dictionary'. Choose one.")

        if language is not None and case_sensitive:
            raise ValueError("case_sensitive can only be True when not using a language dictionary.")

        self._case_sensitive = case_sensitive if not language else False
        self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)

        if local_dictionary:
            self._word_frequency.load_dictionary(local_dictionary)
        elif language:
            # a single language code (str or bytes) is wrapped so one loop handles both forms
            if not isinstance(language, Iterable) or isinstance(language, KeyT):
                language = [language]
            for lang in language:
                # decode bytes codes first so the resource name is not built from a bytes repr
                lang_code = ensure_unicode(lang).lower()
                filename = f"resources/{lang_code}.json.gz"
                try:
                    json_open = pkgutil.get_data("spellchecker", filename)
                except FileNotFoundError as exc:
                    msg = f"The provided dictionary language ({lang_code}) does not exist!"
                    raise ValueError(msg) from exc
                # nothing is loaded when get_data returns no data (None or empty bytes)
                if json_open:
                    lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
                    self._word_frequency.load_json(lang_dict)

    def __contains__(self, key: KeyT) -> bool:
        """setup easier known checks"""
        key = ensure_unicode(key)
        return key in self._word_frequency

    def __getitem__(self, key: KeyT) -> int:
        """setup easier frequency checks"""
        key = ensure_unicode(key)
        return self._word_frequency[key]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """setup iter support"""
        yield from self._word_frequency.dictionary

    @classmethod
    def languages(cls) -> typing.Iterable[str]:
        """list: A list of all official languages supported by the library"""
        return ["en", "es", "fr", "it", "pt", "de", "ru", "ar", "lv", "eu", "nl", "fa"]

    @property
    def word_frequency(self) -> "WordFrequency":
        """WordFrequency: An encapsulation of the word frequency `dictionary`

        Note:
            Not settable"""
        return self._word_frequency

    @property
    def distance(self) -> int:
        """int: The maximum edit distance to calculate

        Note:
            Valid values are 1 or 2; if an invalid value is passed, defaults to 2"""
        return self._distance

    @distance.setter
    def distance(self, val: int) -> None:
        """set the distance parameter; anything that is not 1 or 2 falls back to 2"""
        tmp = 2
        try:
            as_int = int(val)
            if 0 < as_int <= 2:
                # store the converted int (not the raw input) so that later
                # comparisons such as `self._distance == 2` behave for "2" or 2.0
                tmp = as_int
        except (ValueError, TypeError):
            # non-numeric input deliberately falls back to the default
            pass
        self._distance = tmp

    def split_words(self, text: KeyT) -> typing.Iterable[str]:
        """Split text into individual `words` using either a simple whitespace
        regex or the passed in tokenizer

        Args:
            text (str): The text to split into individual words
        Returns:
            list(str): A listing of all words in the provided text"""
        text = ensure_unicode(text)
        return self._tokenizer(text)

    def export(self, filepath: PathOrStr, encoding: str = "utf-8", gzipped: bool = True) -> None:
        """Export the word frequency list for import in the future

        Args:
            filepath (str): The filepath to the exported dictionary
            encoding (str): The encoding of the resulting output
            gzipped (bool): Whether to gzip the dictionary or not"""
        data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
        write_file(filepath, encoding, gzipped, data)

    def word_usage_frequency(self, word: KeyT, total_words: int | None = None) -> float:
        """Calculate the frequency to the `word` provided as seen across the
        entire dictionary

        Args:
            word (str): The word for which the word probability is calculated
            total_words (int): The total number of words to use in the calculation;
                use the default for using the whole word frequency
        Returns:
            float: The probability that the word is the correct word; 0.0 when the
                dictionary holds no words"""
        word = ensure_unicode(word)
        if not total_words:
            total_words = self._word_frequency.total_words
        if not total_words:
            # empty dictionary: avoid dividing by zero
            return 0.0
        # go through WordFrequency.__getitem__ so case folding matches `known` / `in`
        return self._word_frequency[word] / total_words

    def correction(self, word: KeyT) -> str | None:
        """The most probable correct spelling for the word

        Args:
            word (str): The word to correct
        Returns:
            str: The most likely candidate or None if no correction is present"""
        word = ensure_unicode(word)
        candidates = self.candidates(word)
        if not candidates:
            return None
        # Prefer exact matches with incorrect diacritics
        word_no_accents = self._remove_diacritics(word)
        diacritics_candidates = [c for c in candidates if self._remove_diacritics(c) == word_no_accents]
        if diacritics_candidates:
            return max(diacritics_candidates, key=self.__getitem__)
        return max(candidates, key=self.__getitem__)

    def candidates(self, word: KeyT) -> set[str] | None:
        """Generate possible spelling corrections for the provided word up to
        an edit distance of two, if and only when needed

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            set: The set of words that are possible candidates or None if there are no candidates"""
        word = ensure_unicode(word)
        if self.known([word]):  # short-cut if word is correct already
            return {word}

        # punctuation, numbers and overly long strings are returned untouched
        if not self._check_if_should_check(word):
            return {word}

        # get edit distance 1...
        res = list(self.edit_distance_1(word))
        tmp = self.known(res)
        if tmp:
            return tmp
        # if still not found, use the edit distance 1 to calc edit distance 2
        if self._distance == 2:
            tmp = self.known(list(self.__edit_distance_alt(res)))
            if tmp:
                return tmp
        return None

    def known(self, words: typing.Iterable[KeyT]) -> set[str]:
        """The subset of `words` that appear in the dictionary of words

        Args:
            words (list): List of words to determine which are in the corpus
        Returns:
            set: The set of those words from the input that are in the corpus"""
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words]
        return {w for w in tmp if w in self._word_frequency.dictionary and self._check_if_should_check(w)}

    def unknown(self, words: typing.Iterable[KeyT]) -> set[str]:
        """The subset of `words` that do not appear in the dictionary

        Args:
            words (list): List of words to determine which are not in the corpus
        Returns:
            set: The set of those words from the input that are not in the corpus"""
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
        return {w for w in tmp if w not in self._word_frequency.dictionary}

    def edit_distance_1(self, word: KeyT) -> set[str]:
        """Compute all strings that are one edit away from `word` using only
        the letters in the corpus

        Args:
            word (str): The word for which to calculate the edit distance
        Returns:
            set: The set of strings that are edit distance one from the provided word"""
        tmp_word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
        if self._check_if_should_check(tmp_word) is False:
            return {tmp_word}
        letters = self._word_frequency.letters
        # every (prefix, suffix) split of the word, including the empty prefix and empty suffix
        splits = [(tmp_word[:i], tmp_word[i:]) for i in range(len(tmp_word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edit_distance_2(self, word: KeyT) -> list[str]:
        """Compute all strings that are two edits away from `word` using only
        the letters in the corpus

        Args:
            word (str): The word for which to calculate the edit distance
        Returns:
            list: The strings that are edit distance two from the provided word; may contain duplicates"""
        word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
        return [e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)]

    def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> list[str]:
        """Compute all known strings that are 1 edit away from each of the words using
        only the letters in the corpus

        Args:
            words (list): The words for which to calculate the edit distance
        Returns:
            list: The known words that are one edit from any of the provided words; may contain duplicates"""
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
        # filtering through `known` per word keeps only dictionary words in the result
        return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]

    def _remove_diacritics(self, input_str: KeyT) -> str:
        """Remove diacritics from the input string

        Args:
            input_str (str): The string from which to remove diacritics
        Returns:
            str: The string with diacritics removed"""
        nfkd_form = unicodedata.normalize("NFKD", ensure_unicode(input_str))
        return "".join([c for c in nfkd_form if not unicodedata.combining(c)])

    def _check_if_should_check(self, word: str) -> bool:
        """Decide whether `word` is something the spell checker should examine

        Args:
            word (str): The word to test
        Returns:
            bool: False for a single punctuation character, for a word more than three characters longer
                than the longest dictionary word, and for anything `float()` can parse (except `nan`);
                True otherwise"""
        if len(word) == 1 and word in string.punctuation:
            return False
        if len(word) > self._word_frequency.longest_word_length + 3:  # allow removal of up to 2 letters
            return False
        if word.lower() == "nan":  # nan passes the float(word) so this will bypass that issue (#125)
            return True
        try:  # check if it is a number (int, float, etc)
            float(word)
            return False
        except ValueError:
            pass

        return True
|
||||
|
||||
|
||||
class WordFrequency:
    """Store the `dictionary` as a word frequency list while allowing for
    different methods to load the data and update over time

    Args:
        tokenizer (function): Optional callable used to split text into words; when omitted the module's
            default word parser is used
        case_sensitive (bool): When False, words are lower-cased on lookup and by the text/word loaders"""

    __slots__ = [
        "_dictionary",
        "_total_words",
        "_unique_words",
        "_letters",
        "_tokenizer",
        "_case_sensitive",
        "_longest_word_length",
    ]

    def __init__(
        self,
        tokenizer: typing.Callable[[str], typing.Iterable[str]] | None = None,
        case_sensitive: bool = False,
    ) -> None:
        self._dictionary: typing.Counter = Counter()
        self._total_words = 0
        self._unique_words = 0
        self._letters: set[str] = set()
        self._case_sensitive = case_sensitive
        self._longest_word_length = 0

        self._tokenizer = _parse_into_words
        if tokenizer is not None:
            self._tokenizer = tokenizer  # type: ignore

    def __contains__(self, key: KeyT) -> bool:
        """turn on contains"""
        key = ensure_unicode(key)
        key = key if self._case_sensitive else key.lower()
        return key in self._dictionary

    def __getitem__(self, key: KeyT) -> int:
        """turn on getitem"""
        key = ensure_unicode(key)
        key = key if self._case_sensitive else key.lower()
        return self._dictionary[key]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """turn on iter support"""
        yield from self._dictionary

    def pop(self, key: KeyT, default: int | None = None) -> int | None:
        """Remove the key and return the associated value or default if not
        found

        Args:
            key (str): The key to remove
            default (obj): The value to return if key is not present
        Returns:
            int | None: Returns the number of instances of key, or `default` if not in the dictionary
        Note:
            The derived statistics (totals, letters, longest word) are not refreshed by this call"""
        key = ensure_unicode(key)
        return self._dictionary.pop(key if self._case_sensitive else key.lower(), default)

    @property
    def dictionary(self) -> dict[str, int]:
        """Counter: A counting dictionary of all words in the corpus and the number
        of times each has been seen

        Note:
            Not settable"""
        return self._dictionary

    @property
    def total_words(self) -> int:
        """int: The sum of all word occurrences in the word frequency dictionary

        Note:
            Not settable"""
        return self._total_words

    @property
    def unique_words(self) -> int:
        """int: The total number of unique words in the word frequency list

        Note:
            Not settable"""
        return self._unique_words

    @property
    def letters(self) -> set[str]:
        """set: The listing of all letters found within the corpus

        Note:
            Not settable"""
        return self._letters

    @property
    def longest_word_length(self) -> int:
        """int: The longest word length in the dictionary

        Note:
            Not settable"""
        return self._longest_word_length

    def tokenize(self, text: KeyT) -> typing.Iterator[str]:
        """Tokenize the provided string object into individual words

        Args:
            text (str): The string object to tokenize
        Yields:
            str: The next `word` in the tokenized string
        Note:
            This is the same as the `spellchecker.split_words()` unless a tokenizer function was provided."""
        tmp_text = ensure_unicode(text)
        for word in self._tokenizer(tmp_text):
            yield word if self._case_sensitive else word.lower()

    def keys(self) -> typing.Iterator[str]:
        """Iterator over the key of the dictionary

        Yields:
            str: The next key in the dictionary
        Note:
            This is the same as `spellchecker.words()`"""
        yield from self._dictionary.keys()

    def words(self) -> typing.Iterator[str]:
        """Iterator over the words in the dictionary

        Yields:
            str: The next word in the dictionary
        Note:
            This is the same as `spellchecker.keys()`"""
        yield from self._dictionary.keys()

    def items(self) -> typing.Generator[tuple[str, int], None, None]:
        """Iterator over the words in the dictionary

        Yields:
            str: The next word in the dictionary
            int: The number of instances in the dictionary
        Note:
            This is the same as `dict.items()`"""
        yield from self._dictionary.items()

    def load_dictionary(self, filename: PathOrStr, encoding: str = "utf-8") -> None:
        """Load in a pre-built word frequency list

        Args:
            filename (str): The filepath to the json (optionally gzipped) file to be loaded
            encoding (str): The encoding of the dictionary
        Note:
            When the dictionary is not case sensitive, entries that differ only by case are merged and
            their counts summed"""
        with load_file(filename, encoding) as data:
            entries = json.loads(data)
        if not self._case_sensitive:
            if isinstance(entries, dict):
                # Fold keys after parsing: lower-casing the raw JSON text would make
                # colliding keys overwrite each other inside json.loads and lose counts.
                merged: typing.Counter = Counter()
                for word, count in entries.items():
                    merged[word.lower()] += count
                entries = merged
            else:
                # a plain JSON list of words is counted element by element
                entries = [word.lower() for word in entries]
        self._dictionary.update(entries)
        self._update_dictionary()

    def load_json(self, data: dict[str, int]) -> None:
        """Load in a pre-built word frequency list

        Args:
            data (dict): The dictionary to be loaded
        Note:
            Keys are added exactly as given; no case folding is applied here"""
        self._dictionary.update(data)
        self._update_dictionary()

    def load_text_file(
        self,
        filename: PathOrStr,
        encoding: str = "utf-8",
        tokenizer: typing.Callable[[str], typing.Iterable[str]] | None = None,
    ) -> None:
        """Load in a text file from which to generate a word frequency list

        Args:
            filename (str): The filepath to the text file to be loaded
            encoding (str): The encoding of the text file
            tokenizer (function): The function to use to tokenize a string
        """
        with load_file(filename, encoding=encoding) as data:
            self.load_text(data, tokenizer)

    def load_text(
        self,
        text: KeyT,
        tokenizer: typing.Callable[[str], typing.Iterable[str]] | None = None,
    ) -> None:
        """Load text from which to generate a word frequency list

        Args:
            text (str): The text to be loaded
            tokenizer (function): The function to use to tokenize a string
        """
        text = ensure_unicode(text)
        if tokenizer:
            words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]
        else:
            words = self.tokenize(text)  # type: ignore[assignment]

        self._dictionary.update(words)
        self._update_dictionary()

    def load_words(self, words: typing.Iterable[KeyT]) -> None:
        """Load a list of words from which to generate a word frequency list

        Args:
            words (list): The list of words to be loaded"""
        words = [ensure_unicode(w) for w in words]
        self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
        self._update_dictionary()

    def add(self, word: KeyT, val: int = 1) -> None:
        """Add a word to the word frequency list

        Args:
            word (str): The word to add
            val (int): The number of times to insert the word"""
        word = ensure_unicode(word)
        self.load_json({word if self._case_sensitive else word.lower(): val})

    def remove_words(self, words: typing.Iterable[KeyT]) -> None:
        """Remove a list of words from the word frequency list

        Args:
            words (list): The list of words to remove"""
        words = [ensure_unicode(w) for w in words]
        for word in words:
            self.pop(word)
        # refresh the derived statistics once, after all removals
        self._update_dictionary()

    def remove(self, word: KeyT) -> None:
        """Remove a word from the word frequency list

        Args:
            word (str): The word to remove"""
        self.pop(word)
        self._update_dictionary()

    def remove_by_threshold(self, threshold: int = 5) -> None:
        """Remove all words at, or below, the provided threshold

        Args:
            threshold (int): The threshold at which a word is to be removed"""
        # collect first: the dictionary must not change size while it is being iterated
        to_remove = [k for k, v in self._dictionary.items() if v <= threshold]
        self.remove_words(to_remove)

    def _update_dictionary(self) -> None:
        """Update the word frequency object

        Recomputes the longest word length, total word count, unique word count and
        letter set from the current dictionary contents."""
        if not self._dictionary:
            # max() over an empty sequence would raise, so reset everything explicitly
            self._longest_word_length = 0
            self._total_words = 0
            self._unique_words = 0
            self._letters = set()
            return
        keys = self._dictionary.keys()
        self._longest_word_length = max(map(len, keys))
        self._total_words = sum(self._dictionary.values())
        self._unique_words = len(keys)
        # each key is iterated character by character, giving the set of all characters used
        self._letters = set().union(*keys)
|
||||
@@ -0,0 +1,147 @@
|
||||
"""Additional utility functions"""
|
||||
|
||||
import contextlib
|
||||
import functools
|
||||
import gzip
|
||||
import re
|
||||
import typing
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
from spellchecker.info import __version__
|
||||
|
||||
# Word/text input type: either text or raw bytes (see ensure_unicode, which decodes bytes).
KeyT = str | bytes
|
||||
# File location type: a pathlib.Path or a plain string path.
PathOrStr = Path | str
|
||||
|
||||
|
||||
def fail_after(version: str) -> typing.Callable:
    """Build a test decorator that raises once the package version reaches
    `version`, as a reminder to delete a deprecated feature

    Args:
        version (str): Dotted numeric version by which the feature must be gone"""

    def decorator_wrapper(func):
        @functools.wraps(func)
        def test_inner(*args, **kwargs):
            # compare as lists of ints so that e.g. 0.10.0 sorts after 0.9.0
            deadline = [int(part) for part in version.split(".")]
            installed = [int(part) for part in __version__.split(".")]
            if deadline <= installed:
                msg = (
                    f"The function {func.__name__} must be fully removed as it is deprecated"
                    f" and must be removed by version {version}"
                )
                raise AssertionError(msg)
            return func(*args, **kwargs)

        return test_inner

    return decorator_wrapper
|
||||
|
||||
|
||||
def deprecated(message: str = "") -> typing.Callable:
    """Mark a function as deprecated; a DeprecationWarning is issued the first
    time the wrapped function is called and not again afterwards

    Args:
        message (str): Extra text appended to the deprecation warning
    """

    def decorator_wrapper(func):
        @functools.wraps(func)
        def function_wrapper(*args, **kwargs):
            name = func.__name__
            already_warned = function_wrapper.deprecated_items
            if name not in already_warned:
                # stacklevel=2 attributes the warning to the caller of the wrapped function
                warnings.warn(
                    f"Function {func.__name__} is now deprecated! {message}",
                    category=DeprecationWarning,
                    stacklevel=2,
                )
                already_warned.add(name)

            return func(*args, **kwargs)

        # per-wrapper record of the names that have already triggered a warning
        function_wrapper.deprecated_items = set()

        return function_wrapper

    return decorator_wrapper
|
||||
|
||||
|
||||
def ensure_unicode(value: KeyT, encoding: str = "utf-8") -> str:
    """Return `value` as text, decoding it first when it is bytes.

    Args:
        value (str): The input string (possibly bytes)
        encoding (str): The encoding to use if input is bytes
    Returns:
        str: The decoded string; values that are neither bytes nor list are returned unchanged
    Raises:
        TypeError: If `value` is a list
    """
    if isinstance(value, list):
        raise TypeError(f"The provided value {value} is not of type str or bytes")
    return value.decode(encoding) if isinstance(value, bytes) else value
|
||||
|
||||
|
||||
@contextlib.contextmanager
def __gzip_read(filename: PathOrStr, mode: str = "rb", encoding: str = "UTF-8") -> typing.Generator[KeyT, None, None]:
    """Context manager to correctly handle the decoding of the output of the gzip file

    Args:
        filename (str): The filename to open
        mode (str): The mode to read the data
        encoding (str): The file encoding to use; only applied when `mode` is a text mode
    Yields:
        str: The full contents of the gzip file (bytes when opened in a binary mode)
    """
    # gzip.open raises ValueError if an encoding is supplied in binary mode, so the
    # default mode="rb" could never work; only forward the encoding for text modes.
    text_encoding = encoding if "t" in mode else None
    with gzip.open(filename, mode=mode, encoding=text_encoding) as fobj:
        yield fobj.read()
|
||||
|
||||
|
||||
@contextlib.contextmanager
def load_file(filename: PathOrStr, encoding: str) -> typing.Generator[KeyT, None, None]:
    """Context manager that reads a whole file, transparently handling gzip
    files (chosen by a case-insensitive `.gz` suffix)

    Args:
        filename (str): The filename to open
        encoding (str): The file encoding to use
    Yields:
        str: The string data from the file read
    """
    path = str(filename) if isinstance(filename, Path) else filename

    if path[-3:].lower() != ".gz":
        # plain text file
        with open(path, encoding=encoding) as fobj:
            yield fobj.read()
    else:
        # gzip file, decoded as text by the helper
        with __gzip_read(path, mode="rt", encoding=encoding) as data:
            yield data
|
||||
|
||||
|
||||
def write_file(filepath: PathOrStr, encoding: str, gzipped: bool, data: str) -> None:
    """Write the data to file either as a gzip file or text based on the
    gzipped parameter

    Args:
        filepath (str): The filename to open
        encoding (str): The file encoding to use
        gzipped (bool): Whether the file should be gzipped or not
        data (str): The data to be written out
    """
    if gzipped:
        # pass the encoding explicitly; without it gzip's text mode falls back to the
        # platform default and the requested encoding would be silently ignored
        with gzip.open(filepath, "wt", encoding=encoding) as fobj:
            fobj.write(data)
    else:
        with open(filepath, "w", encoding=encoding) as fobj:
            fobj.write(data)
|
||||
|
||||
|
||||
def _parse_into_words(text: str) -> typing.Iterable[str]:
|
||||
"""Parse the text into words; currently removes punctuation except for
|
||||
apostrophizes.
|
||||
|
||||
Args:
|
||||
text (str): The text to split into words
|
||||
"""
|
||||
# see: https://stackoverflow.com/a/12705513
|
||||
return re.findall(r"(\w[\w']*\w|\w)", text)
|
||||
Reference in New Issue
Block a user