Core Classes
SpellChecker
The main spell checking class.
from myspellchecker import SpellChecker
class SpellChecker:
"""
Main spell checker class for Myanmar text validation.
Attributes:
config: SpellCheckerConfig instance
provider: Dictionary provider
segmenter: Text segmenter
"""
def __init__(
self,
config: SpellCheckerConfig = None,
segmenter: Segmenter = None,
provider: DictionaryProvider = None,
syllable_validator: SyllableValidator = None,
word_validator: WordValidator = None,
context_validator: ContextValidator = None,
factory: ComponentFactoryProtocol = None,
):
"""
Initialize SpellChecker.
Args:
config: Configuration settings (default: balanced preset)
segmenter: Custom Segmenter for text tokenization (default: DefaultSegmenter)
provider: Dictionary provider (default: SQLiteProvider)
syllable_validator: Custom SyllableValidator (advanced use)
word_validator: Custom WordValidator (advanced use)
context_validator: Custom ContextValidator (advanced use)
factory: Custom ComponentFactory for dependency injection (advanced use)
"""
# --- Factory Methods ---
@classmethod
def create_default(cls) -> "SpellChecker":
"""Create SpellChecker with default settings (balanced performance/accuracy)."""
@classmethod
def create_fast(cls) -> "SpellChecker":
"""Create SpellChecker optimized for speed (disables context checking, NER, phonetic)."""
@classmethod
def create_accurate(cls) -> "SpellChecker":
"""Create SpellChecker optimized for accuracy (higher edit distance, lower thresholds)."""
@classmethod
def create_minimal(cls) -> "SpellChecker":
"""Create SpellChecker with minimal features (basic syllable validation only)."""
# --- Core Methods ---
def check(
self,
text: str,
level: ValidationLevel = ValidationLevel.SYLLABLE,
use_semantic: Optional[bool] = None,
) -> Response:
"""
Check text for spelling errors.
Args:
text: Myanmar text to check
level: Validation level (SYLLABLE or WORD)
use_semantic: Override semantic checking for this call
Returns:
Response containing errors and suggestions
"""
async def check_async(
self,
text: str,
level: ValidationLevel = ValidationLevel.SYLLABLE,
use_semantic: Optional[bool] = None,
) -> Response:
"""
Asynchronously check text for spelling errors.
Runs the CPU-bound check() in a separate thread via asyncio.to_thread().
Args:
text: Myanmar text to check
level: Validation level (SYLLABLE or WORD)
use_semantic: Override semantic checking for this call
Returns:
Response containing errors and suggestions
"""
def check_batch(
self,
texts: list[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
) -> list[Response]:
"""
Check multiple texts sequentially.
Args:
texts: List of texts to check
level: Validation level (SYLLABLE or WORD)
Returns:
List of Response objects
"""
async def check_batch_async(
self,
texts: list[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
max_concurrency: int = 4,
use_semantic: bool | None = None,
) -> list[Response]:
"""
Asynchronously check multiple texts with configurable concurrency.
Args:
texts: List of texts to check
level: Validation level (SYLLABLE or WORD)
max_concurrency: Maximum concurrent operations (default: 4)
use_semantic: Override semantic checking (None uses config default)
Returns:
List of Response objects
"""
def get_pos_tags(self, text: str = "", words: list[str] = None) -> list[str]:
"""
Get the most likely POS tag sequence for text or pre-segmented words.
Args:
text: Input text to tag (optional if words is provided)
words: Pre-segmented words (optional if text is provided)
Returns:
List of POS tags, one per word.
"""
def segment_and_tag(self, text: str) -> tuple[list[str], list[str]]:
"""
Perform joint word segmentation and POS tagging.
Uses joint Viterbi decoder if enabled (config.joint.enabled=True),
otherwise falls back to sequential segmentation then tagging.
Args:
text: Text to segment
Returns:
Tuple of (words, tags)
"""
def close(self) -> None:
"""Close and release resources."""
def __enter__(self) -> "SpellChecker":
"""Context manager entry."""
def __exit__(self, *args) -> None:
"""Context manager exit with cleanup."""
# --- Properties ---
@property
def symspell(self) -> Optional[SymSpell]:
"""Access SymSpell instance for direct suggestion lookups."""
@property
def context_checker(self) -> Optional[NgramContextChecker]:
"""Access NgramContextChecker for N-gram probability lookups."""
@property
def syllable_rule_validator(self) -> Optional[SyllableRuleValidator]:
"""Access SyllableRuleValidator for Myanmar orthographic validation."""
@property
def name_heuristic(self) -> Optional[NameHeuristic]:
"""Access NameHeuristic for proper noun detection."""
@property
def semantic_checker(self) -> Optional[SemanticChecker]:
"""Access SemanticChecker for AI-powered error detection."""
@property
def phonetic_hasher(self) -> Optional[PhoneticHasher]:
"""Access PhoneticHasher for phonetic similarity matching."""
SpellCheckerBuilder
Fluent builder for SpellChecker construction.
from myspellchecker.core import SpellCheckerBuilder
class SpellCheckerBuilder:
"""Fluent builder for SpellChecker instances."""
def with_config(self, config: SpellCheckerConfig) -> "SpellCheckerBuilder":
"""Set the full configuration object."""
def with_provider(self, provider: DictionaryProvider) -> "SpellCheckerBuilder":
"""Set a custom dictionary provider."""
def with_segmenter(self, segmenter: Segmenter) -> "SpellCheckerBuilder":
"""Set a custom text segmenter."""
def with_phonetic(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable phonetic similarity matching."""
def with_context_checking(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable N-gram context checking."""
def with_ner(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable Named Entity Recognition heuristics."""
def with_rule_based_validation(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable rule-based syllable validation."""
def with_max_edit_distance(self, distance: int) -> "SpellCheckerBuilder":
"""Set maximum edit distance for suggestions (1-3)."""
def with_max_suggestions(self, count: int) -> "SpellCheckerBuilder":
"""Set maximum number of suggestions per error."""
def with_symspell_prefix_length(self, length: int) -> "SpellCheckerBuilder":
"""Set SymSpell prefix length for performance optimization (typically 5-10)."""
def with_cache_size(self, size: int) -> "SpellCheckerBuilder":
"""Set provider cache size for memory optimization."""
def with_bigram_threshold(self, threshold: float) -> "SpellCheckerBuilder":
"""Set probability threshold for flagging bigram errors (0.0-1.0)."""
def with_trigram_threshold(self, threshold: float) -> "SpellCheckerBuilder":
"""Set probability threshold for flagging trigram errors (0.0-1.0)."""
def with_semantic_model(
self,
model_path: str = None,
tokenizer_path: str = None,
model: Any = None,
tokenizer: Any = None,
) -> "SpellCheckerBuilder":
"""Configure semantic checking model (paths or pre-loaded instances)."""
def with_word_engine(
self, engine: Literal["myword", "crf", "transformer"]
) -> "SpellCheckerBuilder":
"""Set the word segmentation engine."""
def build(self) -> SpellChecker:
"""Construct SpellChecker with all configured options."""
from myspellchecker.core.builder import SpellCheckerBuilder
from myspellchecker.providers import SQLiteProvider
# Using custom provider
provider = SQLiteProvider(database_path="/path/to/db.sqlite")
checker = (
SpellCheckerBuilder()
.with_provider(provider)
.with_phonetic(True)
.with_context_checking(True)
.build()
)
ConfigPresets
Pre-configured SpellCheckerConfig instances for common use cases.
from myspellchecker.core.builder import ConfigPresets
# Use a preset directly
checker = SpellChecker(config=ConfigPresets.FAST)
# Customize a preset (each access returns a deep copy, safe to modify)
config = ConfigPresets.ACCURATE
config.max_suggestions = 10
checker = SpellChecker(config=config)
Available presets: DEFAULT, FAST, ACCURATE, MINIMAL, STRICT.
Configuration Classes
SpellCheckerConfig
Main configuration class (Pydantic BaseModel).
from myspellchecker.core.config import SpellCheckerConfig, get_profile
class SpellCheckerConfig(BaseModel):
"""Spell checker configuration (Pydantic BaseModel)."""
# Core dependencies (runtime objects)
segmenter: Optional[Segmenter] = None
provider: Optional[DictionaryProvider] = None
# Suggestion settings
max_suggestions: int = 5
max_edit_distance: int = 2 # Range: 1-3
# Feature toggles
use_phonetic: bool = True
use_context_checker: bool = True
use_ner: bool = True
use_rule_based_validation: bool = True
# Word segmentation
word_engine: Literal["myword", "crf", "transformer"] = "myword"
seg_model: Optional[str] = None # Custom model for transformer engine
seg_device: int = -1 # -1=CPU, 0+=GPU (transformer only)
# Safety limits
max_text_length: int = 100_000 # Maximum input characters (prevents resource exhaustion)
# Behavior
fallback_to_empty_provider: bool = False # Allow empty MemoryProvider if DB not found
# Nested configurations (each defaults to a new instance with its own defaults)
symspell: SymSpellConfig = SymSpellConfig()
ngram_context: NgramContextConfig = NgramContextConfig()
phonetic: PhoneticConfig = PhoneticConfig()
pos_tagger: POSTaggerConfig = POSTaggerConfig()
semantic: SemanticConfig = SemanticConfig()
validation: ValidationConfig = ValidationConfig()
provider_config: ProviderConfig = ProviderConfig()
joint: JointConfig = JointConfig()
cache: AlgorithmCacheConfig = AlgorithmCacheConfig()
ranker: RankerConfig = RankerConfig()
frequency_guards: FrequencyGuardConfig = FrequencyGuardConfig()
compound_resolver: CompoundResolverConfig = CompoundResolverConfig()
reduplication: ReduplicationConfig = ReduplicationConfig()
neural_reranker: NeuralRerankerConfig = NeuralRerankerConfig()
ner: Optional[NERConfig] = None # NER model config (None = use heuristic fallback)
# Use get_profile() for presets:
config = get_profile("development") # Fast iteration, minimal validation
config = get_profile("production") # Balanced (default)
config = get_profile("testing") # Deterministic, reproducible
config = get_profile("fast") # Maximum speed
config = get_profile("accurate") # Maximum accuracy
ValidationLevel
Enum for validation depth.
from myspellchecker.core.constants import ValidationLevel
class ValidationLevel(str, Enum):
SYLLABLE = "syllable" # Fast syllable-only validation
WORD = "word" # Thorough word + context validation
Note: Validation level is passed to check() and other methods, not as a configuration option.
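For example, the same checker can run a quick syllable pass or a deeper word pass per call:
from myspellchecker import SpellChecker
from myspellchecker.core.constants import ValidationLevel

checker = SpellChecker.create_default()
fast = checker.check("မြန်မာ", level=ValidationLevel.SYLLABLE)  # syllable-only pass
deep = checker.check("မြန်မာ", level=ValidationLevel.WORD)      # word + context pass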
POSTaggerConfig
POS tagger configuration.
class POSTaggerConfig(BaseModel):
"""POS tagger configuration (pydantic model)."""
tagger_type: str = "rule_based" # "rule_based", "viterbi", "transformer"
model_name: str | None = None # HuggingFace model ID (for transformer)
device: int = -1 # -1 for CPU, 0+ for GPU
batch_size: int = 32
cache_size: int = 10000 # LRU cache size
use_morphology_fallback: bool = True
beam_width: int = 10 # For Viterbi tagger
unknown_tag: str = "UNK" # Tag for unknown words
SemanticConfig
Semantic checker configuration.
class SemanticConfig(BaseModel):
"""Semantic checker configuration (Pydantic BaseModel)."""
model_path: str = None
tokenizer_path: str = None
model: Any = None # Pre-loaded ONNX session
tokenizer: Any = None # Pre-loaded tokenizer
num_threads: int = 0 # ONNX inference threads (0 = auto-detect all cores)
predict_top_k: int = 5 # Top-K predictions
check_top_k: int = 10 # Tokens to check
use_semantic_refinement: bool = True
use_proactive_scanning: bool = False # AI-powered error detection
proactive_confidence_threshold: float = 0.85 # Threshold for proactive scanning
Response Classes
Response
Result of spell checking.
from myspellchecker.core.response import Response
@dataclass
class Response:
"""Result of spell checking."""
text: str
"""Original input text (unchanged)."""
corrected_text: str
"""Auto-corrected text using top suggestions."""
has_errors: bool
"""True if any errors detected."""
level: str
"""Validation level used ('syllable' or 'word')."""
errors: list[Error]
"""List of Error objects (SyllableError, WordError, ContextError, GrammarError)."""
metadata: dict
"""Additional metadata (processing_time, layers_applied, etc.)."""
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
def to_json(self, indent: int = 2) -> str:
"""Convert to JSON string."""
Error
Base error class.
from myspellchecker.core.response import Error, SyllableError, WordError, ContextError, GrammarError
from myspellchecker.core.constants import ErrorType
@dataclass
class Error:
"""Spelling error."""
text: str
"""The erroneous text (syllable or word)."""
position: int
"""Character position in original text (0-indexed)."""
suggestions: list[str]
"""Suggested corrections, ranked by likelihood."""
error_type: str
"""Type of error ('invalid_syllable', 'invalid_word', etc.)."""
confidence: float = 1.0
"""Confidence score (0.0-1.0). Higher = more certain."""
def to_dict(self) -> dict:
"""Convert to dictionary."""
def to_json(self, indent: int = 2) -> str:
"""Convert to JSON string."""
@dataclass
class SyllableError(Error):
"""Invalid syllable error (Layer 1). Default error_type: 'invalid_syllable'."""
error_type: str = "invalid_syllable"
@dataclass
class WordError(Error):
"""Invalid word error (Layer 2). Default error_type: 'invalid_word'."""
syllable_count: int = 0
error_type: str = "invalid_word"
@dataclass
class ContextError(Error):
"""Context error - unlikely word sequence (Layer 3). Default error_type: 'context_probability'."""
probability: float = 0.0
prev_word: str = ""
error_type: str = "context_probability"
@dataclass
class GrammarError(Error):
"""Grammar-related errors. Default error_type: 'grammar_error'."""
reason: str = ""
error_type: str = "grammar_error"
@property
def word(self) -> str:
"""Alias for 'text' for backward compatibility."""
@property
def suggestion(self) -> str:
"""Return first suggestion for backward compatibility."""
class ErrorType(str, Enum):
# --- Core validation errors ---
SYLLABLE = "invalid_syllable"
WORD = "invalid_word"
CONTEXT_PROBABILITY = "context_probability"
GRAMMAR = "grammar_error"
# --- Syllable-level errors ---
PARTICLE_TYPO = "particle_typo"
MEDIAL_CONFUSION = "medial_confusion"
# --- Colloquial variant errors ---
COLLOQUIAL_VARIANT = "colloquial_variant"
COLLOQUIAL_INFO = "colloquial_info"
# --- Validation strategy errors ---
QUESTION_STRUCTURE = "question_structure"
SYNTAX_ERROR = "syntax_error"
HOMOPHONE_ERROR = "homophone_error"
TONE_AMBIGUITY = "tone_ambiguity"
POS_SEQUENCE_ERROR = "pos_sequence_error"
SEMANTIC_ERROR = "semantic_error"
CONFUSABLE_ERROR = "confusable_error"
# --- Encoding errors ---
ZAWGYI_ENCODING = "zawgyi_encoding"
# --- Grammar checker errors ---
MIXED_REGISTER = "mixed_register"
ASPECT_TYPO = "aspect_typo"
INVALID_SEQUENCE = "invalid_sequence"
INCOMPLETE_ASPECT = "incomplete_aspect"
TYPO = "typo"
AGREEMENT = "agreement"
COMPOUND_TYPO = "compound_typo"
INCOMPLETE_REDUPLICATION = "incomplete_reduplication"
CLASSIFIER_TYPO = "classifier_typo"
# --- Text-level detector errors ---
COLLOQUIAL_CONTRACTION = "colloquial_contraction"
PARTICLE_CONFUSION = "particle_confusion"
HA_HTOE_CONFUSION = "ha_htoe_confusion"
DANGLING_PARTICLE = "dangling_particle"
DANGLING_WORD = "dangling_word"
MISSING_CONJUNCTION = "missing_conjunction"
TENSE_MISMATCH = "tense_mismatch"
REGISTER_MIXING = "register_mixing"
# --- Grammar checker class-level errors ---
NEGATION_ERROR = "negation_error"
REGISTER_ERROR = "register_error"
MERGED_WORD = "merged_word"
ASPECT_ERROR = "aspect_error"
CLASSIFIER_ERROR = "classifier_error"
COMPOUND_ERROR = "compound_error"
# --- Orthography errors ---
MEDIAL_ORDER_ERROR = "medial_order_error"
MEDIAL_COMPATIBILITY_ERROR = "medial_compatibility_error"
VOWEL_AFTER_ASAT = "vowel_after_asat"
BROKEN_VIRAMA = "broken_virama"
BROKEN_STACKING = "broken_stacking"
BROKEN_COMPOUND = "broken_compound"
LEADING_VOWEL_E = "leading_vowel_e"
INCOMPLETE_STACKING = "incomplete_stacking"
# --- Syntactic/semantic errors ---
NEGATION_SFP_MISMATCH = "negation_sfp_mismatch"
MERGED_SFP_CONJUNCTION = "merged_sfp_conjunction"
ASPECT_ADVERB_CONFLICT = "aspect_adverb_conflict"
# --- Punctuation errors ---
DUPLICATE_PUNCTUATION = "duplicate_punctuation"
WRONG_PUNCTUATION = "wrong_punctuation"
MISSING_PUNCTUATION = "missing_punctuation"
# --- Additional detection ---
MISSING_ASAT = "missing_asat"
PARTICLE_MISUSE = "particle_misuse"
COLLOCATION_ERROR = "collocation_error"
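Because ErrorType is a str Enum, members compare equal to the raw strings stored on Error.error_type, so dispatch can use either form. A sketch:
from myspellchecker import SpellChecker
from myspellchecker.core.constants import ErrorType

checker = SpellChecker.create_default()
response = checker.check("မြန်မာစာ")
for error in response.errors:
    if error.error_type == ErrorType.SYLLABLE:            # same as "invalid_syllable"
        print("invalid syllable:", error.text)
    elif error.error_type == ErrorType.CONTEXT_PROBABILITY:
        print("unlikely in context:", error.text, error.suggestions[:1])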
Provider Classes
DictionaryProvider
Abstract provider interface.
from myspellchecker.providers.base import DictionaryProvider
class DictionaryProvider(ABC):
"""Dictionary data provider interface."""
# --- Core abstract methods (must be implemented) ---
def is_valid_syllable(self, syllable: str) -> bool:
"""Check if syllable exists."""
def is_valid_word(self, word: str) -> bool:
"""Check if word exists."""
def get_syllable_frequency(self, syllable: str) -> int:
"""Get syllable corpus frequency count."""
def get_word_frequency(self, word: str) -> int:
"""Get word corpus frequency count."""
def get_word_pos(self, word: str) -> str | None:
"""Get word POS tag (pipe-separated for multi-POS, e.g. 'N|V')."""
def get_bigram_probability(self, prev_word: str, current_word: str) -> float:
"""Get conditional probability P(current_word | prev_word)."""
def get_trigram_probability(self, w1: str, w2: str, w3: str) -> float:
"""Get conditional probability P(w3 | w1, w2)."""
def get_fourgram_probability(self, w1: str, w2: str, w3: str, w4: str) -> float:
"""Get conditional probability P(w4 | w1, w2, w3)."""
def get_fivegram_probability(self, w1: str, w2: str, w3: str, w4: str, w5: str) -> float:
"""Get conditional probability P(w5 | w1, w2, w3, w4)."""
def get_top_continuations(self, prev_word: str, limit: int = 20) -> list[tuple[str, float]]:
"""Get most likely words to follow prev_word, as (word, probability) tuples."""
def get_all_syllables(self) -> Iterator[tuple[str, int]]:
"""Get iterator over all (syllable, frequency) pairs. Used for SymSpell indexing."""
def get_all_words(self) -> Iterator[tuple[str, int]]:
"""Get iterator over all (word, frequency) pairs. Used for SymSpell indexing."""
def get_pos_unigram_probabilities(self) -> dict[str, float]:
"""Get all POS unigram probabilities."""
def get_pos_bigram_probabilities(self) -> dict[tuple[str, str], float]:
"""Get all POS bigram probabilities."""
def get_pos_trigram_probabilities(self) -> dict[tuple[str, str, str], float]:
"""Get all POS trigram probabilities."""
# --- Bulk operations (default implementations, override for optimization) ---
def is_valid_syllables_bulk(self, syllables: list[str]) -> dict[str, bool]:
"""Check validity of multiple syllables in a single operation."""
def is_valid_words_bulk(self, words: list[str]) -> dict[str, bool]:
"""Check validity of multiple words in a single operation."""
def get_syllable_frequencies_bulk(self, syllables: list[str]) -> dict[str, int]:
"""Get corpus frequencies for multiple syllables."""
def get_word_frequencies_bulk(self, words: list[str]) -> dict[str, int]:
"""Get corpus frequencies for multiple words."""
def get_word_pos_bulk(self, words: list[str]) -> dict[str, str | None]:
"""Get POS tags for multiple words."""
# --- Convenience methods ---
def has_syllable(self, syllable: str) -> bool:
"""Pure existence check for syllable (delegates to is_valid_syllable)."""
def has_word(self, word: str) -> bool:
"""Pure existence check for word (delegates to is_valid_word)."""
def __contains__(self, item: str) -> bool:
"""Support 'in' operator: checks syllables first, then words."""
# --- Factory method ---
@classmethod
def create(cls, provider_type: str = "sqlite", **kwargs) -> "DictionaryProvider":
"""Factory method to create provider instances ('sqlite', 'memory', 'json', 'csv')."""
Note: close() is not defined on the base class. It is available on SQLiteProvider to release connection pool resources.
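A sketch using the factory method; keyword arguments are assumed to be forwarded to the concrete provider's constructor (here, the SQLiteProvider signature below):
from myspellchecker.providers.base import DictionaryProvider

provider = DictionaryProvider.create("sqlite", database_path="/path/to/db.sqlite")
print(provider.is_valid_word("မြန်မာ"))
print("မြန်မာ" in provider)  # __contains__: checks syllables first, then words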
SQLiteProvider
SQLite-based provider.
from myspellchecker.providers import SQLiteProvider
class SQLiteProvider(DictionaryProvider):
"""SQLite-based dictionary provider."""
def __init__(
self,
database_path: str | None = None,
cache_size: int = 8192,
check_same_thread: bool = False,
pos_tagger: POSTaggerBase = None,
pool_min_size: int | None = None,
pool_max_size: int | None = None,
pool_timeout: float | None = None,
pool_max_connection_age: float | None = None,
sqlite_timeout: float | None = None,
cache_manager: CacheManager = None,
curated_min_frequency: int = 0,
):
"""
Initialize SQLite provider.
Args:
database_path: Database path (None for default)
cache_size: LRU cache size for frequency lookups (default: 8192)
check_same_thread: Allow sharing connection between threads (default: False)
pos_tagger: Optional POS tagger for OOV word tagging
pool_min_size: Minimum connections in pool (default: ConnectionPoolConfig.min_size)
pool_max_size: Maximum connections in pool (default: ConnectionPoolConfig.max_size)
pool_timeout: Connection checkout timeout in seconds (default: ConnectionPoolConfig.timeout)
pool_max_connection_age: Max connection age before recreation (default: ConnectionPoolConfig.max_connection_age)
sqlite_timeout: SQLite busy timeout in seconds (default: ConnectionPoolConfig value)
cache_manager: Optional CacheManager for dependency injection
curated_min_frequency: Minimum frequency for curated lexicon entries (default: 0)
"""
MemoryProvider
In-memory provider optimized for fast lookups.
from myspellchecker import MemoryProvider
class MemoryProvider(DictionaryProvider):
"""In-memory dictionary provider using Python dictionaries."""
def __init__(
self,
syllables: dict[str, int] = None,
words: dict[str, int] = None,
bigrams: dict[tuple[str, str], float] = None,
trigrams: dict[tuple[str, str, str], float] = None,
word_pos: dict[str, str] = None,
):
"""
Initialize MemoryProvider with optional pre-populated data.
Args:
syllables: Dictionary mapping syllable -> frequency count
words: Dictionary mapping word -> frequency count
bigrams: Dictionary mapping (prev_word, curr_word) -> probability
trigrams: Dictionary mapping (word1, word2, word3) -> probability
word_pos: Dictionary mapping word -> POS tag
"""
def add_syllable(self, syllable: str, frequency: int = 1) -> None:
"""Add a syllable with optional frequency."""
def add_word(self, word: str, frequency: int = 1) -> None:
"""Add a word with optional frequency."""
Algorithm Classes
SymSpell
Symmetric delete spell checking.
from myspellchecker.algorithms.symspell import SymSpell, Suggestion
class SymSpell:
"""SymSpell algorithm for O(1) suggestions."""
def __init__(
self,
provider: DictionaryProvider,
max_edit_distance: int = 2,
prefix_length: int = 10,
count_threshold: int = 1,
):
"""
Initialize SymSpell with a dictionary provider.
Note: The class constructor default for count_threshold is 1,
but SymSpellConfig sets its default to 50. When constructed
via SpellCheckerConfig, the config value (50) takes precedence.
"""
def build_index(self, levels: list[str]) -> None:
"""Build delete index for specified levels ('syllable', 'word')."""
def lookup(
self,
term: str,
level: str = "syllable",
max_suggestions: int = 5,
include_known: bool = False,
use_phonetic: bool = False,
) -> list[Suggestion]:
"""
Look up suggestions for a term.
Returns:
List of Suggestion with term, edit_distance, frequency
"""
NgramContextChecker
N-gram based context checker.
from myspellchecker.algorithms.ngram_context_checker import NgramContextChecker
class NgramContextChecker:
"""N-gram based context validation."""
def __init__(
self,
provider: DictionaryProvider,
config: NgramContextConfig | None = None,
symspell: SymSpell | None = None,
pos_unigram_probs: dict[str, float] | None = None,
pos_bigram_probs: dict[tuple[str, str], float] | None = None,
):
"""Initialize context checker.
All thresholds and weights are configured via NgramContextConfig.
"""
def get_smoothed_bigram_probability(self, word1: str, word2: str) -> float:
"""Get smoothed P(word2 | word1)."""
def get_smoothed_trigram_probability(self, word1: str, word2: str, word3: str) -> float:
"""Get smoothed P(word3 | word1, word2)."""
def is_contextual_error(
self,
prev_word: str,
current_word: str,
prev_prev_word: Optional[str] = None,
next_word: Optional[str] = None,
threshold: Optional[float] = None,
) -> bool:
"""Check if a word is a contextual error given surrounding context."""
def suggest(
self,
prev_word: str,
current_word: str,
max_edit_distance: int = 2,
next_word: Optional[str] = None,
) -> list[ContextSuggestion]:
"""Generate context-aware suggestions for a word."""
SemanticChecker
Deep learning based context checker.
from myspellchecker.algorithms.semantic_checker import SemanticChecker
class SemanticChecker:
"""ONNX-based semantic context checker."""
def __init__(
self,
model_path: str = None,
tokenizer_path: str = None,
model: Any = None,
tokenizer: Any = None,
num_threads: int = 1,
predict_top_k: int = 5,
check_top_k: int = 10,
use_pytorch: bool = False,
allow_extended_myanmar: bool = False,
):
"""Initialize semantic checker."""
def is_semantic_error(
self,
sentence: str,
word: str,
neighbors: list[str],
) -> Optional[str]:
"""Check if word is a semantic error using AI. Returns suggestion or None."""
def predict_mask(
self,
sentence: str,
target_word: str,
top_k: int = None,
occurrence: int = 0,
) -> list[tuple[str, float]]:
"""Predict most likely words for a masked position."""
Segmenter Classes
DefaultSegmenter
Default text segmenter.
from myspellchecker.segmenters import DefaultSegmenter
class DefaultSegmenter(Segmenter):
"""Default Myanmar text segmenter using a hybrid approach."""
def __init__(
self,
word_engine: str = "myword",
allow_extended_myanmar: bool = False,
seg_model: Optional[str] = None,
seg_device: int = -1,
):
"""
Initialize segmenter.
Args:
word_engine: Word segmentation engine ("myword", "crf", or "transformer")
allow_extended_myanmar: Accept Extended Myanmar characters (U+1050-U+109F,
U+AA60-U+AA7F, U+A9E0-U+A9FF)
seg_model: Custom model name for transformer engine (optional)
seg_device: Device for transformer inference (-1=CPU, 0+=GPU)
"""
def segment_syllables(self, text: str) -> list[str]:
"""Segment text into syllables."""
def segment_words(self, text: str) -> list[str]:
"""Segment text into words."""
def segment_sentences(self, text: str) -> list[str]:
"""Segment text into sentences using heuristics."""
def load_custom_dictionary(self, words: list[str]) -> None:
"""Load custom dictionary words (myword engine only)."""
Utility Functions
Text Normalization
from myspellchecker.text.normalize import (
normalize,
normalize_for_lookup,
)
def normalize(
text: str,
form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFC",
remove_zero_width: bool = True,
reorder_diacritics: bool = True,
normalize_variants: bool = False,
normalize_tall_aa: bool = True,
normalize_u_asat: bool = True,
) -> str:
"""
Normalize Myanmar text with configurable steps.
Args:
text: Input Myanmar text
form: Unicode normalization form
remove_zero_width: Remove zero-width characters
reorder_diacritics: Apply Myanmar-specific diacritic reordering (UTN #11)
normalize_variants: Map character variants to canonical forms
normalize_tall_aa: Correct Tall AA after Medial Wa (default: True)
normalize_u_asat: Convert independent vowel U + asat to consonant form (default: True)
"""
def normalize_for_lookup(
text: str,
convert_zawgyi: bool = True,
config: Optional[ZawgyiConfig] = None,
) -> str:
"""Unified normalization for all dictionary/index lookups (includes Zawgyi conversion)."""
# For direct Cython function access (requires compiled extensions):
from myspellchecker.text.normalize_c import (
remove_zero_width_chars,
reorder_myanmar_diacritics,
get_myanmar_ratio,
)
# For higher-level normalization with presets:
from myspellchecker.text.normalization_service import (
NormalizationService,
normalize_for_spell_checking,
normalize_for_lookup,
normalize_for_comparison,
)
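A sketch chaining the two entry points; prefer normalize_for_lookup before any dictionary or index query, since it also handles Zawgyi input:
from myspellchecker.text.normalize import normalize, normalize_for_lookup

clean = normalize("မြန်မာ", form="NFC", remove_zero_width=True)
key = normalize_for_lookup(clean)  # includes Zawgyi -> Unicode conversion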
Logging Configuration
from myspellchecker.utils.logging_utils import configure_logging
def configure_logging(
level: Union[int, str] = logging.INFO,
format_string: str = None,
stream: TextIO = None,
json_output: bool = False,
debug_mode: bool = False,
) -> None:
"""Configure logging for the library."""
Exceptions
from myspellchecker.core.exceptions import (
MyanmarSpellcheckError,
ConfigurationError,
InvalidConfigError,
DataLoadingError,
MissingDatabaseError,
ProcessingError,
ValidationError,
TokenizationError,
NormalizationError,
ProviderError,
ConnectionPoolError,
PipelineError,
IngestionError,
PackagingError,
ModelError,
ModelLoadError,
InferenceError,
MissingDependencyError,
InsufficientStorageError,
CacheError,
)
Exception hierarchy:
MyanmarSpellcheckError (base)
├── ConfigurationError
│   └── InvalidConfigError
├── DataLoadingError
│   └── MissingDatabaseError
├── ProcessingError
│   ├── ValidationError
│   ├── TokenizationError
│   └── NormalizationError
├── ProviderError
│   └── ConnectionPoolError
├── PipelineError
│   ├── IngestionError
│   └── PackagingError
├── ModelError
│   ├── ModelLoadError
│   └── InferenceError
├── MissingDependencyError
├── InsufficientStorageError
└── CacheError
class MyanmarSpellcheckError(Exception):
"""Base exception for all spell checker errors."""
class ConfigurationError(MyanmarSpellcheckError):
"""Configuration-related errors."""
class InvalidConfigError(ConfigurationError):
"""Specific configuration value is invalid."""
class DataLoadingError(MyanmarSpellcheckError):
"""Data loading errors."""
class MissingDatabaseError(DataLoadingError):
"""Spell checker database not found. Includes searched_paths and suggestion attributes."""
class ProcessingError(MyanmarSpellcheckError):
"""Text processing errors (base for validation/tokenization/normalization)."""
class ValidationError(ProcessingError):
"""Validation processing errors."""
class TokenizationError(ProcessingError):
"""Text tokenization/segmentation errors."""
class NormalizationError(ProcessingError):
"""Text normalization errors."""
class ProviderError(MyanmarSpellcheckError):
"""Provider-related errors."""
class ConnectionPoolError(ProviderError):
"""Connection pool errors (exhaustion, creation failures)."""
class PipelineError(MyanmarSpellcheckError):
"""Data pipeline errors."""
class IngestionError(PipelineError):
"""Corpus ingestion errors. Has failed_files and missing_files attributes."""
class PackagingError(PipelineError):
"""Database packaging errors."""
class ModelError(MyanmarSpellcheckError):
"""Machine learning model errors."""
class ModelLoadError(ModelError):
"""Model loading failures."""
class InferenceError(ModelError):
"""Model inference failures."""
class MissingDependencyError(MyanmarSpellcheckError):
"""Required external dependency is missing."""
class InsufficientStorageError(MyanmarSpellcheckError):
"""Not enough disk space for operation."""
class CacheError(MyanmarSpellcheckError):
"""Caching operation failures."""
Convenience Functions
check_text()
Quick one-off spell check without constructing a SpellChecker instance:
from myspellchecker import check_text
result = check_text("မြန်မာ", level="syllable", database_path=None)
# Returns: Response object
| Parameter | Type | Default | Description |
|---|---|---|---|
| text | str | required | Myanmar text to check |
| level | str | "syllable" | Validation level: "syllable" or "word" |
| database_path | str \| None | None | Custom database path (None = auto-detect) |
Internationalization (i18n)
from myspellchecker import set_language, get_language, get_message, get_supported_languages
# Get available languages
langs = get_supported_languages() # ["en", "my", ...]
# Set language for error messages
set_language("my")
# Get current language
lang = get_language() # "my"
# Get localized message
msg = get_message("invalid_syllable")
classify_action()
Classify the recommended action for an error based on its type and confidence:
from myspellchecker import classify_action, ActionType
action = classify_action(error_type="particle_typo", confidence=0.95)
# Returns: ActionType.AUTO_FIX
action = classify_action(error_type="context_probability", confidence=0.6)
# Returns: ActionType.SUGGEST
ActionType.AUTO_FIX (safe to apply), ActionType.SUGGEST (show to user), or ActionType.INFORM (advisory only).
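A sketch combining classify_action with a Response to auto-apply only safe fixes; the triage loop itself is illustrative, not a library helper:
from myspellchecker import SpellChecker, classify_action, ActionType

checker = SpellChecker.create_default()
response = checker.check("မြန်မာစာ")
for error in response.errors:
    action = classify_action(error_type=error.error_type, confidence=error.confidence)
    if action == ActionType.AUTO_FIX and error.suggestions:
        pass  # safe to replace error.text with error.suggestions[0]
    elif action == ActionType.SUGGEST:
        pass  # surface error.suggestions to the user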
Streaming
StreamingChecker
Memory-efficient streaming interface for processing large text files.
from myspellchecker.core.streaming import StreamingChecker, StreamingConfig, StreamingStats, ChunkResult
class StreamingChecker:
"""Streaming interface for SpellChecker."""
def __init__(
self,
checker: SpellChecker,
config: StreamingConfig | None = None,
):
"""
Args:
checker: SpellChecker instance to use for validation.
config: StreamingConfig for tuning behavior (default: StreamingConfig()).
"""
def check_stream(
self,
input_stream: TextIO | IO[str] | Iterator[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
on_progress: Callable[[StreamingStats], None] | None = None,
stats: StreamingStats | None = None,
) -> Iterator[ChunkResult]:
"""Stream spell check results line-by-line from an input stream."""
async def check_stream_async(
self,
input_stream: AsyncTextReader | AsyncIterator[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
on_progress: Callable[[StreamingStats], None] | None = None,
stats: StreamingStats | None = None,
) -> AsyncIterator[ChunkResult]:
"""Async version of check_stream. Uses asyncio.to_thread for CPU-bound checking."""
def check_sentences(
self,
text: str,
level: ValidationLevel = ValidationLevel.WORD,
on_progress: Callable[[StreamingStats], None] | None = None,
) -> Iterator[ChunkResult]:
"""Check text sentence-by-sentence with cross-sentence context preservation."""
StreamingConfig
class StreamingConfig:
chunk_size: int = 100 # Lines per chunk
max_memory_mb: int = 100 # Memory limit before backpressure
sentence_boundary_pattern: str = r"[။!?]+" # Sentence boundary regex
enable_cross_sentence_context: bool = True
progress_interval: int = 1000 # Lines between progress callbacks
timeout_per_chunk: float = 30.0 # Max seconds per chunk
StreamingStats
class StreamingStats:
bytes_processed: int = 0
lines_processed: int = 0
sentences_processed: int = 0
errors_found: int = 0
chunks_processed: int = 0
current_memory_mb: float = 0.0
@property
def elapsed_time(self) -> float: ... # Seconds since start
@property
def lines_per_second(self) -> float: ... # Processing rate
def to_dict(self) -> dict[str, Any]: ... # For serialization
ChunkResult
class ChunkResult:
response: Response # The spell check result
line_number: int # Source line number
chunk_index: int # Sequential chunk index
is_final: bool # True for the last chunk
from myspellchecker import SpellChecker
from myspellchecker.core.streaming import StreamingChecker
checker = SpellChecker()
streaming = StreamingChecker(checker)
with open("large_file.txt") as f:
for result in streaming.check_stream(f):
if result.response.has_errors:
print(f"Line {result.line_number}: {result.response.errors}")
Module Index
| Module | Description | Documentation |
|---|---|---|
| myspellchecker | Main package exports | This page |
| myspellchecker.core | Core classes and config | This page |
| myspellchecker.algorithms | Spell check algorithms | Algorithms |
| myspellchecker.providers | Dictionary providers | Provider Capabilities |
| myspellchecker.segmenters | Text segmenters | This page |
| myspellchecker.tokenizers | Low-level tokenizers | Tokenizers API |
| myspellchecker.utils | Utility functions | This page |
| myspellchecker.data_pipeline | Dictionary building | Data Pipeline |
| myspellchecker.training | Model training | Training |
Next Steps
- Getting Started - Quick start guide
- Configuration - Configuration options
- CLI Reference - Command-line interface