Core Classes
SpellChecker
The main spell checking class.
from myspellchecker import SpellChecker
class SpellChecker:
"""
Main spell checker class for Myanmar text validation.
Attributes:
config: SpellCheckerConfig instance
provider: Dictionary provider
segmenter: Text segmenter
"""
def __init__(
self,
config: SpellCheckerConfig = None,
segmenter: Segmenter = None,
provider: DictionaryProvider = None,
syllable_validator: SyllableValidator = None,
word_validator: WordValidator = None,
context_validator: ContextValidator = None,
factory: ComponentFactoryProtocol = None,
):
"""
Initialize SpellChecker.
Args:
config: Configuration settings (default: balanced preset)
segmenter: Custom Segmenter for text tokenization (default: DefaultSegmenter)
provider: Dictionary provider (default: SQLiteProvider)
syllable_validator: Custom SyllableValidator (advanced use)
word_validator: Custom WordValidator (advanced use)
context_validator: Custom ContextValidator (advanced use)
factory: Custom ComponentFactory for dependency injection (advanced use)
"""
# --- Factory Methods ---
@classmethod
def create_default(cls) -> "SpellChecker":
"""Create SpellChecker with default settings (balanced performance/accuracy)."""
@classmethod
def create_fast(cls) -> "SpellChecker":
"""Create SpellChecker optimized for speed (disables context checking, NER, phonetic)."""
@classmethod
def create_accurate(cls) -> "SpellChecker":
"""Create SpellChecker optimized for accuracy (higher edit distance, lower thresholds)."""
@classmethod
def create_minimal(cls) -> "SpellChecker":
"""Create SpellChecker with minimal features (basic syllable validation only)."""
# --- Core Methods ---
def check(
self,
text: str,
level: ValidationLevel = ValidationLevel.SYLLABLE,
use_semantic: Optional[bool] = None,
) -> Response:
"""
Check text for spelling errors.
Args:
text: Myanmar text to check
level: Validation level (SYLLABLE or WORD)
use_semantic: Override semantic checking for this call
Returns:
Response containing errors and suggestions
"""
async def check_async(
self,
text: str,
level: ValidationLevel = ValidationLevel.SYLLABLE,
use_semantic: Optional[bool] = None,
) -> Response:
"""
Asynchronously check text for spelling errors.
Runs the CPU-bound check() in a separate thread via asyncio.to_thread().
Args:
text: Myanmar text to check
level: Validation level (SYLLABLE or WORD)
use_semantic: Override semantic checking for this call
Returns:
Response containing errors and suggestions
"""
def check_batch(
self,
texts: list[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
) -> list[Response]:
"""
Check multiple texts sequentially.
Args:
texts: List of texts to check
level: Validation level (SYLLABLE or WORD)
Returns:
List of Response objects
"""
async def check_batch_async(
self,
texts: list[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
max_concurrency: int = 4,
use_semantic: bool | None = None,
) -> list[Response]:
"""
Asynchronously check multiple texts with configurable concurrency.
Args:
texts: List of texts to check
level: Validation level (SYLLABLE or WORD)
max_concurrency: Maximum concurrent operations (default: 4)
use_semantic: Override semantic checking (None uses config default)
Returns:
List of Response objects
"""
def get_pos_tags(self, text: str = "", words: list[str] = None) -> list[str]:
"""
Get the most likely POS tag sequence for text or pre-segmented words.
Args:
text: Input text to tag (optional if words is provided)
words: Pre-segmented words (optional if text is provided)
Returns:
List of POS tags, one per word.
"""
def segment_and_tag(self, text: str) -> tuple[list[str], list[str]]:
"""
Perform joint word segmentation and POS tagging.
Uses joint Viterbi decoder if enabled (config.joint.enabled=True),
otherwise falls back to sequential segmentation then tagging.
Args:
text: Text to segment
Returns:
Tuple of (words, tags)
"""
def close(self) -> None:
"""Close and release resources."""
def __enter__(self) -> "SpellChecker":
"""Context manager entry."""
def __exit__(self, *args) -> None:
"""Context manager exit with cleanup."""
# --- Properties ---
@property
def symspell(self) -> Optional[SymSpell]:
"""Access SymSpell instance for direct suggestion lookups."""
@property
def context_checker(self) -> Optional[NgramContextChecker]:
"""Access NgramContextChecker for N-gram probability lookups."""
@property
def syllable_rule_validator(self) -> Optional[SyllableRuleValidator]:
"""Access SyllableRuleValidator for Myanmar orthographic validation."""
@property
def name_heuristic(self) -> Optional[NameHeuristic]:
"""Access NameHeuristic for proper noun detection."""
@property
def semantic_checker(self) -> Optional[SemanticChecker]:
"""Access SemanticChecker for AI-powered error detection."""
@property
def phonetic_hasher(self) -> Optional[PhoneticHasher]:
"""Access PhoneticHasher for phonetic similarity matching."""
SpellCheckerBuilder
Fluent builder for SpellChecker construction.
from myspellchecker.core import SpellCheckerBuilder
class SpellCheckerBuilder:
"""Fluent builder for SpellChecker instances."""
def with_config(self, config: SpellCheckerConfig) -> "SpellCheckerBuilder":
"""Set the full configuration object."""
def with_provider(self, provider: DictionaryProvider) -> "SpellCheckerBuilder":
"""Set a custom dictionary provider."""
def with_segmenter(self, segmenter: Segmenter) -> "SpellCheckerBuilder":
"""Set a custom text segmenter."""
def with_phonetic(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable phonetic similarity matching."""
def with_context_checking(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable N-gram context checking."""
def with_ner(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable Named Entity Recognition heuristics."""
def with_rule_based_validation(self, enabled: bool = True) -> "SpellCheckerBuilder":
"""Enable or disable rule-based syllable validation."""
def with_max_edit_distance(self, distance: int) -> "SpellCheckerBuilder":
"""Set maximum edit distance for suggestions (1-3)."""
def with_max_suggestions(self, count: int) -> "SpellCheckerBuilder":
"""Set maximum number of suggestions per error."""
def with_symspell_prefix_length(self, length: int) -> "SpellCheckerBuilder":
"""Set SymSpell prefix length for performance optimization (typically 5-10)."""
def with_cache_size(self, size: int) -> "SpellCheckerBuilder":
"""Set provider cache size for memory optimization."""
def with_bigram_threshold(self, threshold: float) -> "SpellCheckerBuilder":
"""Set probability threshold for flagging bigram errors (0.0-1.0)."""
def with_trigram_threshold(self, threshold: float) -> "SpellCheckerBuilder":
"""Set probability threshold for flagging trigram errors (0.0-1.0)."""
def with_semantic_model(
self,
model_path: str = None,
tokenizer_path: str = None,
model: Any = None,
tokenizer: Any = None,
) -> "SpellCheckerBuilder":
"""Configure semantic checking model (paths or pre-loaded instances)."""
def with_word_engine(
self, engine: Literal["myword", "crf", "transformer"]
) -> "SpellCheckerBuilder":
"""Set the word segmentation engine."""
def build(self) -> SpellChecker:
"""Construct SpellChecker with all configured options."""
from myspellchecker.core.builder import SpellCheckerBuilder
from myspellchecker.providers import SQLiteProvider
# Using custom provider
provider = SQLiteProvider(database_path="/path/to/db.sqlite")
checker = (
SpellCheckerBuilder()
.with_provider(provider)
.with_phonetic(True)
.with_context_checking(True)
.build()
)
ConfigPresets
Pre-configured SpellCheckerConfig instances for common use cases.
from myspellchecker.core.builder import ConfigPresets
# Use a preset directly
checker = SpellChecker(config=ConfigPresets.FAST)
# Customize a preset (each access returns a deep copy, safe to modify)
config = ConfigPresets.ACCURATE
config.max_suggestions = 10
checker = SpellChecker(config=config)
Available presets: DEFAULT, FAST, ACCURATE, MINIMAL, STRICT.
Configuration Classes
SpellCheckerConfig
Main configuration class (Pydantic BaseModel).
from myspellchecker.core.config import SpellCheckerConfig, get_profile
class SpellCheckerConfig(BaseModel):
"""Spell checker configuration (Pydantic BaseModel)."""
# Core dependencies (runtime objects)
segmenter: Optional[Segmenter] = None
provider: Optional[DictionaryProvider] = None
# Suggestion settings
max_suggestions: int = 5
max_edit_distance: int = 2 # Range: 1-3
# Feature toggles
use_phonetic: bool = True
use_context_checker: bool = True
use_ner: bool = True
use_rule_based_validation: bool = True
# Word segmentation
word_engine: Literal["myword", "crf", "transformer"] = "myword"
seg_model: Optional[str] = None # Custom model for transformer engine
seg_device: int = -1 # -1=CPU, 0+=GPU (transformer only)
# Safety limits
max_text_length: int = 100_000 # Maximum input characters (prevents resource exhaustion)
# Behavior
fallback_to_empty_provider: bool = False # Allow empty MemoryProvider if DB not found
# Nested configurations (each defaults to a new instance with its own defaults)
symspell: SymSpellConfig = SymSpellConfig()
ngram_context: NgramContextConfig = NgramContextConfig()
phonetic: PhoneticConfig = PhoneticConfig()
pos_tagger: POSTaggerConfig = POSTaggerConfig()
semantic: SemanticConfig = SemanticConfig()
validation: ValidationConfig = ValidationConfig()
provider_config: ProviderConfig = ProviderConfig()
joint: JointConfig = JointConfig()
cache: AlgorithmCacheConfig = AlgorithmCacheConfig()
ranker: RankerConfig = RankerConfig()
frequency_guards: FrequencyGuardConfig = FrequencyGuardConfig()
compound_resolver: CompoundResolverConfig = CompoundResolverConfig()
reduplication: ReduplicationConfig = ReduplicationConfig()
neural_reranker: NeuralRerankerConfig = NeuralRerankerConfig()
ner: Optional[NERConfig] = None # NER model config (None = use heuristic fallback)
# Use get_profile() for presets:
config = get_profile("development") # Fast iteration, minimal validation
config = get_profile("production") # Balanced (default)
config = get_profile("testing") # Deterministic, reproducible
config = get_profile("fast") # Maximum speed
config = get_profile("accurate") # Maximum accuracy
ValidationLevel
Enum for validation depth.
from myspellchecker.core.constants import ValidationLevel
class ValidationLevel(str, Enum):
SYLLABLE = "syllable" # Fast syllable-only validation
WORD = "word" # Thorough word + context validation
Note: Validation level is passed to check() and other methods, not as a configuration option.
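For example, the same checker can run a quick syllable pass or a deeper word pass per call:
from myspellchecker import SpellChecker
from myspellchecker.core.constants import ValidationLevel

checker = SpellChecker.create_default()
fast = checker.check("မြန်မာ", level=ValidationLevel.SYLLABLE)  # syllable-only pass
deep = checker.check("မြန်မာ", level=ValidationLevel.WORD)      # word + context pass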
POSTaggerConfig
POS tagger configuration.
class POSTaggerConfig(BaseModel):
"""POS tagger configuration (pydantic model)."""
tagger_type: str = "rule_based" # "rule_based", "viterbi", "transformer"
model_name: str | None = None # HuggingFace model ID (for transformer)
device: int = -1 # -1 for CPU, 0+ for GPU
batch_size: int = 32
cache_size: int = 10000 # LRU cache size
use_morphology_fallback: bool = True
beam_width: int = 10 # For Viterbi tagger
unknown_tag: str = "UNK" # Tag for unknown words
SemanticConfig
Semantic checker configuration.
class SemanticConfig(BaseModel):
"""Semantic checker configuration (Pydantic BaseModel)."""
model_path: str = None
tokenizer_path: str = None
model: Any = None # Pre-loaded ONNX session
tokenizer: Any = None # Pre-loaded tokenizer
num_threads: int = 0 # ONNX inference threads (0 = auto-detect all cores)
predict_top_k: int = 5 # Top-K predictions
check_top_k: int = 10 # Tokens to check
use_semantic_refinement: bool = True
use_proactive_scanning: bool = False # AI-powered error detection
proactive_confidence_threshold: float = 0.85 # Threshold for proactive scanning
Response Classes
Response
Result of spell checking.
from myspellchecker.core.response import Response
@dataclass
class Response:
"""Result of spell checking."""
text: str
"""Original input text (unchanged)."""
corrected_text: str
"""Auto-corrected text using top suggestions."""
has_errors: bool
"""True if any errors detected."""
level: str
"""Validation level used ('syllable' or 'word')."""
errors: list[Error]
"""List of Error objects (SyllableError, WordError, ContextError, GrammarError)."""
metadata: dict
"""Additional metadata (processing_time, layers_applied, etc.)."""
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
def to_json(self, indent: int = 2) -> str:
"""Convert to JSON string."""
Error
Base error class.
from myspellchecker.core.response import Error, SyllableError, WordError, ContextError, GrammarError
from myspellchecker.core.constants import ErrorType
@dataclass
class Error:
"""Spelling error."""
text: str
"""The erroneous text (syllable or word)."""
position: int
"""Character position in original text (0-indexed)."""
suggestions: list[str]
"""Suggested corrections, ranked by likelihood."""
error_type: str
"""Type of error ('invalid_syllable', 'invalid_word', etc.)."""
confidence: float = 1.0
"""Confidence score (0.0-1.0). Higher = more certain."""
def to_dict(self) -> dict:
"""Convert to dictionary."""
def to_json(self, indent: int = 2) -> str:
"""Convert to JSON string."""
@dataclass
class SyllableError(Error):
"""Invalid syllable error (Layer 1). Default error_type: 'invalid_syllable'."""
error_type: str = "invalid_syllable"
@dataclass
class WordError(Error):
"""Invalid word error (Layer 2). Default error_type: 'invalid_word'."""
syllable_count: int = 0
error_type: str = "invalid_word"
@dataclass
class ContextError(Error):
"""Context error - unlikely word sequence (Layer 3). Default error_type: 'context_probability'."""
probability: float = 0.0
prev_word: str = ""
error_type: str = "context_probability"
@dataclass
class GrammarError(Error):
"""Grammar-related errors. Default error_type: 'grammar_error'."""
reason: str = ""
error_type: str = "grammar_error"
@property
def word(self) -> str:
"""Alias for 'text' for backward compatibility."""
@property
def suggestion(self) -> str:
"""Return first suggestion for backward compatibility."""
class ErrorType(str, Enum):
# --- Core validation errors ---
SYLLABLE = "invalid_syllable"
WORD = "invalid_word"
CONTEXT_PROBABILITY = "context_probability"
GRAMMAR = "grammar_error"
# --- Syllable-level errors ---
PARTICLE_TYPO = "particle_typo"
MEDIAL_CONFUSION = "medial_confusion"
# --- Colloquial variant errors ---
COLLOQUIAL_VARIANT = "colloquial_variant"
COLLOQUIAL_INFO = "colloquial_info"
# --- Validation strategy errors ---
QUESTION_STRUCTURE = "question_structure"
SYNTAX_ERROR = "syntax_error"
HOMOPHONE_ERROR = "homophone_error"
TONE_AMBIGUITY = "tone_ambiguity"
POS_SEQUENCE_ERROR = "pos_sequence_error"
SEMANTIC_ERROR = "semantic_error"
CONFUSABLE_ERROR = "confusable_error"
# --- Encoding errors ---
ZAWGYI_ENCODING = "zawgyi_encoding"
# --- Grammar checker errors ---
MIXED_REGISTER = "mixed_register"
ASPECT_TYPO = "aspect_typo"
INVALID_SEQUENCE = "invalid_sequence"
INCOMPLETE_ASPECT = "incomplete_aspect"
TYPO = "typo"
AGREEMENT = "agreement"
COMPOUND_TYPO = "compound_typo"
INCOMPLETE_REDUPLICATION = "incomplete_reduplication"
CLASSIFIER_TYPO = "classifier_typo"
# --- Text-level detector errors ---
COLLOQUIAL_CONTRACTION = "colloquial_contraction"
PARTICLE_CONFUSION = "particle_confusion"
HA_HTOE_CONFUSION = "ha_htoe_confusion"
DANGLING_PARTICLE = "dangling_particle"
DANGLING_WORD = "dangling_word"
MISSING_CONJUNCTION = "missing_conjunction"
TENSE_MISMATCH = "tense_mismatch"
REGISTER_MIXING = "register_mixing"
# --- Grammar checker class-level errors ---
NEGATION_ERROR = "negation_error"
REGISTER_ERROR = "register_error"
MERGED_WORD = "merged_word"
ASPECT_ERROR = "aspect_error"
CLASSIFIER_ERROR = "classifier_error"
COMPOUND_ERROR = "compound_error"
# --- Orthography errors ---
MEDIAL_ORDER_ERROR = "medial_order_error"
MEDIAL_COMPATIBILITY_ERROR = "medial_compatibility_error"
VOWEL_AFTER_ASAT = "vowel_after_asat"
BROKEN_VIRAMA = "broken_virama"
BROKEN_STACKING = "broken_stacking"
BROKEN_COMPOUND = "broken_compound"
LEADING_VOWEL_E = "leading_vowel_e"
INCOMPLETE_STACKING = "incomplete_stacking"
# --- Syntactic/semantic errors ---
NEGATION_SFP_MISMATCH = "negation_sfp_mismatch"
MERGED_SFP_CONJUNCTION = "merged_sfp_conjunction"
ASPECT_ADVERB_CONFLICT = "aspect_adverb_conflict"
# --- Punctuation errors ---
DUPLICATE_PUNCTUATION = "duplicate_punctuation"
WRONG_PUNCTUATION = "wrong_punctuation"
MISSING_PUNCTUATION = "missing_punctuation"
# --- Additional detection ---
MISSING_ASAT = "missing_asat"
PARTICLE_MISUSE = "particle_misuse"
COLLOCATION_ERROR = "collocation_error"
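Because ErrorType is a str Enum, members compare equal to the raw strings stored on Error.error_type, so dispatch can use either form. A sketch:
from myspellchecker import SpellChecker
from myspellchecker.core.constants import ErrorType

checker = SpellChecker.create_default()
response = checker.check("မြန်မာစာ")
for error in response.errors:
    if error.error_type == ErrorType.SYLLABLE:            # same as "invalid_syllable"
        print("invalid syllable:", error.text)
    elif error.error_type == ErrorType.CONTEXT_PROBABILITY:
        print("unlikely in context:", error.text, error.suggestions[:1])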
Provider Classes
DictionaryProvider
Abstract provider interface.
from myspellchecker.providers.base import DictionaryProvider
class DictionaryProvider(ABC):
"""Dictionary data provider interface."""
# --- Core abstract methods (must be implemented) ---
def is_valid_syllable(self, syllable: str) -> bool:
"""Check if syllable exists."""
def is_valid_word(self, word: str) -> bool:
"""Check if word exists."""
def get_syllable_frequency(self, syllable: str) -> int:
"""Get syllable corpus frequency count."""
def get_word_frequency(self, word: str) -> int:
"""Get word corpus frequency count."""
def get_word_pos(self, word: str) -> str | None:
"""Get word POS tag (pipe-separated for multi-POS, e.g. 'N|V')."""
def get_bigram_probability(self, prev_word: str, current_word: str) -> float:
"""Get conditional probability P(current_word | prev_word)."""
def get_trigram_probability(self, w1: str, w2: str, w3: str) -> float:
"""Get conditional probability P(w3 | w1, w2)."""
def get_fourgram_probability(self, w1: str, w2: str, w3: str, w4: str) -> float:
"""Get conditional probability P(w4 | w1, w2, w3)."""
def get_fivegram_probability(self, w1: str, w2: str, w3: str, w4: str, w5: str) -> float:
"""Get conditional probability P(w5 | w1, w2, w3, w4)."""
def get_top_continuations(self, prev_word: str, limit: int = 20) -> list[tuple[str, float]]:
"""Get most likely words to follow prev_word, as (word, probability) tuples."""
def get_all_syllables(self) -> Iterator[tuple[str, int]]:
"""Get iterator over all (syllable, frequency) pairs. Used for SymSpell indexing."""
def get_all_words(self) -> Iterator[tuple[str, int]]:
"""Get iterator over all (word, frequency) pairs. Used for SymSpell indexing."""
def get_pos_unigram_probabilities(self) -> dict[str, float]:
"""Get all POS unigram probabilities."""
def get_pos_bigram_probabilities(self) -> dict[tuple[str, str], float]:
"""Get all POS bigram probabilities."""
def get_pos_trigram_probabilities(self) -> dict[tuple[str, str, str], float]:
"""Get all POS trigram probabilities."""
# --- Bulk operations (default implementations, override for optimization) ---
def is_valid_syllables_bulk(self, syllables: list[str]) -> dict[str, bool]:
"""Check validity of multiple syllables in a single operation."""
def is_valid_words_bulk(self, words: list[str]) -> dict[str, bool]:
"""Check validity of multiple words in a single operation."""
def get_syllable_frequencies_bulk(self, syllables: list[str]) -> dict[str, int]:
"""Get corpus frequencies for multiple syllables."""
def get_word_frequencies_bulk(self, words: list[str]) -> dict[str, int]:
"""Get corpus frequencies for multiple words."""
def get_word_pos_bulk(self, words: list[str]) -> dict[str, str | None]:
"""Get POS tags for multiple words."""
# --- Convenience methods ---
def has_syllable(self, syllable: str) -> bool:
"""Pure existence check for syllable (delegates to is_valid_syllable)."""
def has_word(self, word: str) -> bool:
"""Pure existence check for word (delegates to is_valid_word)."""
def __contains__(self, item: str) -> bool:
"""Support 'in' operator: checks syllables first, then words."""
# --- Factory method ---
@classmethod
def create(cls, provider_type: str = "sqlite", **kwargs) -> "DictionaryProvider":
"""Factory method to create provider instances ('sqlite', 'memory', 'json', 'csv')."""
Note: close() is not defined on the base class. It is available on SQLiteProvider to release connection pool resources.
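A sketch using the factory method; keyword arguments are assumed to be forwarded to the concrete provider's constructor (here, the SQLiteProvider signature below):
from myspellchecker.providers.base import DictionaryProvider

provider = DictionaryProvider.create("sqlite", database_path="/path/to/db.sqlite")
print(provider.is_valid_word("မြန်မာ"))
print("မြန်မာ" in provider)  # __contains__: checks syllables first, then words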
SQLiteProvider
SQLite-based provider.
from myspellchecker.providers import SQLiteProvider
class SQLiteProvider(DictionaryProvider):
"""SQLite-based dictionary provider."""
def __init__(
self,
database_path: str | None = None,
cache_size: int = 8192,
check_same_thread: bool = False,
pos_tagger: POSTaggerBase = None,
pool_min_size: int | None = None,
pool_max_size: int | None = None,
pool_timeout: float | None = None,
pool_max_connection_age: float | None = None,
sqlite_timeout: float | None = None,
cache_manager: CacheManager = None,
curated_min_frequency: int = 0,
):
"""
Initialize SQLite provider.
Args:
database_path: Database path (None for default)
cache_size: LRU cache size for frequency lookups (default: 8192)
check_same_thread: Allow sharing connection between threads (default: False)
pos_tagger: Optional POS tagger for OOV word tagging
pool_min_size: Minimum connections in pool (default: ConnectionPoolConfig.min_size)
pool_max_size: Maximum connections in pool (default: ConnectionPoolConfig.max_size)
pool_timeout: Connection checkout timeout in seconds (default: ConnectionPoolConfig.timeout)
pool_max_connection_age: Max connection age before recreation (default: ConnectionPoolConfig.max_connection_age)
sqlite_timeout: SQLite busy timeout in seconds (default: ConnectionPoolConfig value)
cache_manager: Optional CacheManager for dependency injection
curated_min_frequency: Minimum frequency for curated lexicon entries (default: 0)
"""
MemoryProvider
In-memory provider optimized for fast lookups.
from myspellchecker import MemoryProvider
class MemoryProvider(DictionaryProvider):
"""In-memory dictionary provider using Python dictionaries."""
def __init__(
self,
syllables: dict[str, int] = None,
words: dict[str, int] = None,
bigrams: dict[tuple[str, str], float] = None,
trigrams: dict[tuple[str, str, str], float] = None,
word_pos: dict[str, str] = None,
):
"""
Initialize MemoryProvider with optional pre-populated data.
Args:
syllables: Dictionary mapping syllable -> frequency count
words: Dictionary mapping word -> frequency count
bigrams: Dictionary mapping (prev_word, curr_word) -> probability
trigrams: Dictionary mapping (word1, word2, word3) -> probability
word_pos: Dictionary mapping word -> POS tag
"""
def add_syllable(self, syllable: str, frequency: int = 1) -> None:
"""Add a syllable with optional frequency."""
def add_word(self, word: str, frequency: int = 1) -> None:
"""Add a word with optional frequency."""
Algorithm Classes
SymSpell
Symmetric delete spell checking.
from myspellchecker.algorithms.symspell import SymSpell, Suggestion
class SymSpell:
"""SymSpell algorithm for O(1) suggestions."""
def __init__(
self,
provider: DictionaryProvider,
max_edit_distance: int = 2,
prefix_length: int = 10,
count_threshold: int = 1,
):
"""
Initialize SymSpell with a dictionary provider.
Note: The class constructor default for count_threshold is 1,
but SymSpellConfig sets its default to 50. When constructed
via SpellCheckerConfig, the config value (50) takes precedence.
"""
def build_index(self, levels: list[str]) -> None:
"""Build delete index for specified levels ('syllable', 'word')."""
def lookup(
self,
term: str,
level: str = "syllable",
max_suggestions: int = 5,
include_known: bool = False,
use_phonetic: bool = False,
) -> list[Suggestion]:
"""
Look up suggestions for a term.
Returns:
List of Suggestion with term, edit_distance, frequency
"""
NgramContextChecker
N-gram based context checker.
from myspellchecker.algorithms.ngram_context_checker import NgramContextChecker
class NgramContextChecker:
"""N-gram based context validation."""
def __init__(
self,
provider: DictionaryProvider,
config: NgramContextConfig | None = None,
symspell: SymSpell | None = None,
pos_unigram_probs: dict[str, float] | None = None,
pos_bigram_probs: dict[tuple[str, str], float] | None = None,
):
"""Initialize context checker.
All thresholds and weights are configured via NgramContextConfig.
"""
def get_smoothed_bigram_probability(self, word1: str, word2: str) -> float:
"""Get smoothed P(word2 | word1)."""
def get_smoothed_trigram_probability(self, word1: str, word2: str, word3: str) -> float:
"""Get smoothed P(word3 | word1, word2)."""
def is_contextual_error(
self,
prev_word: str,
current_word: str,
prev_prev_word: Optional[str] = None,
next_word: Optional[str] = None,
threshold: Optional[float] = None,
) -> bool:
"""Check if a word is a contextual error given surrounding context."""
def suggest(
self,
prev_word: str,
current_word: str,
max_edit_distance: int = 2,
next_word: Optional[str] = None,
) -> list[ContextSuggestion]:
"""Generate context-aware suggestions for a word."""
SemanticChecker
Deep learning based context checker.
from myspellchecker.algorithms.semantic_checker import SemanticChecker
class SemanticChecker:
"""ONNX-based semantic context checker."""
def __init__(
self,
model_path: str = None,
tokenizer_path: str = None,
model: Any = None,
tokenizer: Any = None,
num_threads: int = 1,
predict_top_k: int = 5,
check_top_k: int = 10,
use_pytorch: bool = False,
allow_extended_myanmar: bool = False,
):
"""Initialize semantic checker."""
def is_semantic_error(
self,
sentence: str,
word: str,
neighbors: list[str],
) -> Optional[str]:
"""Check if word is a semantic error using AI. Returns suggestion or None."""
def predict_mask(
self,
sentence: str,
target_word: str,
top_k: int = None,
occurrence: int = 0,
) -> list[tuple[str, float]]:
"""Predict most likely words for a masked position."""
Segmenter Classes
DefaultSegmenter
Default text segmenter.
from myspellchecker.segmenters import DefaultSegmenter
class DefaultSegmenter(Segmenter):
"""Default Myanmar text segmenter using a hybrid approach."""
def __init__(
self,
word_engine: str = "myword",
allow_extended_myanmar: bool = False,
seg_model: Optional[str] = None,
seg_device: int = -1,
):
"""
Initialize segmenter.
Args:
word_engine: Word segmentation engine ("myword", "crf", or "transformer")
allow_extended_myanmar: Accept Extended Myanmar characters (U+1050-U+109F,
U+AA60-U+AA7F, U+A9E0-U+A9FF)
seg_model: Custom model name for transformer engine (optional)
seg_device: Device for transformer inference (-1=CPU, 0+=GPU)
"""
def segment_syllables(self, text: str) -> list[str]:
"""Segment text into syllables."""
def segment_words(self, text: str) -> list[str]:
"""Segment text into words."""
def segment_sentences(self, text: str) -> list[str]:
"""Segment text into sentences using heuristics."""
def load_custom_dictionary(self, words: list[str]) -> None:
"""Load custom dictionary words (myword engine only)."""
Utility Functions
Text Normalization
from myspellchecker.text.normalize import (
normalize,
normalize_for_lookup,
)
def normalize(
text: str,
form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFC",
remove_zero_width: bool = True,
reorder_diacritics: bool = True,
normalize_variants: bool = False,
normalize_tall_aa: bool = True,
normalize_u_asat: bool = True,
) -> str:
"""
Normalize Myanmar text with configurable steps.
Args:
text: Input Myanmar text
form: Unicode normalization form
remove_zero_width: Remove zero-width characters
reorder_diacritics: Apply Myanmar-specific diacritic reordering (UTN #11)
normalize_variants: Map character variants to canonical forms
normalize_tall_aa: Correct Tall AA after Medial Wa (default: True)
normalize_u_asat: Convert independent vowel U + asat to consonant form (default: True)
"""
def normalize_for_lookup(
text: str,
convert_zawgyi: bool = True,
config: Optional[ZawgyiConfig] = None,
) -> str:
"""Unified normalization for all dictionary/index lookups (includes Zawgyi conversion)."""
# For direct Cython function access (requires compiled extensions):
from myspellchecker.text.normalize_c import (
remove_zero_width_chars,
reorder_myanmar_diacritics,
get_myanmar_ratio,
)
# For higher-level normalization with presets:
from myspellchecker.text.normalization_service import (
NormalizationService,
normalize_for_spell_checking,
normalize_for_lookup,
normalize_for_comparison,
)
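A sketch chaining the two entry points; prefer normalize_for_lookup before any dictionary or index query, since it also handles Zawgyi input:
from myspellchecker.text.normalize import normalize, normalize_for_lookup

clean = normalize("မြန်မာ", form="NFC", remove_zero_width=True)
key = normalize_for_lookup(clean)  # includes Zawgyi -> Unicode conversion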
Logging Configuration
from myspellchecker.utils.logging_utils import configure_logging
def configure_logging(
level: Union[int, str] = logging.INFO,
format_string: str = None,
stream: TextIO = None,
json_output: bool = False,
debug_mode: bool = False,
) -> None:
"""Configure logging for the library."""
Exceptions
from myspellchecker.core.exceptions import (
MyanmarSpellcheckError,
ConfigurationError,
InvalidConfigError,
DataLoadingError,
MissingDatabaseError,
ProcessingError,
ValidationError,
TokenizationError,
NormalizationError,
ProviderError,
ConnectionPoolError,
PipelineError,
IngestionError,
PackagingError,
ModelError,
ModelLoadError,
InferenceError,
MissingDependencyError,
InsufficientStorageError,
CacheError,
)
Exception hierarchy:
MyanmarSpellcheckError (base)
├── ConfigurationError
│   └── InvalidConfigError
├── DataLoadingError
│   └── MissingDatabaseError
├── ProcessingError
│   ├── ValidationError
│   ├── TokenizationError
│   └── NormalizationError
├── ProviderError
│   └── ConnectionPoolError
├── PipelineError
│   ├── IngestionError
│   └── PackagingError
├── ModelError
│   ├── ModelLoadError
│   └── InferenceError
├── MissingDependencyError
├── InsufficientStorageError
└── CacheError
class MyanmarSpellcheckError(Exception):
"""Base exception for all spell checker errors."""
class ConfigurationError(MyanmarSpellcheckError):
"""Configuration-related errors."""
class InvalidConfigError(ConfigurationError):
"""Specific configuration value is invalid."""
class DataLoadingError(MyanmarSpellcheckError):
"""Data loading errors."""
class MissingDatabaseError(DataLoadingError):
"""Spell checker database not found. Includes searched_paths and suggestion attributes."""
class ProcessingError(MyanmarSpellcheckError):
"""Text processing errors (base for validation/tokenization/normalization)."""
class ValidationError(ProcessingError):
"""Validation processing errors."""
class TokenizationError(ProcessingError):
"""Text tokenization/segmentation errors."""
class NormalizationError(ProcessingError):
"""Text normalization errors."""
class ProviderError(MyanmarSpellcheckError):
"""Provider-related errors."""
class ConnectionPoolError(ProviderError):
"""Connection pool errors (exhaustion, creation failures)."""
class PipelineError(MyanmarSpellcheckError):
"""Data pipeline errors."""
class IngestionError(PipelineError):
"""Corpus ingestion errors. Has failed_files and missing_files attributes."""
class PackagingError(PipelineError):
"""Database packaging errors."""
class ModelError(MyanmarSpellcheckError):
"""Machine learning model errors."""
class ModelLoadError(ModelError):
"""Model loading failures."""
class InferenceError(ModelError):
"""Model inference failures."""
class MissingDependencyError(MyanmarSpellcheckError):
"""Required external dependency is missing."""
class InsufficientStorageError(MyanmarSpellcheckError):
"""Not enough disk space for operation."""
class CacheError(MyanmarSpellcheckError):
"""Caching operation failures."""
Convenience Functions
check_text()
Quick one-off spell check without constructing a SpellChecker instance:
from myspellchecker import check_text
result = check_text("မြန်မာ", level="syllable", database_path=None)
# Returns: Response object
| Parameter | Type | Default | Description |
|---|---|---|---|
| text | str | required | Myanmar text to check |
| level | str | "syllable" | Validation level: "syllable" or "word" |
| database_path | str \| None | None | Custom database path (None = auto-detect) |
Internationalization (i18n)
from myspellchecker import set_language, get_language, get_message, get_supported_languages
# Get available languages
langs = get_supported_languages() # ["en", "my", ...]
# Set language for error messages
set_language("my")
# Get current language
lang = get_language() # "my"
# Get localized message
msg = get_message("invalid_syllable")
classify_action()
Classify the recommended action for an error based on its type and confidence:
from myspellchecker import classify_action, ActionType
action = classify_action(error_type="particle_typo", confidence=0.95)
# Returns: ActionType.AUTO_FIX
action = classify_action(error_type="context_probability", confidence=0.6)
# Returns: ActionType.SUGGEST
ActionType.AUTO_FIX (safe to apply), ActionType.SUGGEST (show to user), or ActionType.INFORM (advisory only).
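A sketch combining classify_action with a Response to auto-apply only safe fixes; the triage loop itself is illustrative, not a library helper:
from myspellchecker import SpellChecker, classify_action, ActionType

checker = SpellChecker.create_default()
response = checker.check("မြန်မာစာ")
for error in response.errors:
    action = classify_action(error_type=error.error_type, confidence=error.confidence)
    if action == ActionType.AUTO_FIX and error.suggestions:
        pass  # safe to replace error.text with error.suggestions[0]
    elif action == ActionType.SUGGEST:
        pass  # surface error.suggestions to the user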
Streaming
StreamingChecker
Memory-efficient streaming interface for processing large text files.
from myspellchecker.core.streaming import StreamingChecker, StreamingConfig, StreamingStats, ChunkResult
class StreamingChecker:
"""Streaming interface for SpellChecker."""
def __init__(
self,
checker: SpellChecker,
config: StreamingConfig | None = None,
):
"""
Args:
checker: SpellChecker instance to use for validation.
config: StreamingConfig for tuning behavior (default: StreamingConfig()).
"""
def check_stream(
self,
input_stream: TextIO | IO[str] | Iterator[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
on_progress: Callable[[StreamingStats], None] | None = None,
stats: StreamingStats | None = None,
) -> Iterator[ChunkResult]:
"""Stream spell check results line-by-line from an input stream."""
async def check_stream_async(
self,
input_stream: AsyncTextReader | AsyncIterator[str],
level: ValidationLevel = ValidationLevel.SYLLABLE,
on_progress: Callable[[StreamingStats], None] | None = None,
stats: StreamingStats | None = None,
) -> AsyncIterator[ChunkResult]:
"""Async version of check_stream. Uses asyncio.to_thread for CPU-bound checking."""
def check_sentences(
self,
text: str,
level: ValidationLevel = ValidationLevel.WORD,
on_progress: Callable[[StreamingStats], None] | None = None,
) -> Iterator[ChunkResult]:
"""Check text sentence-by-sentence with cross-sentence context preservation."""
StreamingConfig
class StreamingConfig:
chunk_size: int = 100 # Lines per chunk
max_memory_mb: int = 100 # Memory limit before backpressure
sentence_boundary_pattern: str = r"[။!?]+" # Sentence boundary regex
enable_cross_sentence_context: bool = True
progress_interval: int = 1000 # Lines between progress callbacks
timeout_per_chunk: float = 30.0 # Max seconds per chunk
StreamingStats
class StreamingStats:
bytes_processed: int = 0
lines_processed: int = 0
sentences_processed: int = 0
errors_found: int = 0
chunks_processed: int = 0
current_memory_mb: float = 0.0
@property
def elapsed_time(self) -> float: ... # Seconds since start
@property
def lines_per_second(self) -> float: ... # Processing rate
def to_dict(self) -> dict[str, Any]: ... # For serialization
ChunkResult
class ChunkResult:
response: Response # The spell check result
line_number: int # Source line number
chunk_index: int # Sequential chunk index
is_final: bool # True for the last chunk
from myspellchecker import SpellChecker
from myspellchecker.core.streaming import StreamingChecker
checker = SpellChecker()
streaming = StreamingChecker(checker)
with open("large_file.txt") as f:
for result in streaming.check_stream(f):
if result.response.has_errors:
print(f"Line {result.line_number}: {result.response.errors}")
Module Index
| Module | Description | Documentation |
|---|---|---|
| myspellchecker | Main package exports | This page |
| myspellchecker.core | Core classes and config | This page |
| myspellchecker.algorithms | Spell check algorithms | Algorithms |
| myspellchecker.providers | Dictionary providers | Provider Capabilities |
| myspellchecker.segmenters | Text segmenters | This page |
| myspellchecker.tokenizers | Low-level tokenizers | Tokenizers API |
| myspellchecker.utils | Utility functions | This page |
| myspellchecker.data_pipeline | Dictionary building | Data Pipeline |
| myspellchecker.training | Model training | Training |
Next Steps
- Getting Started - Quick start guide
- Configuration - Configuration options
- CLI Reference - Command-line interface