API Reference

This page covers the public classes, configuration, response objects, providers, algorithms, and utility functions available in the library.

Core Classes

SpellChecker

The main spell checking class.
from myspellchecker import SpellChecker

class SpellChecker:
    """
    Main spell checker class for Myanmar text validation.

    Attributes:
        config: SpellCheckerConfig instance
        provider: Dictionary provider
        segmenter: Text segmenter
    """

    def __init__(
        self,
        config: SpellCheckerConfig = None,
        segmenter: Segmenter = None,
        provider: DictionaryProvider = None,
        syllable_validator: SyllableValidator = None,
        word_validator: WordValidator = None,
        context_validator: ContextValidator = None,
        factory: ComponentFactoryProtocol = None,
    ):
        """
        Initialize SpellChecker.

        Args:
            config: Configuration settings (default: balanced preset)
            segmenter: Custom Segmenter for text tokenization (default: DefaultSegmenter)
            provider: Dictionary provider (default: SQLiteProvider)
            syllable_validator: Custom SyllableValidator (advanced use)
            word_validator: Custom WordValidator (advanced use)
            context_validator: Custom ContextValidator (advanced use)
            factory: Custom ComponentFactory for dependency injection (advanced use)
        """

    # --- Factory Methods ---

    @classmethod
    def create_default(cls) -> "SpellChecker":
        """Create SpellChecker with default settings (balanced performance/accuracy)."""

    @classmethod
    def create_fast(cls) -> "SpellChecker":
        """Create SpellChecker optimized for speed (disables context checking, NER, phonetic)."""

    @classmethod
    def create_accurate(cls) -> "SpellChecker":
        """Create SpellChecker optimized for accuracy (higher edit distance, lower thresholds)."""

    @classmethod
    def create_minimal(cls) -> "SpellChecker":
        """Create SpellChecker with minimal features (basic syllable validation only)."""

    @classmethod
    def create_strict(cls) -> "SpellChecker":
        """Create SpellChecker with strict error detection (conservative thresholds)."""

    # --- Core Methods ---

    def check(
        self,
        text: str,
        level: ValidationLevel = ValidationLevel.SYLLABLE,
        use_semantic: Optional[bool] = None,
    ) -> Response:
        """
        Check text for spelling errors.

        Args:
            text: Myanmar text to check
            level: Validation level (SYLLABLE or WORD)
            use_semantic: Override semantic checking for this call

        Returns:
            Response containing errors and suggestions
        """

    async def check_async(
        self,
        text: str,
        level: ValidationLevel = ValidationLevel.SYLLABLE,
        use_semantic: Optional[bool] = None,
    ) -> Response:
        """
        Asynchronously check text for spelling errors.

        Runs the CPU-bound check() in a separate thread via asyncio.to_thread().

        Args:
            text: Myanmar text to check
            level: Validation level (SYLLABLE or WORD)
            use_semantic: Override semantic checking for this call

        Returns:
            Response containing errors and suggestions
        """

    def check_batch(
        self,
        texts: list[str],
        level: ValidationLevel = ValidationLevel.SYLLABLE,
    ) -> list[Response]:
        """
        Check multiple texts sequentially.

        Args:
            texts: List of texts to check
            level: Validation level (SYLLABLE or WORD)

        Returns:
            List of Response objects
        """

    async def check_batch_async(
        self,
        texts: list[str],
        level: ValidationLevel = ValidationLevel.SYLLABLE,
        max_concurrency: int = 4,
    ) -> list[Response]:
        """
        Asynchronously check multiple texts with configurable concurrency.

        Args:
            texts: List of texts to check
            level: Validation level (SYLLABLE or WORD)
            max_concurrency: Maximum concurrent operations (default: 4)

        Returns:
            List of Response objects
        """

    def get_pos_tags(self, text: str = "", words: list[str] = None) -> list[str]:
        """
        Get the most likely POS tag sequence for text or pre-segmented words.

        Args:
            text: Input text to tag (optional if words is provided)
            words: Pre-segmented words (optional if text is provided)

        Returns:
            List of POS tags, one per word.
        """

    def segment_and_tag(self, text: str) -> tuple[list[str], list[str]]:
        """
        Perform joint word segmentation and POS tagging.

        Uses joint Viterbi decoder if enabled (config.joint.enabled=True),
        otherwise falls back to sequential segmentation then tagging.

        Args:
            text: Text to segment

        Returns:
            Tuple of (words, tags)
        """

    def get_stemmer(self) -> "Stemmer":
        """Get a Stemmer instance for suffix stripping."""

    def close(self) -> None:
        """Close and release resources."""

    def __enter__(self) -> "SpellChecker":
        """Context manager entry."""

    def __exit__(self, *args) -> None:
        """Context manager exit with cleanup."""

    # --- Properties ---

    @property
    def symspell(self) -> Optional[SymSpell]:
        """Access SymSpell instance for direct suggestion lookups."""

    @property
    def context_checker(self) -> Optional[NgramContextChecker]:
        """Access NgramContextChecker for N-gram probability lookups."""

    @property
    def syllable_rule_validator(self) -> Optional[SyllableRuleValidator]:
        """Access SyllableRuleValidator for Myanmar orthographic validation."""

    @property
    def name_heuristic(self) -> Optional[NameHeuristic]:
        """Access NameHeuristic for proper noun detection."""

    @property
    def semantic_checker(self) -> Optional[SemanticChecker]:
        """Access SemanticChecker for AI-powered error detection."""

    @property
    def phonetic_hasher(self) -> Optional[PhoneticHasher]:
        """Access PhoneticHasher for phonetic similarity matching."""

SpellCheckerBuilder

Fluent builder for SpellChecker construction.
from myspellchecker.core import SpellCheckerBuilder

class SpellCheckerBuilder:
    """Fluent builder for SpellChecker instances."""

    def with_config(self, config: SpellCheckerConfig) -> "SpellCheckerBuilder":
        """Set the full configuration object."""

    def with_provider(self, provider: DictionaryProvider) -> "SpellCheckerBuilder":
        """Set a custom dictionary provider."""

    def with_segmenter(self, segmenter: Segmenter) -> "SpellCheckerBuilder":
        """Set a custom text segmenter."""

    def with_phonetic(self, enabled: bool = True) -> "SpellCheckerBuilder":
        """Enable or disable phonetic similarity matching."""

    def with_context_checking(self, enabled: bool = True) -> "SpellCheckerBuilder":
        """Enable or disable N-gram context checking."""

    def with_ner(self, enabled: bool = True) -> "SpellCheckerBuilder":
        """Enable or disable Named Entity Recognition heuristics."""

    def with_rule_based_validation(self, enabled: bool = True) -> "SpellCheckerBuilder":
        """Enable or disable rule-based syllable validation."""

    def with_max_edit_distance(self, distance: int) -> "SpellCheckerBuilder":
        """Set maximum edit distance for suggestions (1-3)."""

    def with_max_suggestions(self, count: int) -> "SpellCheckerBuilder":
        """Set maximum number of suggestions per error."""

    def with_symspell_prefix_length(self, length: int) -> "SpellCheckerBuilder":
        """Set SymSpell prefix length for performance optimization (typically 5-10)."""

    def with_cache_size(self, size: int) -> "SpellCheckerBuilder":
        """Set provider cache size for memory optimization."""

    def with_bigram_threshold(self, threshold: float) -> "SpellCheckerBuilder":
        """Set probability threshold for flagging bigram errors (0.0-1.0)."""

    def with_trigram_threshold(self, threshold: float) -> "SpellCheckerBuilder":
        """Set probability threshold for flagging trigram errors (0.0-1.0)."""

    def with_semantic_model(
        self,
        model_path: str = None,
        tokenizer_path: str = None,
        model: Any = None,
        tokenizer: Any = None,
    ) -> "SpellCheckerBuilder":
        """Configure semantic checking model (paths or pre-loaded instances)."""

    def with_word_engine(
        self, engine: Literal["myword", "crf", "transformer"]
    ) -> "SpellCheckerBuilder":
        """Set the word segmentation engine."""

    def with_pos_tagger(
        self,
        tagger_type: Literal["rule_based", "transformer", "viterbi", "custom"] = "rule_based",
        model_name: str = None,
        device: int = -1,
    ) -> "SpellCheckerBuilder":
        """Configure the Part-of-Speech tagger."""

    def with_joint_segmentation(
        self,
        enabled: bool = True,
        beam_width: int = 15,
    ) -> "SpellCheckerBuilder":
        """Configure Joint Segmentation and POS Tagging."""

    def build(self) -> SpellChecker:
        """Construct SpellChecker with all configured options."""
Example:
from myspellchecker.core.builder import SpellCheckerBuilder
from myspellchecker.providers import SQLiteProvider

# Using custom provider
provider = SQLiteProvider(database_path="/path/to/db.sqlite")
checker = (
    SpellCheckerBuilder()
    .with_provider(provider)
    .with_phonetic(True)
    .with_context_checking(True)
    .build()
)

ConfigPresets

Pre-configured SpellCheckerConfig instances for common use cases.
from myspellchecker.core.builder import ConfigPresets

# Use a preset directly
checker = SpellChecker(config=ConfigPresets.FAST)

# Customize a preset (each access returns a deep copy, safe to modify)
config = ConfigPresets.ACCURATE
config.max_suggestions = 10
checker = SpellChecker(config=config)
Available presets: DEFAULT, FAST, ACCURATE, MINIMAL, STRICT.

Configuration Classes

SpellCheckerConfig

Main configuration class (Pydantic BaseModel).
from myspellchecker.core.config import SpellCheckerConfig, get_profile

class SpellCheckerConfig(BaseModel):
    """Spell checker configuration (Pydantic BaseModel)."""

    # Core dependencies (runtime objects)
    segmenter: Optional[Segmenter] = None
    provider: Optional[DictionaryProvider] = None

    # Suggestion settings
    max_suggestions: int = 5
    max_edit_distance: int = 2  # Range: 1-3

    # Feature toggles
    use_phonetic: bool = True
    use_context_checker: bool = True
    use_ner: bool = True
    use_rule_based_validation: bool = True

    # Word segmentation
    word_engine: Literal["myword", "crf", "transformer"] = "myword"
    seg_model: Optional[str] = None      # Custom model for transformer engine
    seg_device: int = -1                  # -1=CPU, 0+=GPU (transformer only)

    # Behavior
    fallback_to_empty_provider: bool = False  # Allow empty MemoryProvider if DB not found

    # Nested configurations (each defaults to a new instance with its own defaults)
    symspell: SymSpellConfig = SymSpellConfig()
    ngram_context: NgramContextConfig = NgramContextConfig()
    phonetic: PhoneticConfig = PhoneticConfig()
    pos_tagger: POSTaggerConfig = POSTaggerConfig()
    semantic: SemanticConfig = SemanticConfig()
    validation: ValidationConfig = ValidationConfig()
    provider_config: ProviderConfig = ProviderConfig()
    joint: JointConfig = JointConfig()
    cache: AlgorithmCacheConfig = AlgorithmCacheConfig()
    ranker: RankerConfig = RankerConfig()

# Use get_profile() for presets:
config = get_profile("development") # Fast iteration, minimal validation
config = get_profile("production")  # Balanced (default)
config = get_profile("testing")     # Deterministic, reproducible
config = get_profile("fast")        # Maximum speed
config = get_profile("accurate")    # Maximum accuracy

ValidationLevel

Enum for validation depth.
from myspellchecker.core.constants import ValidationLevel

class ValidationLevel(str, Enum):
    SYLLABLE = "syllable"  # Fast syllable-only validation
    WORD = "word"          # Thorough word + context validation
Note: Validation level is passed to check() and other methods, not as a configuration option.

POSTaggerConfig

POS tagger configuration.
class POSTaggerConfig(BaseModel):
    """POS tagger configuration (pydantic model)."""

    tagger_type: str = "rule_based"  # "rule_based", "viterbi", "transformer"
    model_name: str | None = None    # HuggingFace model ID (for transformer)
    device: int = -1                 # -1 for CPU, 0+ for GPU
    batch_size: int = 32
    cache_size: int = 10000          # LRU cache size
    use_morphology_fallback: bool = True
    beam_width: int = 10             # For Viterbi tagger
    unknown_tag: str = "UNK"         # Tag for unknown words

SemanticConfig

Semantic checker configuration.
class SemanticConfig(BaseModel):
    """Semantic checker configuration (Pydantic BaseModel)."""

    model_path: str = None
    tokenizer_path: str = None
    model: Any = None  # Pre-loaded ONNX session
    tokenizer: Any = None  # Pre-loaded tokenizer
    num_threads: int = 1            # ONNX inference threads
    predict_top_k: int = 5          # Top-K predictions
    check_top_k: int = 10           # Tokens to check
    use_semantic_refinement: bool = True
    use_proactive_scanning: bool = False  # AI-powered error detection
    proactive_confidence_threshold: float = 0.5  # Threshold for proactive scanning

Response Classes

Response

Result of spell checking.
from myspellchecker.core.response import Response

@dataclass
class Response:
    """Result of spell checking."""

    text: str
    """Original input text (unchanged)."""

    corrected_text: str
    """Auto-corrected text using top suggestions."""

    has_errors: bool
    """True if any errors detected."""

    level: str
    """Validation level used ('syllable' or 'word')."""

    errors: list[Error]
    """List of Error objects (SyllableError, WordError, ContextError, GrammarError)."""

    metadata: dict
    """Additional metadata (processing_time, layers_applied, etc.)."""

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""

    def to_json(self, indent: int = 2) -> str:
        """Convert to JSON string."""

Error

Base error class.
from myspellchecker.core.response import Error, SyllableError, WordError, ContextError, GrammarError
from myspellchecker.core.constants import ErrorType

@dataclass
class Error:
    """Spelling error."""

    text: str
    """The erroneous text (syllable or word)."""

    position: int
    """Character position in original text (0-indexed)."""

    suggestions: list[str]
    """Suggested corrections, ranked by likelihood."""

    error_type: str
    """Type of error ('invalid_syllable', 'invalid_word', etc.)."""

    confidence: float = 1.0
    """Confidence score (0.0-1.0). Higher = more certain."""

    def to_dict(self) -> dict:
        """Convert to dictionary."""

    def to_json(self, indent: int = 2) -> str:
        """Convert to JSON string."""

    def get_localized_message(self, language: str = None) -> str:
        """Get a localized error message ('en' or 'my')."""

@dataclass
class SyllableError(Error):
    """Invalid syllable error (Layer 1). Default error_type: 'invalid_syllable'."""
    error_type: str = "invalid_syllable"

@dataclass
class WordError(Error):
    """Invalid word error (Layer 2). Default error_type: 'invalid_word'."""
    syllable_count: int = 0
    error_type: str = "invalid_word"

@dataclass
class ContextError(Error):
    """Context error - unlikely word sequence (Layer 3). Default error_type: 'context_probability'."""
    probability: float = 0.0
    prev_word: str = ""
    error_type: str = "context_probability"

@dataclass
class GrammarError(Error):
    """Grammar-related errors. Default error_type: 'grammar_error'."""
    reason: str = ""
    error_type: str = "grammar_error"

    @property
    def word(self) -> str:
        """Alias for 'text' for backward compatibility."""

    @property
    def suggestion(self) -> str:
        """Return first suggestion for backward compatibility."""

class ErrorType(str, Enum):
    SYLLABLE = "invalid_syllable"
    WORD = "invalid_word"
    CONTEXT_PROBABILITY = "context_probability"
    GRAMMAR = "grammar_error"
    PARTICLE_TYPO = "particle_typo"
    MEDIAL_CONFUSION = "medial_confusion"
    COLLOQUIAL_VARIANT = "colloquial_variant"
    COLLOQUIAL_INFO = "colloquial_info"
    QUESTION_STRUCTURE = "question_structure"
    SYNTAX_ERROR = "syntax_error"
    HOMOPHONE_ERROR = "homophone_error"
    TONE_AMBIGUITY = "tone_ambiguity"
    POS_SEQUENCE_ERROR = "pos_sequence_error"
    SEMANTIC_ERROR = "semantic_error"
    ZAWGYI_ENCODING = "zawgyi_encoding"
    MIXED_REGISTER = "mixed_register"
    ASPECT_TYPO = "aspect_typo"
    INVALID_SEQUENCE = "invalid_sequence"
    INCOMPLETE_ASPECT = "incomplete_aspect"
    TYPO = "typo"
    AGREEMENT = "agreement"
    COMPOUND_TYPO = "compound_typo"
    INCOMPLETE_REDUPLICATION = "incomplete_reduplication"
    CLASSIFIER_TYPO = "classifier_typo"

Provider Classes

DictionaryProvider

Abstract provider interface.
from myspellchecker.providers.base import DictionaryProvider

class DictionaryProvider(ABC):
    """Dictionary data provider interface."""

    # --- Core abstract methods (must be implemented) ---

    def is_valid_syllable(self, syllable: str) -> bool:
        """Check if syllable exists."""

    def is_valid_word(self, word: str) -> bool:
        """Check if word exists."""

    def get_syllable_frequency(self, syllable: str) -> int:
        """Get syllable corpus frequency count."""

    def get_word_frequency(self, word: str) -> int:
        """Get word corpus frequency count."""

    def get_word_pos(self, word: str) -> str | None:
        """Get word POS tag (pipe-separated for multi-POS, e.g. 'N|V')."""

    def get_bigram_probability(self, prev_word: str, current_word: str) -> float:
        """Get conditional probability P(current_word | prev_word)."""

    def get_trigram_probability(self, w1: str, w2: str, w3: str) -> float:
        """Get conditional probability P(w3 | w1, w2)."""

    def get_top_continuations(self, prev_word: str, limit: int = 20) -> list[tuple[str, float]]:
        """Get most likely words to follow prev_word, as (word, probability) tuples."""

    def get_all_syllables(self) -> Iterator[tuple[str, int]]:
        """Get iterator over all (syllable, frequency) pairs. Used for SymSpell indexing."""

    def get_all_words(self) -> Iterator[tuple[str, int]]:
        """Get iterator over all (word, frequency) pairs. Used for SymSpell indexing."""

    def get_pos_unigram_probabilities(self) -> dict[str, float]:
        """Get all POS unigram probabilities."""

    def get_pos_bigram_probabilities(self) -> dict[tuple[str, str], float]:
        """Get all POS bigram probabilities."""

    def get_pos_trigram_probabilities(self) -> dict[tuple[str, str, str], float]:
        """Get all POS trigram probabilities."""

    # --- Bulk operations (default implementations, override for optimization) ---

    def is_valid_syllables_bulk(self, syllables: list[str]) -> dict[str, bool]:
        """Check validity of multiple syllables in a single operation."""

    def is_valid_words_bulk(self, words: list[str]) -> dict[str, bool]:
        """Check validity of multiple words in a single operation."""

    def get_syllable_frequencies_bulk(self, syllables: list[str]) -> dict[str, int]:
        """Get corpus frequencies for multiple syllables."""

    def get_word_frequencies_bulk(self, words: list[str]) -> dict[str, int]:
        """Get corpus frequencies for multiple words."""

    def get_word_pos_bulk(self, words: list[str]) -> dict[str, str | None]:
        """Get POS tags for multiple words."""

    # --- Convenience methods ---

    def has_syllable(self, syllable: str) -> bool:
        """Pure existence check for syllable (delegates to is_valid_syllable)."""

    def has_word(self, word: str) -> bool:
        """Pure existence check for word (delegates to is_valid_word)."""

    def __contains__(self, item: str) -> bool:
        """Support 'in' operator: checks syllables first, then words."""

    # --- Factory method ---

    @classmethod
    def create(cls, provider_type: str = "sqlite", **kwargs) -> "DictionaryProvider":
        """Factory method to create provider instances ('sqlite', 'memory', 'json', 'csv')."""

    def close(self) -> None:
        """Close and cleanup (optional, not all providers need this)."""

SQLiteProvider

SQLite-based provider.
from myspellchecker.providers import SQLiteProvider

class SQLiteProvider(DictionaryProvider):
    """SQLite-based dictionary provider."""

    def __init__(
        self,
        database_path: str | None = None,
        cache_size: int = 8192,
        check_same_thread: bool = False,
        pos_tagger: POSTaggerBase = None,
        pool_min_size: int = 1,
        pool_max_size: int = 5,
        pool_timeout: float = 5.0,
        pool_max_connection_age: float = 3600.0,
        sqlite_timeout: float = 30.0,
        cache_manager: CacheManager = None,
    ):
        """
        Initialize SQLite provider.

        Args:
            database_path: Database path (None for default)
            cache_size: LRU cache size for frequency lookups (default: 8192)
            check_same_thread: Allow sharing connection between threads (default: False)
            pos_tagger: Optional POS tagger for OOV word tagging
            pool_min_size: Minimum connections in pool
            pool_max_size: Maximum connections in pool
            pool_timeout: Connection checkout timeout in seconds
            pool_max_connection_age: Max connection age before recreation (default: 3600.0)
            sqlite_timeout: SQLite busy timeout in seconds
            cache_manager: Optional CacheManager for dependency injection
        """

MemoryProvider

In-memory provider optimized for fast lookups.
from myspellchecker.providers import MemoryProvider

class MemoryProvider(DictionaryProvider):
    """In-memory dictionary provider using Python dictionaries."""

    def __init__(
        self,
        syllables: dict[str, int] = None,
        words: dict[str, int] = None,
        bigrams: dict[tuple[str, str], float] = None,
        trigrams: dict[tuple[str, str, str], float] = None,
        word_pos: dict[str, str] = None,
    ):
        """
        Initialize MemoryProvider with optional pre-populated data.

        Args:
            syllables: Dictionary mapping syllable -> frequency count
            words: Dictionary mapping word -> frequency count
            bigrams: Dictionary mapping (prev_word, curr_word) -> probability
            trigrams: Dictionary mapping (word1, word2, word3) -> probability
            word_pos: Dictionary mapping word -> POS tag
        """

    def add_syllable(self, syllable: str, frequency: int = 1) -> None:
        """Add a syllable with optional frequency."""

    def add_word(self, word: str, frequency: int = 1) -> None:
        """Add a word with optional frequency."""

Algorithm Classes

SymSpell

Symmetric delete spell checking.
from myspellchecker.algorithms.symspell import SymSpell, Suggestion

class SymSpell:
    """SymSpell algorithm for O(1) suggestions."""

    def __init__(
        self,
        provider: DictionaryProvider,
        max_edit_distance: int = 2,
        prefix_length: int = 10,
        count_threshold: int = 1,
    ):
        """
        Initialize SymSpell with a dictionary provider.

        Note: The class constructor default for count_threshold is 1,
        but SymSpellConfig sets its default to 50. When constructed
        via SpellCheckerConfig, the config value (50) takes precedence.
        """

    def build_index(self, levels: list[str]) -> None:
        """Build delete index for specified levels ('syllable', 'word')."""

    def lookup(
        self,
        term: str,
        level: str = "syllable",
        max_suggestions: int = 5,
        include_known: bool = False,
        use_phonetic: bool = False,
    ) -> list[Suggestion]:
        """
        Look up suggestions for a term.

        Returns:
            List of Suggestion with term, edit_distance, frequency
        """

NgramContextChecker

N-gram based context checker.
from myspellchecker.algorithms.ngram_context_checker import NgramContextChecker

class NgramContextChecker:
    """N-gram based context validation."""

    def __init__(
        self,
        provider: DictionaryProvider,
        threshold: float = 0.01,
        trigram_threshold: float = 0.005,
        right_context_threshold: float = None,
        max_suggestions: int = 5,
        edit_distance_weight: float = 0.3,
        probability_weight: float = 0.7,
        symspell: SymSpell = None,
        candidate_limit: int = 50,
        smoothing_strategy: SmoothingStrategy = SmoothingStrategy.STUPID_BACKOFF,
        backoff_weight: float = 0.4,
        add_k_smoothing: float = 0.0,
    ):
        """Initialize context checker."""

    def get_smoothed_bigram_probability(self, word1: str, word2: str) -> float:
        """Get smoothed P(word2 | word1)."""

    def get_smoothed_trigram_probability(self, word1: str, word2: str, word3: str) -> float:
        """Get smoothed P(word3 | word1, word2)."""

    def is_contextual_error(
        self,
        prev_word: str,
        current_word: str,
        prev_prev_word: Optional[str] = None,
        next_word: Optional[str] = None,
        threshold: Optional[float] = None,
    ) -> bool:
        """Check if a word is a contextual error given surrounding context."""

    def suggest(
        self,
        prev_word: str,
        current_word: str,
        max_edit_distance: int = 2,
        next_word: Optional[str] = None,
    ) -> list[ContextSuggestion]:
        """Generate context-aware suggestions for a word."""

    def analyze_sequence(
        self,
        words: list[str],
        min_probability: Optional[float] = None,
    ) -> list[tuple[int, float, bool]]:
        """Analyze word sequence for contextual errors."""

SemanticChecker

Deep learning based context checker.
from myspellchecker.algorithms.semantic_checker import SemanticChecker

class SemanticChecker:
    """ONNX-based semantic context checker."""

    def __init__(
        self,
        model_path: str = None,
        tokenizer_path: str = None,
        model: Any = None,
        tokenizer: Any = None,
        num_threads: int = 1,
        predict_top_k: int = 5,
        check_top_k: int = 10,
        use_pytorch: bool = False,
        allow_extended_myanmar: bool = False,
    ):
        """Initialize semantic checker."""

    def is_semantic_error(
        self,
        sentence: str,
        word: str,
        neighbors: list[str],
    ) -> Optional[str]:
        """Check if word is a semantic error using AI. Returns suggestion or None."""

    def predict_mask(
        self,
        sentence: str,
        target_word: str,
        top_k: int = None,
        occurrence: int = 0,
    ) -> list[tuple[str, float]]:
        """Predict most likely words for a masked position."""

Segmenter Classes

DefaultSegmenter

Default text segmenter.
from myspellchecker.segmenters import DefaultSegmenter

class DefaultSegmenter(Segmenter):
    """Default Myanmar text segmenter using a hybrid approach."""

    def __init__(
        self,
        word_engine: str = "myword",
        allow_extended_myanmar: bool = False,
        seg_model: Optional[str] = None,
        seg_device: int = -1,
    ):
        """
        Initialize segmenter.

        Args:
            word_engine: Word segmentation engine ("myword", "crf", or "transformer")
            allow_extended_myanmar: Accept Extended Myanmar characters (U+1050-U+109F,
                U+AA60-U+AA7F, U+A9E0-U+A9FF)
            seg_model: Custom model name for transformer engine (optional)
            seg_device: Device for transformer inference (-1=CPU, 0+=GPU)
        """

    def segment_syllables(self, text: str) -> list[str]:
        """Segment text into syllables."""

    def segment_words(self, text: str) -> list[str]:
        """Segment text into words."""

    def segment_sentences(self, text: str) -> list[str]:
        """Segment text into sentences using heuristics."""

    def load_custom_dictionary(self, words: list[str]) -> None:
        """Load custom dictionary words (myword engine only)."""

Utility Functions

Text Normalization

from myspellchecker.text.normalize import (
    normalize,
    normalize_for_lookup,
)

def normalize(
    text: str,
    form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFC",
    remove_zero_width: bool = True,
    reorder_diacritics: bool = True,
    normalize_variants: bool = False,
    normalize_tall_aa: bool = True,
    normalize_u_asat: bool = True,
) -> str:
    """
    Normalize Myanmar text with configurable steps.

    Args:
        text: Input Myanmar text
        form: Unicode normalization form
        remove_zero_width: Remove zero-width characters
        reorder_diacritics: Apply Myanmar-specific diacritic reordering (UTN #11)
        normalize_variants: Map character variants to canonical forms
        normalize_tall_aa: Correct Tall AA after Medial Wa (default: True)
        normalize_u_asat: Convert independent vowel U + asat to consonant form (default: True)
    """

def normalize_for_lookup(
    text: str,
    convert_zawgyi: bool = True,
    config: Optional[ZawgyiConfig] = None,
) -> str:
    """Unified normalization for all dictionary/index lookups (includes Zawgyi conversion)."""

# For direct Cython function access (requires compiled extensions):
from myspellchecker.text.normalize_c import (
    remove_zero_width_chars,
    reorder_myanmar_diacritics,
    get_myanmar_ratio,
)

# For higher-level normalization with presets:
from myspellchecker.text.normalization_service import (
    NormalizationService,
    normalize_for_spell_checking,
    normalize_for_lookup,
    normalize_for_comparison,
)

Logging Configuration

from myspellchecker.utils.logging_utils import configure_logging

def configure_logging(
    level: Union[int, str] = logging.INFO,
    format_string: str = None,
    stream: TextIO = None,
    json_output: bool = False,
    debug_mode: bool = False,
) -> None:
    """Configure logging for the library."""

Exceptions

from myspellchecker.core.exceptions import (
    MyanmarSpellcheckError,
    ConfigurationError,
    InvalidConfigError,
    DataLoadingError,
    MissingDatabaseError,
    ProcessingError,
    ValidationError,
    TokenizationError,
    NormalizationError,
    ProviderError,
    ConnectionPoolError,
    PipelineError,
    IngestionError,
    PackagingError,
    ModelError,
    ModelLoadError,
    InferenceError,
    MissingDependencyError,
    InsufficientStorageError,
    CacheError,
)
Exception hierarchy:
MyanmarSpellcheckError (base)
├── ConfigurationError
│   └── InvalidConfigError
├── DataLoadingError
│   └── MissingDatabaseError
├── ProcessingError
│   ├── ValidationError
│   ├── TokenizationError
│   └── NormalizationError
├── ProviderError
│   └── ConnectionPoolError
├── PipelineError
│   ├── IngestionError
│   └── PackagingError
├── ModelError
│   ├── ModelLoadError
│   └── InferenceError
├── MissingDependencyError
├── InsufficientStorageError
└── CacheError
Key exceptions:
class MyanmarSpellcheckError(Exception):
    """Base exception for all spell checker errors."""

class ConfigurationError(MyanmarSpellcheckError):
    """Configuration-related errors."""

class InvalidConfigError(ConfigurationError):
    """Specific configuration value is invalid."""

class DataLoadingError(MyanmarSpellcheckError):
    """Data loading errors."""

class MissingDatabaseError(DataLoadingError):
    """Spell checker database not found. Includes searched_paths and suggestion attributes."""

class ProcessingError(MyanmarSpellcheckError):
    """Text processing errors (base for validation/tokenization/normalization)."""

class ValidationError(ProcessingError):
    """Validation processing errors."""

class TokenizationError(ProcessingError):
    """Text tokenization/segmentation errors."""

class NormalizationError(ProcessingError):
    """Text normalization errors."""

class ProviderError(MyanmarSpellcheckError):
    """Provider-related errors."""

class ConnectionPoolError(ProviderError):
    """Connection pool errors (exhaustion, creation failures)."""

class PipelineError(MyanmarSpellcheckError):
    """Data pipeline errors."""

class IngestionError(PipelineError):
    """Corpus ingestion errors. Has failed_files and missing_files attributes."""

class PackagingError(PipelineError):
    """Database packaging errors."""

class ModelError(MyanmarSpellcheckError):
    """Machine learning model errors."""

class ModelLoadError(ModelError):
    """Model loading failures."""

class InferenceError(ModelError):
    """Model inference failures."""

class MissingDependencyError(MyanmarSpellcheckError):
    """Required external dependency is missing."""

class InsufficientStorageError(MyanmarSpellcheckError):
    """Not enough disk space for operation."""

class CacheError(MyanmarSpellcheckError):
    """Caching operation failures."""

Module Index

| Module                       | Description           | Documentation         |
| ---------------------------- | --------------------- | --------------------- |
| myspellchecker               | Main package exports  | This page             |
| myspellchecker.core          | Core classes and config | This page           |
| myspellchecker.algorithms    | Spell check algorithms | Algorithms           |
| myspellchecker.providers     | Dictionary providers  | Provider Capabilities |
| myspellchecker.segmenters    | Text segmenters       | This page             |
| myspellchecker.tokenizers    | Low-level tokenizers  | Tokenizers API        |
| myspellchecker.utils         | Utility functions     | This page             |
| myspellchecker.data_pipeline | Dictionary building   | Data Pipeline         |
| myspellchecker.training      | Model training        | Training              |

Next Steps