# Configuration

mySpellChecker provides extensive configuration options to customize behavior for your specific use case.

Configuration Overview

Configuration is handled through Pydantic models that provide type safety and defaults:
from myspellchecker.core.config import (
    SpellCheckerConfig,
    POSTaggerConfig,
    SemanticConfig,
)
from myspellchecker.core.constants import ValidationLevel

Quick Reference

| Category    | Option                             | Default      | Description                           |
|-------------|------------------------------------|--------------|---------------------------------------|
| Validation  | `level` (per-check)                | `SYLLABLE`   | Validation depth (SYLLABLE or WORD)   |
|             | `max_suggestions`                  | `5`          | Suggestions per error                 |
|             | `max_edit_distance`                | `2`          | Maximum edit distance for SymSpell    |
| Context     | `use_context_checker`              | `True`       | Enable N-gram context checking        |
|             | `use_phonetic`                     | `True`       | Enable phonetic suggestions           |
|             | `use_ner`                          | `True`       | Enable named entity recognition       |
| Grammar     | `use_rule_based_validation`        | `True`       | Enable grammar rules                  |
| Morphology  | `use_reduplication_validation`     | `True`       | Accept valid reduplications           |
|             | `use_compound_synthesis`           | `True`       | Accept valid compound words           |
|             | `reduplication_min_base_frequency` | `5`          | Min frequency for reduplication base  |
|             | `compound_min_morpheme_frequency`  | `10`         | Min frequency per compound morpheme   |
|             | `compound_max_parts`               | `4`          | Max parts in compound split           |
| POS Tagging | `tagger_type`                      | `rule_based` | Tagger backend                        |
| Semantic    | `model_path`                       | `None`       | ONNX model path (via SemanticConfig)  |

SpellCheckerConfig

The main configuration class:
from myspellchecker import SpellChecker
from myspellchecker.core.config import SpellCheckerConfig
from myspellchecker.core.constants import ValidationLevel
from myspellchecker.providers import SQLiteProvider

config = SpellCheckerConfig(
    # Suggestion settings
    max_suggestions=10,
    max_edit_distance=2,

    # Feature toggles
    use_phonetic=True,
    use_context_checker=True,  # Enable N-gram context checking
    use_rule_based_validation=True,  # Enable grammar rules
    use_ner=True,  # Enable named entity recognition
)

provider = SQLiteProvider(database_path="path/to/dictionary.db")
checker = SpellChecker(config=config, provider=provider)

# Validation level is specified per-check, not in configuration
result = checker.check(text, level=ValidationLevel.WORD)

Validation Level

Controls the depth of validation. Specified per-check, not in configuration:
from myspellchecker import SpellChecker
from myspellchecker.core.config import SpellCheckerConfig
from myspellchecker.core.constants import ValidationLevel
from myspellchecker.providers import SQLiteProvider

provider = SQLiteProvider(database_path="path/to/dictionary.db")
checker = SpellChecker(provider=provider)

# SYLLABLE: Layer 1 only (Very Fast)
# - Syllable structure validation
# - Dictionary syllable lookup
result = checker.check(text, level=ValidationLevel.SYLLABLE)

# WORD: Layers 1-2 (Fast)
# - All syllable checks
# - Word validation
# - SymSpell suggestions
result = checker.check(text, level=ValidationLevel.WORD)

# WORD with context: Enable context checking in config (Moderate)
# - All previous checks
# - N-gram context analysis
# - Optional semantic checking
config = SpellCheckerConfig(
    use_context_checker=True,  # Enable context validation
)
checker = SpellChecker(config=config)
result = checker.check(text, level=ValidationLevel.WORD)

Suggestion Settings

Control how suggestions are generated:
config = SpellCheckerConfig(
    # Maximum suggestions per error
    max_suggestions=10,

    # Maximum edit distance for SymSpell
    max_edit_distance=2,

    # Include phonetically similar suggestions
    use_phonetic=True,
)

Context Settings

Configure N-gram context checking via NgramContextConfig:
from myspellchecker.core.config import SpellCheckerConfig, NgramContextConfig

config = SpellCheckerConfig(
    # Enable context checking
    use_context_checker=True,

    # Configure N-gram context via nested config
    ngram_context=NgramContextConfig(
        threshold=0.01,  # General probability threshold
        bigram_threshold=0.0001,  # Bigram-specific threshold
        trigram_threshold=0.0001,  # Trigram-specific threshold
    ),
)

Performance Settings

Optimize for your use case via AlgorithmCacheConfig:
from myspellchecker.core.config import SpellCheckerConfig, AlgorithmCacheConfig

config = SpellCheckerConfig(
    # Configure caching via nested config
    cache=AlgorithmCacheConfig(
        syllable_cache_size=4096,   # LRU cache for syllable lookups
        word_cache_size=8192,       # LRU cache for word lookups
        bigram_cache_size=16384,    # LRU cache for bigram probabilities
    ),
)

POSTaggerConfig

Configure the Part-of-Speech tagger:
from myspellchecker.core.config import SpellCheckerConfig, POSTaggerConfig

# Rule-based (fastest, ~70% accuracy)
pos_config = POSTaggerConfig(tagger_type="rule_based")

# Viterbi HMM (balanced, ~85% accuracy)
pos_config = POSTaggerConfig(tagger_type="viterbi")

# Transformer (slowest, ~93% accuracy)
pos_config = POSTaggerConfig(
    tagger_type="transformer",
    model_name="chuuhtetnaing/myanmar-pos-model",
    device=0,  # GPU index, -1 for CPU
    batch_size=32,
)

# Apply to config
config = SpellCheckerConfig(
    pos_tagger=pos_config,
    use_rule_based_validation=True,
)

POSTaggerConfig Options

POSTaggerConfig(
    # Tagger type: "rule_based", "viterbi", "transformer", "custom"
    tagger_type="viterbi",

    # Model name (for transformer)
    model_name="chuuhtetnaing/myanmar-pos-model",

    # Device: GPU index or -1 for CPU
    device=-1,
)

SemanticConfig

Configure the semantic (AI) checker:
from myspellchecker.core.config import SpellCheckerConfig, SemanticConfig

semantic_config = SemanticConfig(
    # ONNX model path
    model_path="/path/to/model.onnx",

    # Tokenizer path
    tokenizer_path="/path/to/tokenizer",

    # ONNX Runtime threads (default: 1)
    num_threads=4,

    # Top K predictions to consider (default: 5)
    predict_top_k=5,

    # Top K to check against (default: 10)
    check_top_k=10,
)

config = SpellCheckerConfig(
    use_context_checker=True,
    semantic=semantic_config,
)
checker = SpellChecker(config=config)
# Use word-level validation per-check
result = checker.check(text, level=ValidationLevel.WORD)

Pre-loaded Models

You can also pass pre-loaded model instances:
import onnxruntime as ort
from transformers import AutoTokenizer

# Pre-load for sharing across instances
model = ort.InferenceSession("model.onnx")
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

semantic_config = SemanticConfig(
    model=model,
    tokenizer=tokenizer,
)

ValidationConfig

Configure error detection thresholds, confidence scores, and validation behavior:
from myspellchecker.core.config import SpellCheckerConfig
from myspellchecker.core.config.validation_configs import ValidationConfig

validation = ValidationConfig(
    # Confidence scores for different error types
    syllable_error_confidence=1.0,    # Syllable errors (highest certainty)
    word_error_confidence=0.8,        # Word errors
    context_error_confidence_high=0.9,  # High-confidence context errors
    context_error_confidence_low=0.6,   # Low-confidence context errors

    # Validation thresholds
    max_syllable_length=12,           # Maximum valid syllable length
    syllable_corruption_threshold=3,   # Threshold for corrupted syllables
    is_myanmar_text_threshold=0.5,    # Myanmar text detection threshold

    # Zawgyi handling
    use_zawgyi_detection=True,     # Detect Zawgyi encoding
    use_zawgyi_conversion=True,    # Auto-convert Zawgyi to Unicode
    zawgyi_confidence_threshold=0.95,  # Zawgyi detection confidence

    # Colloquial variant handling
    colloquial_strictness="lenient",  # "strict", "lenient", or "off"
    colloquial_info_confidence=0.3,   # Confidence for colloquial info notes

    # Extended Myanmar support
    allow_extended_myanmar=False,     # Allow non-Burmese Myanmar scripts

    # Strategy-specific confidence thresholds
    medial_confusion_confidence=0.85,
    tone_validation_confidence=0.5,
    syntactic_validation_confidence=0.9,
    pos_sequence_confidence=0.85,
    question_structure_confidence=0.7,
    homophone_confidence=0.8,
    homophone_improvement_ratio=5.0,
    semantic_min_word_length=2,

    # Debug options
    strict_validation=True,
    raise_on_strategy_error=False,    # Set True for debugging
)

config = SpellCheckerConfig(validation=validation)

Colloquial Variant Handling

Control how informal/colloquial spellings are handled:
# Strict mode: Flag all colloquial variants as errors
validation = ValidationConfig(colloquial_strictness="strict")

# Lenient mode (default): Accept with informational note
validation = ValidationConfig(colloquial_strictness="lenient")

# Off: No special handling
validation = ValidationConfig(colloquial_strictness="off")

Extended Myanmar Support

Enable support for non-Burmese Myanmar scripts (Shan, Mon, Karen, etc.):
# Enable extended Myanmar characters
validation = ValidationConfig(allow_extended_myanmar=True)
Note: By default, mySpellChecker only validates standard Burmese text. Extended Myanmar blocks (U+AA60-AA7F, U+A9E0-A9FF) are for other Myanmar-script languages.

Configuration Profiles

Built-in profiles for common scenarios using get_profile():
from myspellchecker.core.config.profiles import get_profile

# Fast: Maximum speed, minimal features
config = get_profile("fast")
# Equivalent to:
# - max_edit_distance=1
# - max_suggestions=3
# - use_context_checker=False
# - use_phonetic=False
# - use_ner=False
# - use_rule_based_validation=True

# Production: Good speed and accuracy
config = get_profile("production")
# Equivalent to:
# - max_edit_distance=2
# - max_suggestions=5
# - use_phonetic=True
# - use_context_checker=True
# - use_ner=True
# - use_rule_based_validation=True

# Accurate: Maximum accuracy
config = get_profile("accurate")
# Equivalent to:
# - max_edit_distance=3
# - max_suggestions=10
# - use_phonetic=True
# - use_context_checker=True
# - use_ner=True
# - use_rule_based_validation=True

# Development: For local development
config = get_profile("development")

# Testing: For running tests
config = get_profile("testing")

Environment Variables

Configuration can be set via environment variables with the MYSPELL_ prefix:
# Database path
export MYSPELL_DATABASE_PATH=/path/to/dictionary.db

# Core settings
export MYSPELL_MAX_EDIT_DISTANCE=2
export MYSPELL_MAX_SUGGESTIONS=5

# Enable features
export MYSPELL_USE_CONTEXT_CHECKER=true
export MYSPELL_USE_PHONETIC=true

# POS Tagger settings
export MYSPELL_POS_TAGGER_TYPE=viterbi
export MYSPELL_POS_TAGGER_BEAM_WIDTH=10

# N-gram context settings
export MYSPELL_NGRAM_BIGRAM_THRESHOLD=0.0001
Load from environment:
from myspellchecker.core.config.loader import ConfigLoader

loader = ConfigLoader()
config = loader.load(use_env=True)

Configuration File

Configuration can be defined programmatically. Here’s an example of a comprehensive configuration:
from myspellchecker import SpellChecker
from myspellchecker.core.config import (
    SpellCheckerConfig,
    POSTaggerConfig,
    NgramContextConfig,
    AlgorithmCacheConfig,
)

config = SpellCheckerConfig(
    # Suggestions
    max_suggestions=10,
    max_edit_distance=2,

    # Context checking
    use_context_checker=True,
    ngram_context=NgramContextConfig(
        bigram_threshold=0.001,
    ),

    # Grammar
    use_rule_based_validation=True,

    # POS tagger
    pos_tagger=POSTaggerConfig(
        tagger_type="viterbi",
        cache_size=10000,
    ),

    # Cache configuration
    cache=AlgorithmCacheConfig(
        syllable_cache_size=4096,
        word_cache_size=8192,
    ),
)

checker = SpellChecker(config=config)

Provider Configuration

Configure the dictionary provider:
from myspellchecker import SpellChecker
from myspellchecker.providers import SQLiteProvider

# SQLite with connection pooling
provider = SQLiteProvider(
    database_path="/path/to/dictionary.db",
    pool_max_size=5,
    pool_timeout=5.0,
    check_same_thread=False,
)

checker = SpellChecker(provider=provider)

SQLiteProvider Options

SQLiteProvider(
    # Database path (None for default)
    database_path="/path/to/db.sqlite",

    # Connection pool sizing
    pool_min_size=1,      # Minimum connections in pool
    pool_max_size=5,      # Maximum connections in pool

    # Connection timeouts
    pool_timeout=5.0,     # Checkout timeout in seconds
    sqlite_timeout=30.0,  # SQLite busy timeout in seconds

    # Cache size for frequency lookups
    cache_size=8192,
)

MemoryProvider Options

from myspellchecker.providers import MemoryProvider

provider = MemoryProvider(
    # Pre-load data
    syllables={"မြန်": 1500, "မာ": 2300},
    words={"မြန်မာ": 1000},
    bigrams={("မြန်မာ", "နိုင်ငံ"): 0.5},
)

Logging Configuration

Configure logging output:
from myspellchecker.utils.logging_utils import configure_logging

# Basic configuration
configure_logging(level="INFO")

# With JSON output
configure_logging(level="DEBUG", json_output=True)

# Custom format
configure_logging(
    level="INFO",
    format_string="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

Complete Example

from myspellchecker import SpellChecker
from myspellchecker.core.config import (
    SpellCheckerConfig,
    AlgorithmCacheConfig,
    POSTaggerConfig,
    SemanticConfig,
)
from myspellchecker.core.constants import ValidationLevel
from myspellchecker.providers import SQLiteProvider

# Full configuration
config = SpellCheckerConfig(
    # Context checking
    use_context_checker=True,

    # Suggestions
    max_suggestions=10,
    max_edit_distance=2,
    use_phonetic=True,

    # Grammar
    use_rule_based_validation=True,

    # POS Tagger
    pos_tagger=POSTaggerConfig(
        tagger_type="viterbi",
    ),

    # Semantic (optional)
    semantic=SemanticConfig(
        model_path="models/semantic.onnx",
        tokenizer_path="models/tokenizer",
        num_threads=4,
    ),

    # Cache configuration
    cache=AlgorithmCacheConfig(
        syllable_cache_size=4096,
        word_cache_size=8192,
    ),
)

# Provider with pooling
provider = SQLiteProvider(
    database_path="/path/to/dictionary.db",
    pool_max_size=5,
)

# Create checker
checker = SpellChecker(config=config, provider=provider)
# Validation level is specified per-check
result = checker.check(text, level=ValidationLevel.WORD)

Next Steps