Overview - mySpellChecker

Every major component in the checking pipeline is behind an interface: segmenters, dictionary providers, validation strategies, and POS taggers. To customize behavior, implement the corresponding base class and pass your instance to SpellChecker. No monkey-patching or library forks required.

Custom Segmenter

If you have a proprietary segmentation model or want to use a different library:

1. Subclass Segmenter.
2. Implement segment_syllables and segment_words.
3. Pass an instance of your subclass to SpellChecker via the segmenter argument.

from myspellchecker.segmenters import Segmenter
from typing import List

class SpaceSegmenter(Segmenter):
    """Custom segmenter that uses spaces as word boundaries.

    Minimal example of the ``Segmenter`` interface: both methods take the
    raw text and return a list of string segments.
    """

    def segment_syllables(self, text: str) -> List[str]:
        """Return the syllables of ``text``.

        Placeholder implementation: yields one item per character of
        ``text``. Replace with your own syllable segmentation logic.
        """
        # Your syllable segmentation logic here
        return list(text)

    def segment_words(self, text: str) -> List[str]:
        """Return the space-separated words of ``text``.

        Leading, trailing, and repeated spaces would otherwise produce
        empty-string "words", so empty tokens are dropped. An empty or
        all-space input yields ``[]``.
        """
        return [word for word in text.split(" ") if word]

# Use the custom segmenter: pass an instance through the ``segmenter`` keyword.
# NOTE(review): SpellChecker is not imported anywhere in this snippet —
# confirm the import path before copying this example.
checker = SpellChecker(segmenter=SpaceSegmenter())

BiLSTM Segmenter Example

from myspellchecker.segmenters import Segmenter
import torch

class BiLSTMSegmenter(Segmenter):
    """Segmenter using a trained BiLSTM model.

    This is a skeleton: ``_decode_predictions`` and ``_group_into_words``
    must be implemented for your model before the class is usable.
    """

    def __init__(self, model_path: str):
        """Load the serialized model stored at ``model_path``."""
        # SECURITY: torch.load unpickles the file, which can execute arbitrary
        # code. Only load checkpoints that come from a trusted source.
        # NOTE(review): recent PyTorch releases default to weights_only=True,
        # which rejects fully pickled modules — check the torch.load docs for
        # the version you target.
        self.model = torch.load(model_path)
        # Switch to inference mode so layers such as dropout behave
        # deterministically during segmentation.
        self.model.eval()

    def segment_syllables(self, text: str) -> List[str]:
        """Run the model on ``text`` and decode its output into syllables."""
        # Gradients are not needed for inference.
        with torch.no_grad():
            # Your model inference logic
            predictions = self.model(text)
        return self._decode_predictions(predictions)

    def segment_words(self, text: str) -> List[str]:
        """Segment ``text`` into syllables, then group them into words."""
        syllables = self.segment_syllables(text)
        # Group syllables into words based on your model
        return self._group_into_words(syllables)

    def _decode_predictions(self, predictions) -> List[str]:
        """Convert raw model output into a list of syllables (model-specific)."""
        raise NotImplementedError(
            "Implement _decode_predictions for your model's output format."
        )

    def _group_into_words(self, syllables: List[str]) -> List[str]:
        """Merge consecutive syllables into words (model-specific)."""
        raise NotImplementedError(
            "Implement _group_into_words for your model's word boundaries."
        )

Custom Dictionary Provider

To load data from a Redis cache, API, or other source:

1. Subclass DictionaryProvider.
2. Implement the required abstract methods.
3. Pass an instance of your subclass to SpellChecker via the provider argument.

Redis Provider Example

from myspellchecker.providers import DictionaryProvider
from typing import List, Optional, Tuple
import redis

class RedisProvider(DictionaryProvider):
    """Dictionary provider backed by Redis cache.

    All keys are namespaced under ``prefix``:

    * ``word:<word>`` / ``syl:<syllable>`` -- key existence marks a valid entry
    * ``freq:<word>`` -- integer frequency
    * ``bigram:<w1>:<w2>`` / ``trigram:<w1>:<w2>:<w3>`` -- float probability
    """

    def __init__(self, redis_client, prefix: str = "spell:"):
        """Store the Redis client and the key namespace prefix."""
        self.redis = redis_client
        self.prefix = prefix

    def is_valid_word(self, word: str) -> bool:
        """Return True when a ``word:`` key exists for ``word``."""
        # EXISTS reports a key count, so coerce to the declared bool.
        return bool(self.redis.exists(f"{self.prefix}word:{word}"))

    def is_valid_syllable(self, syllable: str) -> bool:
        """Return True when a ``syl:`` key exists for ``syllable``."""
        return bool(self.redis.exists(f"{self.prefix}syl:{syllable}"))

    def get_word_frequency(self, word: str) -> int:
        """Return the stored frequency of ``word``, or 0 when absent."""
        freq = self.redis.get(f"{self.prefix}freq:{word}")
        return int(freq) if freq else 0

    def get_bigram_probability(self, word1: str, word2: str) -> float:
        """Return the stored probability of ``word1 word2``, or 0.0 when absent."""
        prob = self.redis.get(f"{self.prefix}bigram:{word1}:{word2}")
        return float(prob) if prob else 0.0

    def get_trigram_probability(self, word1: str, word2: str, word3: str) -> float:
        """Return the stored probability of the trigram, or 0.0 when absent."""
        prob = self.redis.get(f"{self.prefix}trigram:{word1}:{word2}:{word3}")
        return float(prob) if prob else 0.0

    def get_top_continuations(self, prev_word: str, limit: int = 20) -> List[Tuple[str, float]]:
        """Return up to ``limit`` words most likely to follow ``prev_word``.

        Scans the ``bigram:<prev_word>:*`` keys and returns
        ``(word, probability)`` pairs sorted by descending probability.
        """
        if limit <= 0:
            return []

        key_prefix = f"{self.prefix}bigram:{prev_word}:"
        # SCAN may return the same key more than once, so collect into a dict.
        scores = {}
        cursor = 0
        # The whole match set must be visited: stopping after ``limit`` keys
        # would return an arbitrary subset rather than the top entries.
        # NOTE(review): SCAN walks the entire keyspace; for large datasets a
        # per-word sorted set would be a better storage layout.
        while True:
            cursor, keys = self.redis.scan(cursor, match=f"{key_prefix}*", count=100)
            for key in keys:
                # Keys are bytes unless the client uses decode_responses=True.
                if isinstance(key, bytes):
                    key = key.decode()
                word = key[len(key_prefix):]
                scores[word] = self.get_bigram_probability(prev_word, word)
            if cursor == 0:
                break
        return sorted(scores.items(), key=lambda item: -item[1])[:limit]

    def close(self) -> None:
        """Close the underlying Redis client."""
        self.redis.close()

# Usage: build a Redis client, wrap it in the provider, and hand the provider
# to the checker through the ``provider`` keyword.
# NOTE(review): SpellChecker is not imported in this snippet — confirm the
# import path before copying this example.
redis_client = redis.Redis(host='localhost', port=6379, db=0)
provider = RedisProvider(redis_client)
checker = SpellChecker(provider=provider)

REST API Provider Example

import requests
from myspellchecker.providers import DictionaryProvider

class APIProvider(DictionaryProvider):
    """Dictionary provider using REST API backend.

    Words are looked up with GET requests against ``<base_url>/words/<word>``
    and ``<base_url>/frequency/<word>``.
    """

    def __init__(self, base_url: str, api_key: Optional[str] = None, timeout: float = 10.0):
        """Create the HTTP session.

        Args:
            base_url: Root URL of the API; a trailing slash is stripped.
            api_key: Optional token sent as a ``Bearer`` Authorization header.
            timeout: Seconds to wait for each request before giving up.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.session = requests.Session()
        if api_key:
            self.session.headers["Authorization"] = f"Bearer {api_key}"

    def _get(self, resource: str, word: str):
        """GET ``<base_url>/<resource>/<word>`` with ``word`` percent-encoded."""
        from urllib.parse import quote

        # safe="" also escapes "/", so a word can never change the URL path.
        url = f"{self.base_url}/{resource}/{quote(word, safe='')}"
        # Without a timeout a stalled server would block the checker forever.
        return self.session.get(url, timeout=self.timeout)

    def is_valid_word(self, word: str) -> bool:
        """Return True when the API answers 200 for ``word``."""
        response = self._get("words", word)
        return response.status_code == 200

    def get_word_frequency(self, word: str) -> int:
        """Return the ``frequency`` field of the API response, or 0."""
        response = self._get("frequency", word)
        if response.status_code == 200:
            # Treat a missing or null field as 0 and honour the int return type.
            return int(response.json().get("frequency", 0) or 0)
        return 0

    def close(self) -> None:
        """Release the pooled HTTP connections held by the session."""
        self.session.close()

    # ... implement other methods similarly

Custom Validation Strategies

The ContextValidator uses a strategy pattern for extensible validation.

Creating a Custom Strategy

from myspellchecker.core.validation_strategies import ValidationStrategy, ValidationContext
from myspellchecker.core.response import Error
from typing import List

class ProfanityFilterStrategy(ValidationStrategy):
    """Strategy to detect and flag profanity.

    Matching is case-insensitive: both the blocked list and each checked
    word are lower-cased before comparison.
    """

    def __init__(self, blocked_words: List[str]):
        """Store ``blocked_words`` as a lower-cased set for O(1) lookups."""
        # validate() compares against word.lower(), so entries containing
        # upper-case characters would never match unless normalized here.
        self.blocked_words = {blocked.lower() for blocked in blocked_words}

    def priority(self) -> int:
        """Lower values run first. Default strategies use 10-70."""
        # 35 is the slot the priority guidelines reserve for custom
        # strategies; 25 is already taken by broken-compound detection.
        return 35

    def validate(self, context: ValidationContext) -> List[Error]:
        """Return one ``profanity`` error per blocked word in ``context.words``.

        Each error carries the word's entry from ``context.word_positions``
        at the same index.
        """
        return [
            Error(
                text=word,
                position=context.word_positions[i],
                suggestions=["[redacted]"],
                error_type="profanity",
                confidence=1.0,
            )
            for i, word in enumerate(context.words)
            if word.lower() in self.blocked_words
        ]

# Register the strategy
from myspellchecker.core.context_validator import ContextValidator

# Strategies are handed to ContextValidator as a list.
strategies = [
    ProfanityFilterStrategy(["bad_word1", "bad_word2"]),
    # ... other strategies
]
# NOTE(review): `config` and `segmenter` are not defined in this snippet;
# they must be created beforehand.
validator = ContextValidator(config, segmenter, strategies=strategies)

Strategy Priority Guidelines

Priority	Category	Description
10	Tone	Tone mark disambiguation
15	Orthography	Medial order and compatibility validation
20	Syntactic	Grammar rule validation
25	Broken Compound	Broken compound detection
30	POS	Part-of-speech sequence validation
35	Custom	Custom validation strategies (use 35 or other open slot)
40	Question	Question particle validation
45	Homophone	Homophone confusion detection
47	Confusable	Confusable variant detection
48	Confusable Semantic	Confusable semantic detection (AI)
50	N-gram	Statistical context validation
70	Semantic	AI-powered semantic validation

Factory Pattern Usage

mySpellChecker uses factories for creating configured components.

Using Component Factory

from myspellchecker.core.component_factory import ComponentFactory
from myspellchecker.core.config import SpellCheckerConfig

# Configuration shared by every component the factory builds.
config = SpellCheckerConfig(
    max_edit_distance=2,
    use_phonetic=True,
    use_context_checker=True,
)

factory = ComponentFactory(config)

# Create individual components
# NOTE(review): `provider` and `segmenter` are not defined in this snippet;
# they must be created beforehand.
symspell = factory.create_symspell(provider)
components = factory.create_all(provider, segmenter)

POS Tagger Factory

from myspellchecker.algorithms.pos_tagger_factory import POSTaggerFactory
from myspellchecker.core.config import POSTaggerConfig

# The three calls below are alternatives: each one rebinds `tagger`, so keep
# only the variant you need.

# Create rule-based tagger (fast, no dependencies)
tagger = POSTaggerFactory.create("rule_based")

# Create Viterbi tagger (better accuracy)
# NOTE(review): `provider` is not defined in this snippet; create it first.
tagger = POSTaggerFactory.create("viterbi", provider=provider)

# Create transformer tagger (best accuracy, requires torch)
tagger = POSTaggerFactory.create(
    "transformer",
    model_name="chuuhtetnaing/myanmar-pos-model",
    device=0  # GPU
)

Advanced Configuration Patterns

Environment-Based Configuration

from myspellchecker.core.config.loader import ConfigLoader

loader = ConfigLoader()

# Load from profile with environment overrides
config = loader.load(
    profile="production",
    use_env=True,  # Read MYSPELL_* environment variables
)

# Available environment variables (shown with example values):
# MYSPELL_MAX_EDIT_DISTANCE=3
# MYSPELL_MAX_SUGGESTIONS=10
# MYSPELL_USE_CONTEXT_CHECKER=true
# MYSPELL_DATABASE_PATH=/path/to/custom.db

Profile-Based Configuration

from myspellchecker.core.config.loader import load_config

# Each call below rebinds `config`; pick the single profile you want.

# Fast profile - optimized for speed
config = load_config(profile="fast")

# Accurate profile - maximum accuracy
config = load_config(profile="accurate")

# Production profile - balanced
config = load_config(profile="production")

Programmatic Configuration

from myspellchecker.core.config import (
    SpellCheckerConfig,
    SymSpellConfig,
    NgramContextConfig,
    POSTaggerConfig,
)

# Top-level options are set directly; per-component options are supplied as
# nested config objects (SymSpellConfig, NgramContextConfig, POSTaggerConfig).
config = SpellCheckerConfig(
    max_edit_distance=2,
    max_suggestions=10,
    use_phonetic=True,
    use_context_checker=True,
    symspell=SymSpellConfig(
        prefix_length=10,
        beam_width=150,
    ),
    ngram_context=NgramContextConfig(
        bigram_threshold=0.0005,
        trigram_threshold=0.00005,
    ),
    pos_tagger=POSTaggerConfig(
        tagger_type="transformer",
        model_name="chuuhtetnaing/myanmar-pos-model",
    ),
)

# NOTE(review): SpellChecker is not imported in this snippet — confirm the
# import path before copying this example.
checker = SpellChecker(config=config)

Configuration from File

# myspellchecker.yaml
# NOTE(review): the Python examples select a profile with `profile=`, while
# this file uses the key `preset` — confirm which key the loader expects.
preset: production
max_edit_distance: 2
max_suggestions: 10

# Nested sections use the same field names as the nested config objects in
# the programmatic example (symspell, ngram_context, pos_tagger).
symspell:
  prefix_length: 10
  beam_width: 150

ngram_context:
  bigram_threshold: 0.0005
  trigram_threshold: 0.00005

pos_tagger:
  tagger_type: transformer
  model_name: chuuhtetnaing/myanmar-pos-model

from myspellchecker.core.config.loader import load_config

# Load settings from the YAML file; the path is given relative to the
# current working directory.
config = load_config(config_file="myspellchecker.yaml")

Custom Segmenter

BiLSTM Segmenter Example

Custom Dictionary Provider

Redis Provider Example

REST API Provider Example

Custom Validation Strategies

Creating a Custom Strategy

Strategy Priority Guidelines

Factory Pattern Usage

Using Component Factory

POS Tagger Factory

Advanced Configuration Patterns

Environment-Based Configuration

Profile-Based Configuration

Programmatic Configuration

Configuration from File