Documentation Index
Fetch the complete documentation index at: https://docs.myspellchecker.com/llms.txt
Use this file to discover all available pages before exploring further.
Every major component in the checking pipeline is behind an interface: segmenters, dictionary providers, validation strategies, and POS taggers. To customize behavior, implement the corresponding base class and pass your instance to SpellChecker. No monkey-patching or library forks required.
Custom Segmenter
If you have a proprietary segmentation model or want to use a different library:
- Subclass `Segmenter`.
- Implement `segment_syllables` and `segment_words`.
- Pass it to `SpellChecker`.
from myspellchecker.segmenters import Segmenter
from typing import List
class SpaceSegmenter(Segmenter):
    """Custom segmenter that uses spaces as word boundaries."""

    def segment_syllables(self, text: str) -> List[str]:
        """Split ``text`` into syllables.

        Placeholder implementation: every character is returned as its own
        syllable. Replace it with your syllable segmentation logic.
        """
        return list(text)

    def segment_words(self, text: str) -> List[str]:
        """Split ``text`` into words on the space character.

        Leading, trailing, or repeated spaces would otherwise yield empty
        strings; those are dropped so no empty "word" is returned.
        """
        return [word for word in text.split(" ") if word]
# Use the custom segmenter
# NOTE: SpellChecker must already be imported; this snippet does not show the import.
checker = SpellChecker(segmenter=SpaceSegmenter())
BiLSTM Segmenter Example
from myspellchecker.segmenters import Segmenter
import torch
class BiLSTMSegmenter(Segmenter):
    """Segmenter using a trained BiLSTM model.

    Subclasses (or your own edits) must supply the two decoding hooks,
    ``_decode_predictions`` and ``_group_into_words``.
    """

    def __init__(self, model_path: str):
        """Load the trained model stored at ``model_path``.

        Args:
            model_path: Path to a model file readable by ``torch.load``.
        """
        # SECURITY: torch.load unpickles the file. Only load model files that
        # come from a trusted source.
        self.model = torch.load(model_path)
        # Inference only: disable training-time behaviour such as dropout.
        self.model.eval()

    def segment_syllables(self, text: str) -> List[str]:
        """Run the model on ``text`` and decode its output into syllables."""
        # No gradients are needed for inference.
        with torch.no_grad():
            # Your model inference logic
            predictions = self.model(text)
        return self._decode_predictions(predictions)

    def segment_words(self, text: str) -> List[str]:
        """Segment ``text`` into syllables, then group them into words."""
        syllables = self.segment_syllables(text)
        # Group syllables into words based on your model
        return self._group_into_words(syllables)

    def _decode_predictions(self, predictions) -> List[str]:
        """Turn raw model output into a list of syllables (model-specific)."""
        raise NotImplementedError("Implement decoding for your model's output")

    def _group_into_words(self, syllables: List[str]) -> List[str]:
        """Merge syllables into words (model-specific)."""
        raise NotImplementedError("Implement syllable-to-word grouping")
Custom Dictionary Provider
To load data from a Redis cache, API, or other source:
- Subclass `DictionaryProvider`.
- Implement required abstract methods.
Redis Provider Example
from myspellchecker.providers import DictionaryProvider
from typing import List, Optional, Tuple
import redis
class RedisProvider(DictionaryProvider):
    """Dictionary provider backed by Redis cache.

    Key layout (every key starts with ``prefix``):

    * ``word:<word>`` - key exists when the word is valid
    * ``syl:<syllable>`` - key exists when the syllable is valid
    * ``freq:<word>`` - integer frequency of the word
    * ``bigram:<w1>:<w2>`` - bigram probability
    * ``trigram:<w1>:<w2>:<w3>`` - trigram probability
    """

    def __init__(self, redis_client, prefix: str = "spell:"):
        """Store the Redis client and the key prefix used for all lookups."""
        self.redis = redis_client
        self.prefix = prefix

    def is_valid_word(self, word: str) -> bool:
        """Return True when a ``word:`` key exists for ``word``."""
        # EXISTS returns the number of matching keys, not a bool.
        return bool(self.redis.exists(f"{self.prefix}word:{word}"))

    def is_valid_syllable(self, syllable: str) -> bool:
        """Return True when a ``syl:`` key exists for ``syllable``."""
        return bool(self.redis.exists(f"{self.prefix}syl:{syllable}"))

    def get_word_frequency(self, word: str) -> int:
        """Return the stored frequency of ``word``, or 0 when absent."""
        freq = self.redis.get(f"{self.prefix}freq:{word}")
        return int(freq) if freq else 0

    def get_bigram_probability(self, word1: str, word2: str) -> float:
        """Return the stored probability of ``word1 word2``, or 0.0 when absent."""
        prob = self.redis.get(f"{self.prefix}bigram:{word1}:{word2}")
        return float(prob) if prob else 0.0

    def get_trigram_probability(self, word1: str, word2: str, word3: str) -> float:
        """Return the stored probability of the trigram, or 0.0 when absent."""
        prob = self.redis.get(f"{self.prefix}trigram:{word1}:{word2}:{word3}")
        return float(prob) if prob else 0.0

    def get_top_continuations(self, prev_word: str, limit: int = 20) -> List[Tuple[str, float]]:
        """Return up to ``limit`` words that follow ``prev_word``, best first.

        Candidates are discovered by scanning the ``bigram:<prev_word>:*``
        keys and are ranked by their stored bigram probability.
        """
        if limit <= 0:
            return []

        key_prefix = f"{self.prefix}bigram:{prev_word}:"
        # NOTE: prev_word is used verbatim inside the MATCH glob pattern, so
        # glob metacharacters (*, ?, [, ]) in it would widen the match.
        pattern = f"{key_prefix}*"

        results = []
        cursor = 0
        # Scan to completion: stopping once `limit` keys were seen would
        # return an arbitrary subset rather than the top entries.
        while True:
            cursor, keys = self.redis.scan(cursor=cursor, match=pattern, count=100)
            for key in keys:
                # Keys are bytes unless the client uses decode_responses=True.
                if isinstance(key, bytes):
                    key = key.decode()
                word = key[len(key_prefix):]
                results.append((word, self.get_bigram_probability(prev_word, word)))
            if cursor == 0:
                break

        results.sort(key=lambda item: item[1], reverse=True)
        return results[:limit]

    def close(self) -> None:
        """Close the underlying Redis client."""
        self.redis.close()
# Usage
# Connect to Redis on localhost:6379 (db 0) and wrap the client in the provider.
redis_client = redis.Redis(host='localhost', port=6379, db=0)
provider = RedisProvider(redis_client)
# NOTE: SpellChecker must already be imported; this snippet does not show the import.
checker = SpellChecker(provider=provider)
REST API Provider Example
import requests
from myspellchecker.providers import DictionaryProvider
class APIProvider(DictionaryProvider):
    """Dictionary provider using REST API backend."""

    def __init__(self, base_url: str, api_key: Optional[str] = None, timeout: float = 10.0):
        """Prepare an HTTP session for the dictionary service.

        Args:
            base_url: Root URL of the service; a trailing slash is removed.
            api_key: Optional token sent as ``Authorization: Bearer <key>``.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.session = requests.Session()
        if api_key:
            self.session.headers["Authorization"] = f"Bearer {api_key}"

    def _get(self, resource: str, word: str):
        """GET ``<base_url>/<resource>/<word>`` and return the response."""
        from urllib.parse import quote

        # Percent-encode the word so non-ASCII text, "/" or "?" cannot change
        # the request path.
        url = f"{self.base_url}/{resource}/{quote(word, safe='')}"
        # Without a timeout, requests may block indefinitely on a stalled server.
        return self.session.get(url, timeout=self.timeout)

    def is_valid_word(self, word: str) -> bool:
        """Return True when the service answers 200 for ``/words/<word>``."""
        return self._get("words", word).status_code == 200

    def get_word_frequency(self, word: str) -> int:
        """Return the ``frequency`` field for ``word``, or 0 when unavailable."""
        response = self._get("frequency", word)
        if response.status_code == 200:
            return response.json().get("frequency", 0)
        return 0

    # ... implement other methods similarly
Custom Validation Strategies
The ContextValidator uses a strategy pattern for extensible validation.
Creating a Custom Strategy
from myspellchecker.core.validation_strategies import ValidationStrategy, ValidationContext
from myspellchecker.core.response import Error
from typing import List
class ProfanityFilterStrategy(ValidationStrategy):
    """Strategy to detect and flag profanity."""

    def __init__(self, blocked_words: List[str]):
        """Store the blocked words for case-insensitive matching.

        Args:
            blocked_words: Words to flag; matching ignores case.
        """
        # validate() compares against word.lower(), so the stored entries
        # must be lower-cased too or mixed-case entries would never match.
        self.blocked_words = {word.lower() for word in blocked_words}

    def priority(self) -> int:
        """Lower values run first. Default strategies use 10-70."""
        # 35 is the open slot for custom strategies: after POS (30) and
        # before question-particle validation (40).
        return 35

    def validate(self, context: ValidationContext) -> List[Error]:
        """Return one ``profanity`` error for every blocked word in ``context``."""
        return [
            Error(
                text=word,
                position=context.word_positions[i],
                suggestions=["[redacted]"],
                error_type="profanity",
                confidence=1.0,
            )
            for i, word in enumerate(context.words)
            if word.lower() in self.blocked_words
        ]
# Register the strategy
from myspellchecker.core.context_validator import ContextValidator
strategies = [
ProfanityFilterStrategy(["bad_word1", "bad_word2"]),
# ... other strategies
]
# NOTE: `config` and `segmenter` are not defined in this snippet; create them first.
validator = ContextValidator(config, segmenter, strategies=strategies)
Strategy Priority Guidelines
| Priority | Category | Description |
|---|---|---|
| 10 | Tone | Tone mark disambiguation |
| 15 | Orthography | Medial order and compatibility validation |
| 20 | Syntactic | Grammar rule validation |
| 25 | Broken Compound | Broken compound detection |
| 30 | POS | Part-of-speech sequence validation |
| 35 | Custom | Custom validation strategies (use 35 or other open slot) |
| 40 | Question | Question particle validation |
| 45 | Homophone | Homophone confusion detection |
| 47 | Confusable | Confusable variant detection |
| 48 | Confusable Semantic | Confusable semantic detection (AI) |
| 50 | N-gram | Statistical context validation |
| 70 | Semantic | AI-powered semantic validation |
Factory Pattern Usage
mySpellChecker uses factories for creating configured components.
Using Component Factory
from myspellchecker.core.component_factory import ComponentFactory
from myspellchecker.core.config import SpellCheckerConfig
config = SpellCheckerConfig(
max_edit_distance=2,
use_phonetic=True,
use_context_checker=True,
)
factory = ComponentFactory(config)
# NOTE: `provider` and `segmenter` are not defined in this snippet; create them first.
# Create individual components
symspell = factory.create_symspell(provider)
# Presumably builds every component in one call - confirm against ComponentFactory.
components = factory.create_all(provider, segmenter)
POS Tagger Factory
from myspellchecker.algorithms.pos_tagger_factory import POSTaggerFactory
from myspellchecker.core.config import POSTaggerConfig
# Create rule-based tagger (fast, no dependencies)
# NOTE: each call below rebinds `tagger`; keep only the variant you need.
tagger = POSTaggerFactory.create("rule_based")
# Create Viterbi tagger (better accuracy)
# NOTE: `provider` is not defined in this snippet; create it first.
tagger = POSTaggerFactory.create("viterbi", provider=provider)
# Create transformer tagger (best accuracy, requires torch)
tagger = POSTaggerFactory.create(
"transformer",
model_name="chuuhtetnaing/myanmar-pos-model",
device=0 # GPU
)
Advanced Configuration Patterns
Environment-Based Configuration
from myspellchecker.core.config.loader import ConfigLoader
# Create a ConfigLoader, then load the "production" profile through it.
loader = ConfigLoader()
# Load from profile with environment overrides
config = loader.load(
profile="production",
use_env=True, # Read MYSPELL_* environment variables
)
# Available environment variables:
# MYSPELL_MAX_EDIT_DISTANCE=3
# MYSPELL_MAX_SUGGESTIONS=10
# MYSPELL_USE_CONTEXT_CHECKER=true
# MYSPELL_DATABASE_PATH=/path/to/custom.db
Profile-Based Configuration
from myspellchecker.core.config.loader import load_config
# Fast profile - optimized for speed
# NOTE: each call below rebinds `config`; pick the single profile you need.
config = load_config(profile="fast")
# Accurate profile - maximum accuracy
config = load_config(profile="accurate")
# Production profile - balanced
config = load_config(profile="production")
Programmatic Configuration
from myspellchecker.core.config import (
SpellCheckerConfig,
SymSpellConfig,
NgramContextConfig,
POSTaggerConfig,
)
# Top-level options are passed directly; per-component options go in the
# nested SymSpellConfig / NgramContextConfig / POSTaggerConfig objects.
config = SpellCheckerConfig(
max_edit_distance=2,
max_suggestions=10,
use_phonetic=True,
use_context_checker=True,
symspell=SymSpellConfig(
prefix_length=10,
beam_width=150,
),
ngram_context=NgramContextConfig(
bigram_threshold=0.0005,
trigram_threshold=0.00005,
),
pos_tagger=POSTaggerConfig(
tagger_type="transformer",
model_name="chuuhtetnaing/myanmar-pos-model",
),
)
# NOTE: SpellChecker must already be imported; this snippet does not show the import.
checker = SpellChecker(config=config)
Configuration from File
# myspellchecker.yaml
# Nested keys must be indented under their section; without the indentation
# they are parsed as top-level keys and the sections are left empty.
preset: production
max_edit_distance: 2
max_suggestions: 10
symspell:
  prefix_length: 10
  beam_width: 150
ngram_context:
  bigram_threshold: 0.0005
  trigram_threshold: 0.00005
pos_tagger:
  tagger_type: transformer
  model_name: chuuhtetnaing/myanmar-pos-model
from myspellchecker.core.config.loader import load_config
# Load the settings from the myspellchecker.yaml file shown above.
config = load_config(config_file="myspellchecker.yaml")