Overview
mySpellChecker is designed for extensibility at multiple levels:

| Extension Point | Purpose | Complexity |
|---|---|---|
| Configuration | Adjust behavior | Low |
| Dictionary Provider | Custom storage | Medium |
| Validator | Custom validation | Medium |
| Segmenter | Custom tokenization | Medium |
| POS Tagger | Custom tagging | High |
| Algorithm | Custom algorithms | High |
Configuration Extensions
Custom Configuration
Copy
Ask AI
from myspellchecker import SpellChecker
from myspellchecker.core.config import SpellCheckerConfig
from myspellchecker.core.constants import ValidationLevel
# Create custom configuration
config = SpellCheckerConfig(
    # Feature settings
    use_context_checker=True,
    use_rule_based_validation=True,
    # Performance settings
    max_edit_distance=2,
    max_suggestions=5,
    # Feature toggles
    use_phonetic=True,
    # Semantic checking is disabled by default (no model_path configured)
)
checker = SpellChecker(config=config)
# Validation level is specified per-check, not in the config
result = checker.check(text, level=ValidationLevel.WORD)
Environment-Based Configuration
Copy
Ask AI
import os
from myspellchecker import SpellChecker
from myspellchecker.core.config import SpellCheckerConfig
from myspellchecker.core.constants import ValidationLevel
def get_config_from_env():
    """Create configuration from environment variables.

    Reads SPELLCHECK_EDIT_DIST and SPELLCHECK_MAX_SUGGEST (defaulting to
    "2" and "5"). Note: int() raises ValueError on non-numeric values.
    """
    return SpellCheckerConfig(
        max_edit_distance=int(os.getenv("SPELLCHECK_EDIT_DIST", "2")),
        max_suggestions=int(os.getenv("SPELLCHECK_MAX_SUGGEST", "5")),
    )
# Validation level from environment. ValidationLevel[...] raises KeyError
# if the variable names an unknown level.
level = ValidationLevel[os.getenv("SPELLCHECK_LEVEL", "WORD")]
checker = SpellChecker(config=get_config_from_env())
result = checker.check(text, level=level)
Custom Dictionary Providers
Built-in Providers
mySpellChecker includes four built-in providers:

| Provider | Use Case | Performance | Persistence |
|---|---|---|---|
| SQLiteProvider | Production | Fast (indexed) | Disk |
| MemoryProvider | Testing, small apps | Fastest (O(1)) | None |
| JSONProvider | Testing, config | Medium | File |
| CSVProvider | Testing, import | Medium | File |
Copy
Ask AI
from myspellchecker.providers import (
SQLiteProvider,
MemoryProvider,
JSONProvider,
CSVProvider,
DictionaryProvider, # Base class for custom providers
)
# Using SQLiteProvider (default for production)
provider = SQLiteProvider(database_path="myspell.db")

# Using MemoryProvider (ideal for testing) — seed data is passed inline
provider = MemoryProvider(
    syllables={"မြန်": 1500, "မာ": 2300},
    words={"မြန်မာ": 850},
    bigrams={("မြန်မာ", "နိုင်ငံ"): 0.45},
)

# Using the DictionaryProvider.create() factory method
provider = DictionaryProvider.create("memory", syllables={"မြန်": 100})
Provider Interface
To create a custom provider, extend `DictionaryProvider` and implement all abstract methods:
Copy
Ask AI
from abc import ABC, abstractmethod
from typing import Dict, Iterator, List, Optional, Tuple
from myspellchecker.providers.base import DictionaryProvider
class DictionaryProvider(ABC):
    """Abstract interface every dictionary backend must implement.

    Method groups map onto the checker's validation layers: core existence
    checks (Layers 1 & 2), frequency data for suggestion ranking, POS data
    for grammar checks, n-gram probabilities for context checks, and bulk
    iteration used when building the SymSpell index.
    """

    # === Core Validation (Layer 1 & 2) ===

    @abstractmethod
    def is_valid_syllable(self, syllable: str) -> bool:
        """Return True if *syllable* exists in the dictionary."""

    @abstractmethod
    def is_valid_word(self, word: str) -> bool:
        """Return True if *word* exists in the dictionary."""

    # === Frequency Data (for suggestion ranking) ===

    @abstractmethod
    def get_syllable_frequency(self, syllable: str) -> int:
        """Return the corpus frequency of *syllable* (0 if not found)."""

    @abstractmethod
    def get_word_frequency(self, word: str) -> int:
        """Return the corpus frequency of *word* (0 if not found)."""

    # === POS Data (Layer 2.5 Grammar) ===

    @abstractmethod
    def get_word_pos(self, word: str) -> Optional[str]:
        """Return the POS tag for *word*, or None if not found."""

    # === N-gram Data (Layer 3 Context) ===

    @abstractmethod
    def get_bigram_probability(self, prev_word: str, current_word: str) -> float:
        """Return P(current_word | prev_word)."""

    @abstractmethod
    def get_trigram_probability(self, w1: str, w2: str, w3: str) -> float:
        """Return P(w3 | w1, w2)."""

    @abstractmethod
    def get_top_continuations(self, prev_word: str, limit: int = 20) -> List[Tuple[str, float]]:
        """Return up to *limit* (word, probability) pairs most likely to follow *prev_word*."""

    # === POS Probability Tables (for Viterbi tagger) ===

    @abstractmethod
    def get_pos_unigram_probabilities(self) -> Dict[str, float]:
        """Return the full POS unigram probability table."""

    @abstractmethod
    def get_pos_bigram_probabilities(self) -> Dict[Tuple[str, str], float]:
        """Return the full POS bigram probability table."""

    @abstractmethod
    def get_pos_trigram_probabilities(self) -> Dict[Tuple[str, str, str], float]:
        """Return the full POS trigram probability table."""

    # === Iteration (for SymSpell index building) ===

    @abstractmethod
    def get_all_syllables(self) -> Iterator[Tuple[str, int]]:
        """Yield every (syllable, frequency) pair."""

    @abstractmethod
    def get_all_words(self) -> Iterator[Tuple[str, int]]:
        """Yield every (word, frequency) pair."""
Example: Redis Provider
Copy
Ask AI
import redis
import json
from typing import Dict, Iterator, List, Optional, Tuple
from myspellchecker.providers.base import DictionaryProvider
class RedisProvider(DictionaryProvider):
    """Redis-backed dictionary provider for distributed deployments.

    Assumed key layout (confirm against your data loader):
      hashes  ``syllables`` / ``words``        field -> frequency
      hash    ``word_pos``                     word -> POS tag
      hashes  ``bigrams`` / ``trigrams``       colon-joined words -> probability
      zsets   ``continuations:<word>``         member word scored by probability
      hashes  ``pos_unigrams/bigrams/trigrams``  colon-joined tags -> probability
    """

    def __init__(self, host: str = "localhost", port: int = 6379, db: int = 0):
        # decode_responses=True makes the client return str instead of bytes.
        self.client = redis.Redis(host=host, port=port, db=db, decode_responses=True)

    # === Core Validation ===

    def is_valid_syllable(self, syllable: str) -> bool:
        return self.client.hexists("syllables", syllable)

    def is_valid_word(self, word: str) -> bool:
        return self.client.hexists("words", word)

    # === Frequency Data ===

    def get_syllable_frequency(self, syllable: str) -> int:
        freq = self.client.hget("syllables", syllable)
        return int(freq) if freq else 0

    def get_word_frequency(self, word: str) -> int:
        freq = self.client.hget("words", word)
        return int(freq) if freq else 0

    # === POS Data ===

    def get_word_pos(self, word: str) -> Optional[str]:
        # hget already returns None for a missing field.
        return self.client.hget("word_pos", word)

    # === N-gram Data ===

    def get_bigram_probability(self, prev_word: str, current_word: str) -> float:
        key = f"{prev_word}:{current_word}"
        prob = self.client.hget("bigrams", key)
        return float(prob) if prob else 0.0

    def get_trigram_probability(self, w1: str, w2: str, w3: str) -> float:
        key = f"{w1}:{w2}:{w3}"
        prob = self.client.hget("trigrams", key)
        return float(prob) if prob else 0.0

    def get_top_continuations(self, prev_word: str, limit: int = 20) -> List[Tuple[str, float]]:
        # Redis sorted set gives an efficient top-N by descending score.
        results = self.client.zrevrange(f"continuations:{prev_word}", 0, limit - 1, withscores=True)
        return [(word, score) for word, score in results]

    # === POS Probabilities ===

    def get_pos_unigram_probabilities(self) -> Dict[str, float]:
        data = self.client.hgetall("pos_unigrams")
        return {k: float(v) for k, v in data.items()}

    def get_pos_bigram_probabilities(self) -> Dict[Tuple[str, str], float]:
        data = self.client.hgetall("pos_bigrams")
        # maxsplit=1 guarantees a 2-tuple even if a tag itself contains ":".
        return {tuple(k.split(":", 1)): float(v) for k, v in data.items()}

    def get_pos_trigram_probabilities(self) -> Dict[Tuple[str, str, str], float]:
        data = self.client.hgetall("pos_trigrams")
        # maxsplit=2 guarantees a 3-tuple even if a tag itself contains ":".
        return {tuple(k.split(":", 2)): float(v) for k, v in data.items()}

    # === Iteration ===

    def get_all_syllables(self) -> Iterator[Tuple[str, int]]:
        # hscan_iter streams the hash incrementally instead of one huge HGETALL.
        for syllable, freq in self.client.hscan_iter("syllables"):
            yield syllable, int(freq)

    def get_all_words(self) -> Iterator[Tuple[str, int]]:
        for word, freq in self.client.hscan_iter("words"):
            yield word, int(freq)
# Usage
from myspellchecker import SpellChecker
redis_provider = RedisProvider(host="redis.example.com")
checker = SpellChecker(provider=redis_provider)
Example: API Provider
Copy
Ask AI
import requests
from typing import Dict, Iterator, List, Optional, Tuple
from myspellchecker.providers.base import DictionaryProvider
class APIProvider(DictionaryProvider):
    """API-backed dictionary provider for microservice architectures.

    All lookups are delegated to a remote HTTP service. Transport errors and
    non-200 responses degrade gracefully to "not found" values instead of
    raising, so a flaky backend never crashes the checker.
    """

    # Page size used by the paginated iteration endpoints.
    PAGE_SIZE = 1000
    # Per-request timeout in seconds; requests hangs indefinitely without one.
    REQUEST_TIMEOUT = 5.0

    def __init__(self, base_url: str, api_key: Optional[str] = None):
        self.base_url = base_url.rstrip("/")
        self.headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
        self._session = requests.Session()
        self._session.headers.update(self.headers)

    def _get(self, endpoint: str, params: dict = None):
        """GET a JSON payload; return None on any transport or HTTP error."""
        try:
            response = self._session.get(
                f"{self.base_url}{endpoint}",
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            # Graceful degradation: network failure is treated as "no data".
            return None
        return response.json() if response.status_code == 200 else None

    # === Core Validation ===

    def is_valid_syllable(self, syllable: str) -> bool:
        result = self._get(f"/syllables/{syllable}/exists")
        return result.get("exists", False) if result else False

    def is_valid_word(self, word: str) -> bool:
        result = self._get(f"/words/{word}/exists")
        return result.get("exists", False) if result else False

    # === Frequency Data ===

    def get_syllable_frequency(self, syllable: str) -> int:
        result = self._get(f"/syllables/{syllable}")
        return result.get("frequency", 0) if result else 0

    def get_word_frequency(self, word: str) -> int:
        result = self._get(f"/words/{word}")
        return result.get("frequency", 0) if result else 0

    # === POS Data ===

    def get_word_pos(self, word: str) -> Optional[str]:
        result = self._get(f"/words/{word}/pos")
        return result.get("pos") if result else None

    # === N-gram Data ===

    def get_bigram_probability(self, prev_word: str, current_word: str) -> float:
        result = self._get("/bigrams", params={"w1": prev_word, "w2": current_word})
        return result.get("probability", 0.0) if result else 0.0

    def get_trigram_probability(self, w1: str, w2: str, w3: str) -> float:
        result = self._get("/trigrams", params={"w1": w1, "w2": w2, "w3": w3})
        return result.get("probability", 0.0) if result else 0.0

    def get_top_continuations(self, prev_word: str, limit: int = 20) -> List[Tuple[str, float]]:
        result = self._get(f"/continuations/{prev_word}", params={"limit": limit})
        if result:
            return [(item["word"], item["probability"]) for item in result.get("items", [])]
        return []

    # === POS Probabilities ===

    def get_pos_unigram_probabilities(self) -> Dict[str, float]:
        result = self._get("/pos/unigrams")
        return result.get("probabilities", {}) if result else {}

    def get_pos_bigram_probabilities(self) -> Dict[Tuple[str, str], float]:
        result = self._get("/pos/bigrams")
        if result:
            # maxsplit=1 guarantees a 2-tuple even if a tag contains ":".
            return {tuple(k.split(":", 1)): v for k, v in result.get("probabilities", {}).items()}
        return {}

    def get_pos_trigram_probabilities(self) -> Dict[Tuple[str, str, str], float]:
        result = self._get("/pos/trigrams")
        if result:
            # maxsplit=2 guarantees a 3-tuple even if a tag contains ":".
            return {tuple(k.split(":", 2)): v for k, v in result.get("probabilities", {}).items()}
        return {}

    # === Iteration ===

    def get_all_syllables(self) -> Iterator[Tuple[str, int]]:
        # Walk the paginated endpoint until an empty page is returned.
        offset = 0
        while True:
            result = self._get("/syllables", params={"offset": offset, "limit": self.PAGE_SIZE})
            if not result or not result.get("items"):
                break
            for item in result["items"]:
                yield item["syllable"], item["frequency"]
            offset += self.PAGE_SIZE

    def get_all_words(self) -> Iterator[Tuple[str, int]]:
        offset = 0
        while True:
            result = self._get("/words", params={"offset": offset, "limit": self.PAGE_SIZE})
            if not result or not result.get("items"):
                break
            for item in result["items"]:
                yield item["word"], item["frequency"]
            offset += self.PAGE_SIZE
Custom Validators
Validator Interface
Copy
Ask AI
from abc import ABC, abstractmethod
from typing import List
from myspellchecker.core.response import Error
class Validator(ABC):
    """Abstract interface for text validators."""

    @abstractmethod
    def validate(self, text: str) -> List[Error]:
        """Validate *text* and return the list of errors found (empty if clean)."""
Example: Domain-Specific Validator
Copy
Ask AI
from typing import List
from myspellchecker.core.response import WordError
class MedicalTermValidator:
    """Validator for medical terminology.

    Flags words that look like medical terms but are absent from a
    user-supplied term list.
    """

    def __init__(self, terms_file: str):
        self.medical_terms = self._load_terms(terms_file)

    def _load_terms(self, file_path: str) -> set:
        """Load one term per line; blank lines are ignored."""
        with open(file_path, "r", encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}

    def validate(self, text: str, words: List[str]) -> List[WordError]:
        """Return a WordError for each suspect medical term in *words*.

        Bug fix: the original called ``self._get_position()``, which was never
        defined (AttributeError at runtime). Positions are now computed with a
        running cursor into *text*, so repeated words map to successive
        occurrences; a word not found in *text* gets position -1.
        """
        errors = []
        cursor = 0  # next search offset into text
        for word in words:
            position = text.find(word, cursor)
            if position >= 0:
                cursor = position + len(word)
            if self._looks_like_medical_term(word) and word not in self.medical_terms:
                errors.append(WordError(
                    text=word,
                    position=position,
                    suggestions=self._find_similar_terms(word),
                    confidence=0.8,  # lower confidence for domain-specific validation
                ))
        return errors

    def _looks_like_medical_term(self, word: str) -> bool:
        """Heuristic: suffix match against common medical word endings."""
        # str.endswith accepts a tuple of candidates — one call, no loop.
        return word.endswith(("disease", "medicine", "surgery"))

    def _find_similar_terms(self, word: str) -> List[str]:
        """Placeholder: plug in an edit-distance search over self.medical_terms."""
        return []
Integrating Custom Validator
Copy
Ask AI
from myspellchecker import SpellChecker
class ExtendedSpellChecker(SpellChecker):
    """SpellChecker that runs extra validators after the standard pipeline."""

    def __init__(self, *args, custom_validators=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Avoid the mutable-default pitfall: None becomes a fresh list.
        self.custom_validators = custom_validators or []

    def check(self, text: str, **kwargs):
        """Run the standard check, then append custom-validator errors.

        Fix: the original override dropped keyword arguments, but
        SpellChecker.check is documented as accepting e.g. ``level=...``;
        forwarding **kwargs keeps the override signature-compatible.
        """
        result = super().check(text, **kwargs)
        for validator in self.custom_validators:
            result.errors.extend(validator.validate(text))
        return result
# Usage
medical_validator = MedicalTermValidator("medical_terms.txt")
checker = ExtendedSpellChecker(custom_validators=[medical_validator])
Custom Segmenters
Segmenter Interface
Copy
Ask AI
from abc import ABC, abstractmethod
from typing import List
class Segmenter(ABC):
    """Abstract interface for text segmenters."""

    @abstractmethod
    def segment_syllables(self, text: str) -> List[str]:
        """Split *text* into a list of syllables."""

    @abstractmethod
    def segment_words(self, text: str) -> List[str]:
        """Split *text* into a list of words."""
Example: ML-Based Segmenter
Copy
Ask AI
from typing import List
import torch
class NeuralSegmenter:
    """Neural network-based segmenter."""

    def __init__(self, model_path: str):
        # NOTE(review): torch.load of a full model object deserializes pickled
        # code — only load model files from trusted sources.
        self.model = torch.load(model_path)
        # Bug fix: set_mode_inference() is not a torch API. eval() is the real
        # call that switches the module to inference mode (disables dropout
        # and batch-norm statistic updates).
        self.model.eval()

    def segment_syllables(self, text: str) -> List[str]:
        """Predict syllable boundaries with the neural model."""
        with torch.no_grad():  # inference only: skip gradient bookkeeping
            # Tokenize input for the model
            tokens = self._tokenize(text)
            # Predict boundary positions
            predictions = self.model(tokens)
            # Convert predictions back into syllable strings
            return self._predictions_to_syllables(text, predictions)

    def segment_words(self, text: str) -> List[str]:
        """Predict word boundaries (mirror of segment_syllables)."""
        # Placeholder in this example.

    def _tokenize(self, text: str):
        """Convert raw text into the model's input representation."""
        # Placeholder in this example.

    def _predictions_to_syllables(self, text: str, predictions) -> List[str]:
        """Convert model output into a list of syllables."""
        # Placeholder in this example.
Custom POS Taggers
POS Tagger Interface
Copy
Ask AI
from abc import ABC, abstractmethod
from typing import List, Tuple
class POSTaggerBase(ABC):
    """Abstract interface for POS taggers."""

    @abstractmethod
    def tag_word(self, word: str) -> str:
        """Return the POS tag for a single word."""

    @abstractmethod
    def tag_sequence(self, words: List[str]) -> List[str]:
        """Return one POS tag per word in *words*."""
Example: Transformer-Based Tagger
Copy
Ask AI
from typing import List, Tuple
from transformers import AutoModelForTokenClassification, AutoTokenizer
class TransformerPOSTagger:
    """Transformer-based POS tagger."""

    def __init__(self, model_name: str, device: int = -1):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.device = "cuda" if device >= 0 else "cpu"
        self.model.to(self.device)
        self.model.eval()  # inference mode: disables dropout

    def tag_word(self, word: str) -> str:
        """Tag a single word; falls back to "N" if tagging yields nothing."""
        tags = self.tag_sequence([word])
        return tags[0] if tags else "N"

    def tag_sequence(self, words: List[str]) -> List[str]:
        """Tag a pre-segmented word sequence, one tag per input word.

        Bug fix: the model emits one label per *subword token* (plus special
        tokens such as [CLS]/[SEP]), so zipping the input words directly with
        the raw token predictions misaligns the tags. We tokenize with
        ``is_split_into_words=True`` and map token predictions back to words
        through ``word_ids()`` (requires a fast tokenizer — confirm for your
        model), keeping the first subword's label for each word.
        """
        import torch  # local import: this snippet has no module-level torch import

        inputs = self.tokenizer(
            words,
            is_split_into_words=True,
            return_tensors="pt",
            truncation=True,
        ).to(self.device)
        with torch.no_grad():  # no gradients needed at inference time
            outputs = self.model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)[0]
        tags: List[str] = []
        previous_word_idx = None
        for token_idx, word_idx in enumerate(inputs.word_ids(batch_index=0)):
            # Skip special tokens (word_idx is None) and subword continuations.
            if word_idx is None or word_idx == previous_word_idx:
                continue
            tags.append(self.model.config.id2label[predictions[token_idx].item()])
            previous_word_idx = word_idx
        return tags
Custom Grammar Rules
Grammar Rule Interface
Copy
Ask AI
from abc import ABC, abstractmethod
from typing import List, Tuple
from myspellchecker.core.response import Error
class GrammarRule(ABC):
    """Abstract interface for grammar rules."""

    @abstractmethod
    def check(
        self,
        words: List[str],
        tags: List[str],
    ) -> List[Error]:
        """Apply the rule to (words, tags) and return any errors found."""
Example: Custom Grammar Rule
Copy
Ask AI
from typing import List
from myspellchecker.core.response import GrammarError
class FormalRegisterRule:
    """Check for formal/informal register consistency."""

    FORMAL_PARTICLES = {"formal1", "formal2", "formal3"}
    INFORMAL_PARTICLES = {"informal1", "informal2", "informal3"}

    def check(
        self,
        words: List[str],
        tags: List[str],
        text: str,
    ) -> List[GrammarError]:
        """Flag texts that mix formal and informal particles.

        Returns at most one error, anchored at the first informal particle.
        """
        has_formal = any(w in self.FORMAL_PARTICLES for w in words)
        has_informal = any(w in self.INFORMAL_PARTICLES for w in words)
        if not (has_formal and has_informal):
            return []
        # Bug fix: the original reused the loop variable `w` after the loop,
        # relying on leaked loop-scope state; capture the offending word
        # explicitly instead. has_informal guarantees next() finds a match.
        offending = next(w for w in words if w in self.INFORMAL_PARTICLES)
        # Guard against str.find returning -1 when the segmented word does not
        # appear verbatim in the raw text.
        position = max(text.find(offending), 0)
        return [GrammarError(
            text=offending,
            position=position,
            suggestions=["Use consistent register throughout"],
            error_type="register_consistency",
            confidence=0.9,
            reason="Mixed formal and informal register detected",
        )]
Plugin Architecture
Note: The plugin system is a planned feature for future versions. The API shown below is subject to change.
Creating a Plugin
Copy
Ask AI
# my_plugin/__init__.py
# NOTE: Plugin base class will be available in a future release
from myspellchecker.plugins import Plugin # Planned
class MyPlugin(Plugin):
    """Custom spell checker plugin (illustrates the planned hook API)."""

    name = "my_plugin"
    version = "1.0.0"

    def __init__(self, config: dict = None):
        # None becomes an empty dict; a provided dict is kept by reference.
        self.config = config or {}

    def on_load(self, checker):
        """Called once when the plugin is attached to a checker."""
        print(f"Loading {self.name} v{self.version}")

    def on_check_start(self, text: str):
        """Hook invoked before a check run begins."""

    def on_check_end(self, text: str, result):
        """Hook invoked after a check run completes."""

    def on_error_found(self, error):
        """Hook invoked for each error as it is found."""
Registering a Plugin (Planned)
Copy
Ask AI
# NOTE: register_plugin() is not yet implemented. This shows the planned API.
from myspellchecker import SpellChecker
from my_plugin import MyPlugin
checker = SpellChecker()
checker.register_plugin(MyPlugin(config={"option": "value"})) # Planned API
Best Practices
Extension Guidelines
- Follow interfaces: Implement all required methods
- Handle errors: Graceful degradation on failures
- Document behavior: Clear docstrings and examples
- Test thoroughly: Unit and integration tests
- Consider performance: Profile extensions
Performance Considerations
Copy
Ask AI
# Good: Cache expensive operations
class CachedProvider(DictionaryProvider):
    """Memoizing wrapper around another provider.

    NOTE(review): the cache is unbounded; for long-running processes with an
    open-ended vocabulary, bound it (e.g. an LRU structure) to avoid
    unbounded memory growth.
    """

    def __init__(self, base_provider):
        self.base = base_provider
        self.cache = {}  # word -> cached validity result

    def is_valid_word(self, text: str) -> bool:
        """Return the cached validity, computing it once on first use."""
        if text not in self.cache:
            self.cache[text] = self.base.is_valid_word(text)
        return self.cache[text]
# Bad: No caching
# Bad: No caching — deliberately kept as the anti-pattern counterexample
class SlowProvider(DictionaryProvider):
    def is_valid_word(self, text: str) -> bool:
        return expensive_lookup(text)  # Called every time
Error Handling
Copy
Ask AI
class SafeProvider(DictionaryProvider):
    """Provider that degrades gracefully instead of propagating backend errors."""

    def lookup_word(self, text: str):
        """Look *text* up via the backend; return None on any failure."""
        import logging  # local import: this snippet has no module-level imports

        try:
            return self._do_lookup(text)
        except ConnectionError:
            # Graceful degradation: backend unreachable, treat as missing.
            return None
        except Exception as e:
            # Log (with traceback) and continue. Lazy %-style args avoid
            # building the message when the log level filters it out.
            logging.exception("Lookup failed: %s", e)
            return None
See Also
- Architecture Overview - System design
- Component Diagram - Component relationships
- API Reference - Detailed API documentation