class SchemaManager:
"""Manages database schema for spell checker dictionaries.
Provides centralized definitions for all tables and indexes,
ensuring consistency across dictionary builds.
Args:
conn: SQLite database connection
cursor: SQLite database cursor
console: Optional PipelineConsole for output
"""
# Table definitions
TABLES = {
"syllables": '''
CREATE TABLE IF NOT EXISTS syllables (
id INTEGER PRIMARY KEY AUTOINCREMENT,
syllable TEXT UNIQUE NOT NULL,
frequency INTEGER DEFAULT 0
)
''',
"words": '''
CREATE TABLE IF NOT EXISTS words (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT UNIQUE NOT NULL,
syllable_count INTEGER,
frequency INTEGER DEFAULT 0,
pos_tag TEXT,
is_curated INTEGER DEFAULT 0,
inferred_pos TEXT,
inferred_confidence REAL,
inferred_source TEXT
)
''',
"bigrams": '''
CREATE TABLE IF NOT EXISTS bigrams (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word1_id INTEGER,
word2_id INTEGER,
probability REAL DEFAULT 0.0,
count INTEGER DEFAULT 0,
FOREIGN KEY(word1_id) REFERENCES words(id),
FOREIGN KEY(word2_id) REFERENCES words(id),
UNIQUE(word1_id, word2_id)
)
''',
"trigrams": '''
CREATE TABLE IF NOT EXISTS trigrams (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word1_id INTEGER,
word2_id INTEGER,
word3_id INTEGER,
probability REAL DEFAULT 0.0,
count INTEGER DEFAULT 0,
FOREIGN KEY(word1_id) REFERENCES words(id),
FOREIGN KEY(word2_id) REFERENCES words(id),
FOREIGN KEY(word3_id) REFERENCES words(id),
UNIQUE(word1_id, word2_id, word3_id)
)
''',
"pos_unigrams": '''
CREATE TABLE IF NOT EXISTS pos_unigrams (
pos TEXT UNIQUE NOT NULL,
probability REAL DEFAULT 0.0
)
''',
"pos_bigrams": '''
CREATE TABLE IF NOT EXISTS pos_bigrams (
pos1 TEXT NOT NULL,
pos2 TEXT NOT NULL,
probability REAL DEFAULT 0.0,
UNIQUE(pos1, pos2)
)
''',
"pos_trigrams": '''
CREATE TABLE IF NOT EXISTS pos_trigrams (
pos1 TEXT NOT NULL,
pos2 TEXT NOT NULL,
pos3 TEXT NOT NULL,
probability REAL DEFAULT 0.0,
UNIQUE(pos1, pos2, pos3)
)
''',
"processed_files": '''
CREATE TABLE IF NOT EXISTS processed_files (
path TEXT PRIMARY KEY,
mtime REAL,
size INTEGER
)
''',
"fourgrams": '''
CREATE TABLE IF NOT EXISTS fourgrams (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word1_id INTEGER,
word2_id INTEGER,
word3_id INTEGER,
word4_id INTEGER,
probability REAL DEFAULT 0.0,
count INTEGER DEFAULT 0,
FOREIGN KEY(word1_id) REFERENCES words(id),
FOREIGN KEY(word2_id) REFERENCES words(id),
FOREIGN KEY(word3_id) REFERENCES words(id),
FOREIGN KEY(word4_id) REFERENCES words(id),
UNIQUE(word1_id, word2_id, word3_id, word4_id)
)
''',
"fivegrams": '''
CREATE TABLE IF NOT EXISTS fivegrams (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word1_id INTEGER,
word2_id INTEGER,
word3_id INTEGER,
word4_id INTEGER,
word5_id INTEGER,
probability REAL DEFAULT 0.0,
count INTEGER DEFAULT 0,
FOREIGN KEY(word1_id) REFERENCES words(id),
FOREIGN KEY(word2_id) REFERENCES words(id),
FOREIGN KEY(word3_id) REFERENCES words(id),
FOREIGN KEY(word4_id) REFERENCES words(id),
FOREIGN KEY(word5_id) REFERENCES words(id),
UNIQUE(word1_id, word2_id, word3_id, word4_id, word5_id)
)
''',
"metadata": '''
CREATE TABLE IF NOT EXISTS metadata (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
)
''',
"confusable_pairs": '''
CREATE TABLE IF NOT EXISTS confusable_pairs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word1 TEXT NOT NULL,
word2 TEXT NOT NULL,
confusion_type TEXT NOT NULL,
context_overlap REAL DEFAULT 0.0,
freq_ratio REAL,
suppress INTEGER DEFAULT 0,
source TEXT DEFAULT 'mined',
UNIQUE(word1, word2, confusion_type)
)
''',
"compound_confusions": '''
CREATE TABLE IF NOT EXISTS compound_confusions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
compound TEXT NOT NULL,
part1 TEXT NOT NULL,
part2 TEXT NOT NULL,
compound_freq INTEGER DEFAULT 0,
split_freq INTEGER DEFAULT 0,
pmi REAL DEFAULT 0.0,
UNIQUE(compound, part1, part2)
)
''',
"collocations": '''
CREATE TABLE IF NOT EXISTS collocations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word1 TEXT NOT NULL,
word2 TEXT NOT NULL,
pmi REAL NOT NULL,
npmi REAL,
count INTEGER NOT NULL,
UNIQUE(word1, word2)
)
''',
"register_tags": '''
CREATE TABLE IF NOT EXISTS register_tags (
word TEXT PRIMARY KEY,
register TEXT NOT NULL,
confidence REAL DEFAULT 0.0,
formal_count INTEGER DEFAULT 0,
informal_count INTEGER DEFAULT 0
)
''',
}
# Index definitions for query optimization
INDEXES = {
"idx_syllables_text": "CREATE INDEX IF NOT EXISTS idx_syllables_text ON syllables(syllable)",
"idx_words_text": "CREATE INDEX IF NOT EXISTS idx_words_text ON words(word)",
"idx_bigrams_w1_w2": "CREATE INDEX IF NOT EXISTS idx_bigrams_w1_w2 ON bigrams(word1_id, word2_id)",
"idx_trigrams_w1_w2_w3": "CREATE INDEX IF NOT EXISTS idx_trigrams_w1_w2_w3 ON trigrams(word1_id, word2_id, word3_id)",
"idx_fourgrams_w1_w2_w3_w4": "CREATE INDEX IF NOT EXISTS idx_fourgrams_w1_w2_w3_w4 ON fourgrams(word1_id, word2_id, word3_id, word4_id)",
"idx_fivegrams_w1_w2_w3_w4_w5": "CREATE INDEX IF NOT EXISTS idx_fivegrams_w1_w2_w3_w4_w5 ON fivegrams(word1_id, word2_id, word3_id, word4_id, word5_id)",
"idx_words_frequency": "CREATE INDEX IF NOT EXISTS idx_words_frequency ON words(frequency)",
"idx_confusable_word1": "CREATE INDEX IF NOT EXISTS idx_confusable_word1 ON confusable_pairs(word1)",
"idx_confusable_word2": "CREATE INDEX IF NOT EXISTS idx_confusable_word2 ON confusable_pairs(word2)",
"idx_compound_word": "CREATE INDEX IF NOT EXISTS idx_compound_word ON compound_confusions(compound)",
"idx_compound_parts": "CREATE INDEX IF NOT EXISTS idx_compound_parts ON compound_confusions(part1, part2)",
"idx_colloc_word1": "CREATE INDEX IF NOT EXISTS idx_colloc_word1 ON collocations(word1)",
"idx_colloc_word2": "CREATE INDEX IF NOT EXISTS idx_colloc_word2 ON collocations(word2)",
}