Skip to main content
All dictionary tables (syllables, words, bigrams, trigrams, and processed-file tracking) are defined in one place and applied through the SchemaManager. This keeps schema evolution consistent across pipeline stages and manual database operations.

Overview

from myspellchecker.data_pipeline.schema_manager import SchemaManager
import sqlite3

# SchemaManager requires connection and cursor
with sqlite3.connect("mydict.db") as conn:
    cursor = conn.cursor()
    manager = SchemaManager(conn, cursor)

    # Create all tables and indexes
    manager.create_schema()

SchemaManager Class

Manages database schema with predefined tables and indexes:
class SchemaManager:
    """Manages database schema for spell checker dictionaries.

    Provides centralized definitions for all tables and indexes,
    ensuring consistency across dictionary builds.

    Args:
        conn: SQLite database connection
        cursor: SQLite database cursor
        console: Optional PipelineConsole for output
    """

    # Table definitions
    TABLES = {
        "syllables": '''
            CREATE TABLE IF NOT EXISTS syllables (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                syllable TEXT UNIQUE NOT NULL,
                frequency INTEGER DEFAULT 0
            )
        ''',

        "words": '''
            CREATE TABLE IF NOT EXISTS words (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word TEXT UNIQUE NOT NULL,
                syllable_count INTEGER,
                frequency INTEGER DEFAULT 0,
                pos_tag TEXT,
                is_curated INTEGER DEFAULT 0,
                inferred_pos TEXT,
                inferred_confidence REAL,
                inferred_source TEXT
            )
        ''',

        "bigrams": '''
            CREATE TABLE IF NOT EXISTS bigrams (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word1_id INTEGER,
                word2_id INTEGER,
                probability REAL DEFAULT 0.0,
                count INTEGER DEFAULT 0,
                FOREIGN KEY(word1_id) REFERENCES words(id),
                FOREIGN KEY(word2_id) REFERENCES words(id),
                UNIQUE(word1_id, word2_id)
            )
        ''',

        "trigrams": '''
            CREATE TABLE IF NOT EXISTS trigrams (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word1_id INTEGER,
                word2_id INTEGER,
                word3_id INTEGER,
                probability REAL DEFAULT 0.0,
                count INTEGER DEFAULT 0,
                FOREIGN KEY(word1_id) REFERENCES words(id),
                FOREIGN KEY(word2_id) REFERENCES words(id),
                FOREIGN KEY(word3_id) REFERENCES words(id),
                UNIQUE(word1_id, word2_id, word3_id)
            )
        ''',

        "pos_unigrams": '''
            CREATE TABLE IF NOT EXISTS pos_unigrams (
                pos TEXT UNIQUE NOT NULL,
                probability REAL DEFAULT 0.0
            )
        ''',

        "pos_bigrams": '''
            CREATE TABLE IF NOT EXISTS pos_bigrams (
                pos1 TEXT NOT NULL,
                pos2 TEXT NOT NULL,
                probability REAL DEFAULT 0.0,
                UNIQUE(pos1, pos2)
            )
        ''',

        "pos_trigrams": '''
            CREATE TABLE IF NOT EXISTS pos_trigrams (
                pos1 TEXT NOT NULL,
                pos2 TEXT NOT NULL,
                pos3 TEXT NOT NULL,
                probability REAL DEFAULT 0.0,
                UNIQUE(pos1, pos2, pos3)
            )
        ''',

        "processed_files": '''
            CREATE TABLE IF NOT EXISTS processed_files (
                path TEXT PRIMARY KEY,
                mtime REAL,
                size INTEGER
            )
        ''',

        "fourgrams": '''
            CREATE TABLE IF NOT EXISTS fourgrams (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word1_id INTEGER,
                word2_id INTEGER,
                word3_id INTEGER,
                word4_id INTEGER,
                probability REAL DEFAULT 0.0,
                count INTEGER DEFAULT 0,
                FOREIGN KEY(word1_id) REFERENCES words(id),
                FOREIGN KEY(word2_id) REFERENCES words(id),
                FOREIGN KEY(word3_id) REFERENCES words(id),
                FOREIGN KEY(word4_id) REFERENCES words(id),
                UNIQUE(word1_id, word2_id, word3_id, word4_id)
            )
        ''',

        "fivegrams": '''
            CREATE TABLE IF NOT EXISTS fivegrams (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word1_id INTEGER,
                word2_id INTEGER,
                word3_id INTEGER,
                word4_id INTEGER,
                word5_id INTEGER,
                probability REAL DEFAULT 0.0,
                count INTEGER DEFAULT 0,
                FOREIGN KEY(word1_id) REFERENCES words(id),
                FOREIGN KEY(word2_id) REFERENCES words(id),
                FOREIGN KEY(word3_id) REFERENCES words(id),
                FOREIGN KEY(word4_id) REFERENCES words(id),
                FOREIGN KEY(word5_id) REFERENCES words(id),
                UNIQUE(word1_id, word2_id, word3_id, word4_id, word5_id)
            )
        ''',

        "metadata": '''
            CREATE TABLE IF NOT EXISTS metadata (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL
            )
        ''',

        "confusable_pairs": '''
            CREATE TABLE IF NOT EXISTS confusable_pairs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word1 TEXT NOT NULL,
                word2 TEXT NOT NULL,
                confusion_type TEXT NOT NULL,
                context_overlap REAL DEFAULT 0.0,
                freq_ratio REAL,
                suppress INTEGER DEFAULT 0,
                source TEXT DEFAULT 'mined',
                UNIQUE(word1, word2, confusion_type)
            )
        ''',

        "compound_confusions": '''
            CREATE TABLE IF NOT EXISTS compound_confusions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                compound TEXT NOT NULL,
                part1 TEXT NOT NULL,
                part2 TEXT NOT NULL,
                compound_freq INTEGER DEFAULT 0,
                split_freq INTEGER DEFAULT 0,
                pmi REAL DEFAULT 0.0,
                UNIQUE(compound, part1, part2)
            )
        ''',

        "collocations": '''
            CREATE TABLE IF NOT EXISTS collocations (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word1 TEXT NOT NULL,
                word2 TEXT NOT NULL,
                pmi REAL NOT NULL,
                npmi REAL,
                count INTEGER NOT NULL,
                UNIQUE(word1, word2)
            )
        ''',

        "register_tags": '''
            CREATE TABLE IF NOT EXISTS register_tags (
                word TEXT PRIMARY KEY,
                register TEXT NOT NULL,
                confidence REAL DEFAULT 0.0,
                formal_count INTEGER DEFAULT 0,
                informal_count INTEGER DEFAULT 0
            )
        ''',
    }

    # Index definitions for query optimization
    INDEXES = {
        "idx_syllables_text": "CREATE INDEX IF NOT EXISTS idx_syllables_text ON syllables(syllable)",
        "idx_words_text": "CREATE INDEX IF NOT EXISTS idx_words_text ON words(word)",
        "idx_bigrams_w1_w2": "CREATE INDEX IF NOT EXISTS idx_bigrams_w1_w2 ON bigrams(word1_id, word2_id)",
        "idx_trigrams_w1_w2_w3": "CREATE INDEX IF NOT EXISTS idx_trigrams_w1_w2_w3 ON trigrams(word1_id, word2_id, word3_id)",
        "idx_fourgrams_w1_w2_w3_w4": "CREATE INDEX IF NOT EXISTS idx_fourgrams_w1_w2_w3_w4 ON fourgrams(word1_id, word2_id, word3_id, word4_id)",
        "idx_fivegrams_w1_w2_w3_w4_w5": "CREATE INDEX IF NOT EXISTS idx_fivegrams_w1_w2_w3_w4_w5 ON fivegrams(word1_id, word2_id, word3_id, word4_id, word5_id)",
        "idx_words_frequency": "CREATE INDEX IF NOT EXISTS idx_words_frequency ON words(frequency)",
        "idx_confusable_word1": "CREATE INDEX IF NOT EXISTS idx_confusable_word1 ON confusable_pairs(word1)",
        "idx_confusable_word2": "CREATE INDEX IF NOT EXISTS idx_confusable_word2 ON confusable_pairs(word2)",
        "idx_compound_word": "CREATE INDEX IF NOT EXISTS idx_compound_word ON compound_confusions(compound)",
        "idx_compound_parts": "CREATE INDEX IF NOT EXISTS idx_compound_parts ON compound_confusions(part1, part2)",
        "idx_colloc_word1": "CREATE INDEX IF NOT EXISTS idx_colloc_word1 ON collocations(word1)",
        "idx_colloc_word2": "CREATE INDEX IF NOT EXISTS idx_colloc_word2 ON collocations(word2)",
    }

Table Schemas

syllables Table

Stores validated Myanmar syllables:
ColumnTypeDescription
idINTEGER PRIMARY KEYAuto-incrementing ID
syllableTEXT UNIQUEThe syllable text
frequencyINTEGERCorpus frequency count
-- Example queries
SELECT * FROM syllables WHERE frequency > 100;
SELECT syllable FROM syllables ORDER BY frequency DESC LIMIT 10;

words Table

Stores dictionary words with POS information:
ColumnTypeDescription
idINTEGER PRIMARY KEYAuto-incrementing ID
wordTEXT UNIQUEThe word text
syllable_countINTEGERNumber of syllables
frequencyINTEGERCorpus frequency count
pos_tagTEXTPOS tag from corpus
is_curatedINTEGERCurated word flag (0/1)
inferred_posTEXTPOS tag from inference
inferred_confidenceREALInference confidence (0.0-1.0)
inferred_sourceTEXTSource of inference
-- Example queries
SELECT * FROM words WHERE pos_tag = 'N' ORDER BY frequency DESC;
SELECT word, inferred_pos FROM words WHERE inferred_confidence > 0.8;

bigrams Table

Stores word bigram probabilities using word IDs:
ColumnTypeDescription
idINTEGER PRIMARY KEYAuto-incrementing ID
word1_idINTEGERForeign key to first word
word2_idINTEGERForeign key to second word
probabilityREALP(word2 | word1)
countINTEGERRaw co-occurrence count
-- Example: Find common word pairs (requires JOIN)
SELECT w1.word, w2.word, b.probability
FROM bigrams b
JOIN words w1 ON b.word1_id = w1.id
JOIN words w2 ON b.word2_id = w2.id
WHERE w1.word = 'မြန်မာ'
ORDER BY b.probability DESC;

trigrams Table

Stores word trigram probabilities using word IDs:
ColumnTypeDescription
idINTEGER PRIMARY KEYAuto-incrementing ID
word1_idINTEGERForeign key to first word
word2_idINTEGERForeign key to second word
word3_idINTEGERForeign key to third word
probabilityREALP(word3 | word1, word2)
countINTEGERRaw co-occurrence count

pos_* Tables

POS tag probability tables for statistical tagging:
-- POS unigrams: Tag probabilities
SELECT pos, probability FROM pos_unigrams ORDER BY probability DESC;

-- POS bigrams: Tag transition probabilities
SELECT pos1, pos2, probability FROM pos_bigrams WHERE pos1 = 'N';

-- POS trigrams: Tag sequence probabilities
SELECT * FROM pos_trigrams WHERE pos1 = 'N' AND pos2 = 'P';

processed_files Table

Tracks processed files for incremental builds:
ColumnTypeDescription
pathTEXT PRIMARY KEYFile path
mtimeREALModification timestamp
sizeINTEGERFile size in bytes

Methods

create_schema

Creates all tables:
def create_schema(
    self,
    in_transaction: bool = False,
    defer_indexes: bool = False,
) -> List[str]:
    """Create all tables in the database.

    Args:
        in_transaction: If True, don't start/commit own transaction
        defer_indexes: If True, skip index creation (call create_indexes later)

    Returns:
        List of verified table names
    """

create_indexes

Creates performance indexes:
def create_indexes(self) -> None:
    """Create all indexes for query optimization.

    Should be called AFTER bulk data insertion for best performance.
    """

ensure_inferred_pos_columns

Adds columns for inferred POS data:
def ensure_inferred_pos_columns(self) -> None:
    """Ensure the inferred_pos columns exist in the words table.

    Adds the columns if they don't exist (for database migration):
    - inferred_pos: POS tag from statistical inference
    - inferred_confidence: Confidence score (0.0-1.0)
    - inferred_source: Source of the inference
    """

Usage in Pipeline

Creating a New Database

from myspellchecker.data_pipeline.schema_manager import SchemaManager
import sqlite3

def create_dictionary(output_path: str):
    """Create a new dictionary database."""
    with sqlite3.connect(output_path) as conn:
        cursor = conn.cursor()
        manager = SchemaManager(conn, cursor)

        # Create schema (tables and indexes)
        manager.create_schema()

        # Insert data...

Using with DatabasePackager

The recommended approach is to use DatabasePackager which handles schema management automatically:
from myspellchecker.data_pipeline import DatabasePackager

packager = DatabasePackager(
    input_dir="frequencies/",
    database_path="dict.db",
)

packager.connect()
packager.create_schema()  # Uses SchemaManager internally
packager.load_syllables()
packager.load_words()
packager.close()

See Also