Skip to main content
The Schema Manager provides centralized database schema definitions and management for mySpellChecker’s SQLite storage. It handles table creation, index optimization, and schema migrations.

Overview

from myspellchecker.data_pipeline.schema_manager import SchemaManager
import sqlite3

# SchemaManager requires connection and cursor
with sqlite3.connect("mydict.db") as conn:
    cursor = conn.cursor()
    manager = SchemaManager(conn, cursor)

    # Create all tables and indexes
    manager.create_schema()

SchemaManager Class

Manages database schema with predefined tables and indexes:
class SchemaManager:
    """Manages database schema for spell checker dictionaries.

    Provides centralized definitions for all tables and indexes,
    ensuring consistency across dictionary builds.

    Args:
        conn: SQLite database connection
        cursor: SQLite database cursor
        console: Optional PipelineConsole for output
    """

    # Table definitions
    TABLES = {
        "syllables": '''
            CREATE TABLE IF NOT EXISTS syllables (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                syllable TEXT UNIQUE NOT NULL,
                frequency INTEGER DEFAULT 0
            )
        ''',

        "words": '''
            CREATE TABLE IF NOT EXISTS words (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word TEXT UNIQUE NOT NULL,
                syllable_count INTEGER,
                frequency INTEGER DEFAULT 0,
                pos_tag TEXT,
                is_curated INTEGER DEFAULT 0,
                inferred_pos TEXT,
                inferred_confidence REAL,
                inferred_source TEXT
            )
        ''',

        "bigrams": '''
            CREATE TABLE IF NOT EXISTS bigrams (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word1_id INTEGER,
                word2_id INTEGER,
                probability REAL DEFAULT 0.0,
                count INTEGER DEFAULT 0,
                FOREIGN KEY(word1_id) REFERENCES words(id),
                FOREIGN KEY(word2_id) REFERENCES words(id),
                UNIQUE(word1_id, word2_id)
            )
        ''',

        "trigrams": '''
            CREATE TABLE IF NOT EXISTS trigrams (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word1_id INTEGER,
                word2_id INTEGER,
                word3_id INTEGER,
                probability REAL DEFAULT 0.0,
                count INTEGER DEFAULT 0,
                FOREIGN KEY(word1_id) REFERENCES words(id),
                FOREIGN KEY(word2_id) REFERENCES words(id),
                FOREIGN KEY(word3_id) REFERENCES words(id),
                UNIQUE(word1_id, word2_id, word3_id)
            )
        ''',

        "pos_unigrams": '''
            CREATE TABLE IF NOT EXISTS pos_unigrams (
                pos TEXT UNIQUE NOT NULL,
                probability REAL DEFAULT 0.0
            )
        ''',

        "pos_bigrams": '''
            CREATE TABLE IF NOT EXISTS pos_bigrams (
                pos1 TEXT NOT NULL,
                pos2 TEXT NOT NULL,
                probability REAL DEFAULT 0.0,
                UNIQUE(pos1, pos2)
            )
        ''',

        "pos_trigrams": '''
            CREATE TABLE IF NOT EXISTS pos_trigrams (
                pos1 TEXT NOT NULL,
                pos2 TEXT NOT NULL,
                pos3 TEXT NOT NULL,
                probability REAL DEFAULT 0.0,
                UNIQUE(pos1, pos2, pos3)
            )
        ''',

        "processed_files": '''
            CREATE TABLE IF NOT EXISTS processed_files (
                path TEXT PRIMARY KEY,
                mtime REAL,
                size INTEGER
            )
        ''',
    }

    # Index definitions for query optimization
    INDEXES = {
        "idx_syllables_text": "CREATE INDEX IF NOT EXISTS idx_syllables_text ON syllables(syllable)",
        "idx_words_text": "CREATE INDEX IF NOT EXISTS idx_words_text ON words(word)",
        "idx_bigrams_w1_w2": "CREATE INDEX IF NOT EXISTS idx_bigrams_w1_w2 ON bigrams(word1_id, word2_id)",
        "idx_trigrams_w1_w2_w3": "CREATE INDEX IF NOT EXISTS idx_trigrams_w1_w2_w3 ON trigrams(word1_id, word2_id, word3_id)",
    }

Table Schemas

syllables Table

Stores validated Myanmar syllables:
ColumnTypeDescription
idINTEGER PRIMARY KEYAuto-incrementing ID
syllableTEXT UNIQUEThe syllable text
frequencyINTEGERCorpus frequency count
-- Example queries
SELECT * FROM syllables WHERE frequency > 100;
SELECT syllable FROM syllables ORDER BY frequency DESC LIMIT 10;

words Table

Stores dictionary words with POS information:
ColumnTypeDescription
idINTEGER PRIMARY KEYAuto-incrementing ID
wordTEXT UNIQUEThe word text
syllable_countINTEGERNumber of syllables
frequencyINTEGERCorpus frequency count
pos_tagTEXTPOS tag from corpus
is_curatedINTEGERCurated word flag (0/1)
inferred_posTEXTPOS tag from inference
inferred_confidenceREALInference confidence (0.0-1.0)
inferred_sourceTEXTSource of inference
-- Example queries
SELECT * FROM words WHERE pos_tag = 'N' ORDER BY frequency DESC;
SELECT word, inferred_pos FROM words WHERE inferred_confidence > 0.8;

bigrams Table

Stores word bigram probabilities using word IDs:
ColumnTypeDescription
idINTEGER PRIMARY KEYAuto-incrementing ID
word1_idINTEGERForeign key to first word
word2_idINTEGERForeign key to second word
probabilityREALP(word2 | word1)
countINTEGERRaw co-occurrence count
-- Example: Find common word pairs (requires JOIN)
SELECT w1.word, w2.word, b.probability
FROM bigrams b
JOIN words w1 ON b.word1_id = w1.id
JOIN words w2 ON b.word2_id = w2.id
WHERE w1.word = 'မြန်မာ'
ORDER BY b.probability DESC;

trigrams Table

Stores word trigram probabilities using word IDs:
ColumnTypeDescription
idINTEGER PRIMARY KEYAuto-incrementing ID
word1_idINTEGERForeign key to first word
word2_idINTEGERForeign key to second word
word3_idINTEGERForeign key to third word
probabilityREALP(word3 | word1, word2)
countINTEGERRaw co-occurrence count

pos_* Tables

POS tag probability tables for statistical tagging:
-- POS unigrams: Tag probabilities
SELECT pos, probability FROM pos_unigrams ORDER BY probability DESC;

-- POS bigrams: Tag transition probabilities
SELECT pos1, pos2, probability FROM pos_bigrams WHERE pos1 = 'N';

-- POS trigrams: Tag sequence probabilities
SELECT * FROM pos_trigrams WHERE pos1 = 'N' AND pos2 = 'P';

processed_files Table

Tracks processed files for incremental builds:
ColumnTypeDescription
pathTEXT PRIMARY KEYFile path
mtimeREALModification timestamp
sizeINTEGERFile size in bytes

Methods

create_schema

Creates all tables:
def create_schema(self, in_transaction: bool = False) -> List[str]:
    """Create all tables in the database.

    Args:
        in_transaction: If True, don't start/commit own transaction

    Returns:
        List of verified table names
    """

create_indexes

Creates performance indexes:
def create_indexes(self) -> None:
    """Create all indexes for query optimization.

    Should be called AFTER bulk data insertion for best performance.
    """

ensure_inferred_pos_columns

Adds columns for inferred POS data:
def ensure_inferred_pos_columns(self) -> None:
    """Ensure the inferred_pos columns exist in the words table.

    Adds the columns if they don't exist (for database migration):
    - inferred_pos: POS tag from statistical inference
    - inferred_confidence: Confidence score (0.0-1.0)
    - inferred_source: Source of the inference
    """

Usage in Pipeline

Creating a New Database

from myspellchecker.data_pipeline.schema_manager import SchemaManager
import sqlite3

def create_dictionary(output_path: str):
    """Create a new dictionary database."""
    with sqlite3.connect(output_path) as conn:
        cursor = conn.cursor()
        manager = SchemaManager(conn, cursor)

        # Create schema (tables and indexes)
        manager.create_schema()

        # Insert data...

Using with DatabasePackager

The recommended approach is to use DatabasePackager which handles schema management automatically:
from myspellchecker.data_pipeline import DatabasePackager

packager = DatabasePackager(
    input_dir="frequencies/",
    database_path="dict.db",
)

packager.connect()
packager.create_schema()  # Uses SchemaManager internally
packager.load_syllables()
packager.load_words()
packager.close()

See Also