Skip to main content
Utility functions used internally by the data pipeline for pre-flight checks like disk space verification before building dictionaries.

Overview

from myspellchecker.utils.io_utils import check_disk_space

# Check if enough disk space before building database
check_disk_space("/path/to/output", required_mb=500)

Functions

check_disk_space

Verify that a disk partition has sufficient free space:
from myspellchecker.utils.io_utils import check_disk_space

def check_disk_space(
    path: Union[str, Path],
    required_mb: int = 500,
) -> None:
    """Check if disk partition has enough free space.

    Relative paths are converted to absolute paths. If ``path`` does not
    exist yet, the nearest existing parent directory is checked instead.

    Args:
        path: Path to check (directory or file)
        required_mb: Minimum required free space in MB (default: 500MB)

    Raises:
        InsufficientStorageError: If free space is less than required.

    Note:
        A FileNotFoundError or OSError encountered while checking is logged
        as a warning and is not raised, so the call returns normally.
    """

Usage

from pathlib import Path
from myspellchecker.utils.io_utils import check_disk_space
from myspellchecker.core.exceptions import InsufficientStorageError

# Basic check
try:
    check_disk_space("/tmp/output.db", required_mb=500)
    print("Sufficient disk space available")
except InsufficientStorageError as e:
    print(f"Not enough space: {e}")

# Check with Path object
output_path = Path("/data/dictionaries/mydict.db")
check_disk_space(output_path, required_mb=1000)

# Check directory
check_disk_space("/var/data", required_mb=2000)

Behavior

Path Resolution

The function handles various path scenarios:
# Absolute path
check_disk_space("/home/user/data/output.db")

# Relative path (converted to absolute)
check_disk_space("./output.db")

# Non-existent path (checks nearest existing parent)
check_disk_space("/data/new_dir/new_subdir/output.db")
# Checks /data/new_dir, then /data, then / until existing path found

Error Conditions

from myspellchecker.core.exceptions import InsufficientStorageError

try:
    check_disk_space("/tmp", required_mb=1000000)  # 1TB
except InsufficientStorageError as e:
    print(e)
    # "Insufficient disk space at /tmp. Available: 50000.00 MB, Required: 1000000 MB."

Silent Failures

Some errors are logged as warnings but do not raise exceptions, so the caller proceeds as if the check had passed:
# FileNotFoundError - logged as warning
check_disk_space("/nonexistent/mount/point")
# Warning: "Could not check disk space for /nonexistent/mount/point"

# OSError - logged as warning
check_disk_space("/inaccessible/path")
# Warning: "Error checking disk space: ..."

Integration with Pipeline

Pre-Build Check

from myspellchecker.data_pipeline import Pipeline
from myspellchecker.utils.io_utils import check_disk_space

class Pipeline:
    """Example pipeline that verifies free disk space before building."""

    def run(self, input_path: Path, output_path: Path):
        """Check disk space for the estimated output, then run the build.

        Args:
            input_path: Input whose size is used to estimate the output size.
            output_path: Destination path whose partition is checked.

        Raises:
            InsufficientStorageError: If ``check_disk_space`` finds less free
                space than required.
        """
        import math

        # Check disk space before starting
        # NOTE(review): estimated_size is presumably in MB; confirm against
        # _estimate_output_size.
        estimated_size = self._estimate_output_size(input_path)
        # required_mb is declared as an int, so round the padded estimate up
        # rather than passing a float.
        check_disk_space(output_path, required_mb=math.ceil(estimated_size * 1.5))

        # Proceed with build
        self._run_pipeline()

With Reporter

from myspellchecker.utils.io_utils import check_disk_space
from myspellchecker.core.exceptions import InsufficientStorageError

def build_dictionary(input_path, output_path, reporter):
    """Run the disk space pre-flight check and relay the result to ``reporter``.

    A passing check is announced through ``reporter.report_info``. A failing
    check is sent to ``reporter.report_error`` and the
    InsufficientStorageError is re-raised to the caller.
    """
    try:
        check_disk_space(output_path, required_mb=500)
        reporter.report_info("Disk space check passed")
    except InsufficientStorageError as storage_error:
        # Surface the message through the reporter, then let it propagate.
        reporter.report_error(str(storage_error))
        raise

InsufficientStorageError

Custom exception for disk space issues:
from myspellchecker.core.exceptions import InsufficientStorageError

class InsufficientStorageError(MyanmarSpellcheckError):
    """Signal that an operation cannot proceed for lack of free disk space."""

Handling the Exception

from myspellchecker.core.exceptions import InsufficientStorageError

try:
    check_disk_space(path, required_mb=500)
except InsufficientStorageError:
    # The three options below are alternatives: keep the one that fits your
    # application. As written they run one after another, and the re-checks
    # in options 1 and 2 can themselves raise InsufficientStorageError from
    # inside this handler.

    # Option 1: Clean up and retry
    cleanup_temp_files()
    check_disk_space(path, required_mb=500)

    # Option 2: Use alternative location
    check_disk_space("/tmp", required_mb=500)

    # Option 3: Inform user
    print("Please free up disk space and try again")

Disk Usage Information

The function uses shutil.disk_usage() internally:
import shutil

# Get disk usage details
total, used, free = shutil.disk_usage("/path")

print(f"Total: {total / (1024**3):.2f} GB")
print(f"Used: {used / (1024**3):.2f} GB")
print(f"Free: {free / (1024**3):.2f} GB")

Best Practices

1. Check Before Large Operations

def build_large_dictionary(corpus_path, output_path):
    """Check that ``output_path`` has room for a database built from the corpus.

    The requirement is three times the corpus size plus a 50% safety margin,
    converted to megabytes and rounded up.

    Args:
        corpus_path: Path object for the corpus file (``.stat()`` is called
            on it).
        output_path: Destination path passed to ``check_disk_space``.

    Raises:
        InsufficientStorageError: If ``check_disk_space`` finds less free
            space than required.
    """
    import math

    # Estimate required space
    corpus_size = corpus_path.stat().st_size
    estimated_db_size = corpus_size * 3  # Database typically 3x corpus

    # Add safety margin
    required_mb = (estimated_db_size / (1024 * 1024)) * 1.5

    # Round up: truncating would understate the requirement and turn it into
    # 0 MB for small corpora.
    check_disk_space(output_path, required_mb=math.ceil(required_mb))

2. Check Multiple Locations

def setup_workspace(output_dir, temp_dir):
    """Verify free space at the output location, then at the temp location."""
    # (location, minimum free space in MB), checked in this order
    space_requirements = (
        (output_dir, 500),
        (temp_dir, 200),
    )
    for location, minimum_mb in space_requirements:
        check_disk_space(location, required_mb=minimum_mb)

3. Graceful Degradation

def safe_build(input_path, output_path):
    """Build fully when disk space allows, otherwise fall back to a minimal build.

    Returns the result of ``build_full`` when the 500 MB check passes, or of
    ``build_minimal`` (after logging a warning) when it raises
    InsufficientStorageError.
    """
    try:
        check_disk_space(output_path, required_mb=500)
    except InsufficientStorageError:
        logger.warning("Low disk space, using minimal build options")
        # Use smaller batch sizes, skip optional features
        return build_minimal(input_path, output_path)
    else:
        return build_full(input_path, output_path)

See Also