# sunday/data_loader.py

"""
Comic data loader for YAML-based comic management with caching.

This module scans the data/comics/ directory for .yaml files,
loads each comic's configuration, and builds the COMICS list.
Caching is used to speed up subsequent loads.
"""

import os
import pickle
import yaml
from pathlib import Path


# Fields every comic YAML document must define, checked in this order.
_REQUIRED_COMIC_FIELDS = ('number', 'filename', 'date', 'alt_text')


def _fingerprint_sources(yaml_files):
    """Return ``{file name: (mtime_ns, size)}`` for the given YAML paths."""
    fingerprint = {}
    for path in yaml_files:
        info = path.stat()
        fingerprint[path.name] = (info.st_mtime_ns, info.st_size)
    return fingerprint


def _read_cached_comics(cache_file, fingerprint):
    """Return the cached comic list, or None if the cache is unreadable or stale."""
    try:
        with open(cache_file, 'rb') as f:
            # NOTE(security): pickle.load can execute arbitrary code. This is
            # only acceptable while the cache file is written exclusively by
            # this module; never point comics_dir at an untrusted directory.
            payload = pickle.load(f)
    except Exception as e:
        print(f"Warning: Failed to load cache: {e}")
        return None

    # Caches written in the older bare-list format carry no fingerprint and
    # are therefore treated as stale, as is any cache whose set of source
    # files (names, mtimes, sizes) differs from what is on disk now.
    if not isinstance(payload, dict) or payload.get('sources') != fingerprint:
        return None

    comics = payload.get('comics')
    return comics if isinstance(comics, list) else None


def _load_comic_file(yaml_file):
    """Parse one comic YAML file; return its dict, or None (after a warning) if unusable."""
    try:
        with open(yaml_file, 'r', encoding='utf-8') as f:
            comic_data = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f"Error parsing '{yaml_file.name}': {e}")
        return None
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the whole load.
        print(f"Error loading '{yaml_file.name}': {e}")
        return None

    if comic_data is None:
        print(f"Warning: '{yaml_file.name}' is empty, skipping")
        return None

    # A top-level list or scalar would otherwise slip through the membership
    # checks below (or fail them with a confusing TypeError).
    if not isinstance(comic_data, dict):
        print(f"Warning: '{yaml_file.name}' is not a YAML mapping, skipping")
        return None

    for field in _REQUIRED_COMIC_FIELDS:
        if field not in comic_data:
            print(f"Warning: '{yaml_file.name}' missing required '{field}' field, skipping")
            return None

    # The result is sorted by 'number'; a non-numeric value mixed with numeric
    # ones would make the sort raise TypeError and lose every comic.
    number = comic_data['number']
    if isinstance(number, bool) or not isinstance(number, (int, float)):
        print(f"Warning: '{yaml_file.name}' has non-numeric 'number' {number!r}, skipping")
        return None

    return comic_data


def load_comics_from_yaml(comics_dir='data/comics', use_cache=True):
    """
    Load all comic data from YAML files with optional caching.

    Every ``*.yaml`` / ``*.yml`` file in ``comics_dir`` (except files whose
    stem is TEMPLATE or README, case-insensitively) is parsed with
    ``yaml.safe_load``. Files that are empty, are not a mapping, lack one of
    the required fields (number, filename, date, alt_text) or have a
    non-numeric ``number`` are skipped with a printed warning.

    The cache (``.comics_cache.pkl`` inside ``comics_dir``) stores the comic
    list together with a fingerprint of the source files (name, mtime, size).
    It is reused only when that fingerprint matches the files currently on
    disk, so edits, additions, deletions and renames all invalidate it.

    Setting the environment variable ``DISABLE_COMIC_CACHE`` to ``true``
    disables caching regardless of ``use_cache``.

    Args:
        comics_dir: Path to directory containing comic YAML files
        use_cache: Whether to use cache (set to False to force reload)

    Returns:
        List of comic dictionaries, sorted by comic number. An empty list is
        returned when the directory is missing (it is created) or contains
        no comic YAML files.
    """
    comics_path = Path(comics_dir)

    if not comics_path.exists():
        print(f"Warning: Comics directory '{comics_dir}' does not exist. Creating it...")
        comics_path.mkdir(parents=True, exist_ok=True)
        return []

    # Cache file location
    cache_file = comics_path / '.comics_cache.pkl'

    # Check if caching is disabled via environment variable
    if os.getenv('DISABLE_COMIC_CACHE') == 'true':
        use_cache = False

    # Find all .yaml and .yml files, leaving out template and README files
    yaml_files = [
        f
        for pattern in ('*.yaml', '*.yml')
        for f in comics_path.glob(pattern)
        if f.stem.upper() not in ('TEMPLATE', 'README')
    ]

    if not yaml_files:
        print(f"Warning: No YAML files found in '{comics_dir}'")
        return []

    fingerprint = None
    if use_cache:
        try:
            # Taken before parsing: a file modified mid-load then yields a
            # fingerprint mismatch (and a reload) on the next call.
            fingerprint = _fingerprint_sources(yaml_files)
        except OSError as e:
            print(f"Warning: Could not inspect YAML files for caching: {e}")
            use_cache = False

    if use_cache and cache_file.exists():
        comics = _read_cached_comics(cache_file, fingerprint)
        if comics is not None:
            print(f"Loaded {len(comics)} comics from cache")
            return comics

    # Load from YAML files (cache miss, stale cache, or caching disabled)
    print(f"Loading {len(yaml_files)} comic files from YAML...")
    comics = []
    for yaml_file in yaml_files:
        comic_data = _load_comic_file(yaml_file)
        if comic_data is not None:
            comics.append(comic_data)

    # Sort by comic number
    comics.sort(key=lambda c: c['number'])

    # Save to cache
    if use_cache:
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump({'sources': fingerprint, 'comics': comics}, f)
            print(f"Saved {len(comics)} comics to cache")
        except Exception as e:
            print(f"Warning: Failed to save cache: {e}")

    return comics


def clear_cache(comics_dir='data/comics'):
    """
    Clear the comics cache file.

    Removes ``.comics_cache.pkl`` from ``comics_dir`` and prints whether a
    cache file was found.

    Args:
        comics_dir: Path to directory containing comic YAML files

    Returns:
        True if a cache file was removed, False if there was none
    """
    cache_file = Path(comics_dir) / '.comics_cache.pkl'
    try:
        # Unlink directly instead of checking exists() first, so a cache file
        # removed by someone else in between cannot raise FileNotFoundError.
        cache_file.unlink()
    except (FileNotFoundError, NotADirectoryError):
        print("No cache file found")
        return False

    print("Cache cleared")
    return True


def validate_comics(comics):
    """
    Validate the loaded comics for common issues.

    Duplicate comic numbers fail validation. Gaps in the numbering are only
    reported as informational messages and do not fail validation. The gap
    check works on the sorted numbers, so the input order does not matter.

    Args:
        comics: List of comic dictionaries, each with a numeric 'number' key

    Returns:
        True if validation passes (including for an empty list), False if
        duplicate comic numbers are found
    """
    if not comics:
        return True

    numbers = [c['number'] for c in comics]

    # Check for duplicate comic numbers in a single pass
    seen = set()
    duplicates = set()
    for number in numbers:
        if number in seen:
            duplicates.add(number)
        seen.add(number)

    if duplicates:
        print(f"Warning: Duplicate comic numbers found: {duplicates}")
        return False

    # Check for gaps in comic numbering (optional warning). Sorting first
    # keeps an unsorted input from producing bogus gap reports.
    ordered = sorted(numbers)
    for previous, current in zip(ordered, ordered[1:]):
        if current - previous > 1:
            print(f"Info: Gap in comic numbering between {previous} and {current}")

    return True


if __name__ == '__main__':
    # Manual smoke test: load the default comics directory, validate the
    # result, and list each comic when validation succeeds.
    print("Loading comics from data/comics/...")
    loaded = load_comics_from_yaml()
    print(f"Loaded {len(loaded)} comics")

    if not validate_comics(loaded):
        print("Validation failed!")
    else:
        print("Validation passed!")
        for entry in loaded:
            # Comics without a title are labelled by their number instead.
            label = entry.get('title', f"#{entry['number']}")
            print(f"  - Comic {entry['number']}: {label} ({entry['date']})")