Added incremental generation of HTML output

Took a simple approach to something I’ve been meaning to crack on with for a while: speeding up rebuilds of the site. Ripped out the old output_html function and replaced it with one that natively supports incremental generation.

May tweak the state that gets hashed for each document if the cache doesn’t stay as warm as it could, but initial results show lots of cache hits: the Generate Hypertext build step on a typical rebuild has been cut from around 10 seconds to 300 milliseconds!

That’s more than half the total build time gone in most cases :)

Here’s the bulk of the code:

import json
import logging
from dataclasses import asdict
from hashlib import md5
from pathlib import Path
from typing import Dict, Any, TYPE_CHECKING

# Import types only for type hints, avoiding circular imports
if TYPE_CHECKING:
    from jinja2 import Environment
    from build import DocumentMetadata, AssetMetadata, SiteMetadata

logger = logging.getLogger(__name__)


def calculate_content_hash(
    page: "DocumentMetadata", collections: Dict, site: "SiteMetadata"
) -> str:
    """Calculate a hash representing all content that could affect this page's output."""
    # Convert the page dataclass to a dict, then drop volatile fields
    page_dict = asdict(page)

    # Exclude fields that change frequently but don't affect the rendered content
    volatile_fields = ["links"]  # so backlink changes alone don't trigger a rebuild
    for field in volatile_fields:
        page_dict.pop(field, None)

    # Include relevant collection data (for pages that use collections)
    collection_data = ""
    if page.collection.get("include"):
        for include in page.collection["include"]:
            if include in collections:
                # Hash just the UIDs, titles, and update times of collection items
                # (limited to the first 20 to avoid huge hashes)
                collection_items = [
                    (item.uid, item.title, item.updated.isoformat())
                    for item in collections[include][:20]
                ]
                collection_data += str(collection_items)

    # Include site metadata that might affect rendering
    site_data = {
        "stylesheet_hash": site.stylesheet_hash,
        "name": site.name,
        "url": site.url,
        "baseurl": site.baseurl,
    }

    # Combine all data and hash
    combined_data = json.dumps(
        [page_dict, collection_data, site_data], sort_keys=True, default=str
    )
    return md5(combined_data.encode()).hexdigest()


def load_build_cache(output_dir: Path) -> Dict[str, Dict[str, Any]]:
    """Load the build cache from previous run."""
    cache_file = output_dir / ".build_cache.json"
    if cache_file.exists():
        try:
            with open(cache_file, "r") as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            logger.warning("Could not load build cache, rebuilding all files")
    return {}


def save_build_cache(cache: Dict[str, Dict[str, Any]], output_dir: Path):
    """Save the build cache for next run."""
    cache_file = output_dir / ".build_cache.json"
    try:
        with open(cache_file, "w") as f:
            json.dump(cache, f)
    except IOError as e:
        logger.warning(f"Could not save build cache: {e}")


def needs_rebuild(
    page: "DocumentMetadata",
    collections: Dict,
    site: "SiteMetadata",
    output_path: Path,
    cache: Dict[str, Dict[str, Any]],
    template_file: str,
    env: "Environment",
) -> bool:
    """Determine if a page needs to be rebuilt."""

    # Always rebuild if output doesn't exist
    if not output_path.exists():
        return True

    page_key = page.uid
    current_hash = calculate_content_hash(page, collections, site)

    if page_key not in cache:
        return True

    cached_data = cache[page_key]

    if cached_data.get("content_hash") != current_hash:
        return True

    # Check template modification time
    try:
        template_path = env.loader.get_source(env, template_file)[1]
        if template_path:
            template_mtime = Path(template_path).stat().st_mtime
            if cached_data.get("template_mtime", 0) < template_mtime:
                return True
    except Exception:
        # If we can't check template mtime, rebuild to be safe
        return True

    # Check source file modification time
    source_mtime = page.filepath.stat().st_mtime
    if cached_data.get("source_mtime", 0) < source_mtime:
        return True

    return False


def output_html_incremental(
    assets: Dict[str, "AssetMetadata"],
    documents: Dict[str, "DocumentMetadata"],
    collections: Dict[str, Any],
    site: "SiteMetadata",
    env: "Environment",
    output_dir: Path,
    build_page_collection_func,  # Function to build page collections
) -> None:
    logger.info("Generating Hypertext (incremental)")

    # Load build cache
    cache = load_build_cache(output_dir)
    new_cache = {}

    built_count = 0
    skipped_count = 0

    for key, page in documents.items():
        template_file = page.layout
        output_path = output_dir / page.slug / "index.html"

        # Check if rebuild is needed
        if not needs_rebuild(
            page, collections, site, output_path, cache, template_file, env
        ):
            # Copy cache entry to new cache
            new_cache[key] = cache[key]
            skipped_count += 1
            logger.debug(f"  SKIP: {page.filepath}")
            continue

        # Rebuild the page
        template = env.get_template(template_file)
        collection = build_page_collection_func(page, collections)

        output = template.render(
            documents=documents,
            assets=assets,
            collections=collections,
            collection=collection,
            page=asdict(page),
            site=site,
        )

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Write the file
        with open(output_path, "w") as f:
            f.write(output)

        # Update cache
        current_hash = calculate_content_hash(page, collections, site)
        template_mtime = 0
        try:
            template_path = env.loader.get_source(env, template_file)[1]
            if template_path:
                template_mtime = Path(template_path).stat().st_mtime
        except Exception:
            pass

        new_cache[key] = {
            "content_hash": current_hash,
            "template_mtime": template_mtime,
            "source_mtime": page.filepath.stat().st_mtime,
            "output_path": str(output_path),
        }

        built_count += 1
        logger.debug(f"  BUILD: {page.filepath} >> {output_path}")

    # Save updated cache
    save_build_cache(new_cache, output_dir)

    logger.info(f"Built: {built_count}, Skipped: {skipped_count}")


def clear_build_cache(output_dir: Path) -> None:
    """Clear the build cache to force a full rebuild."""
    cache_file = output_dir / ".build_cache.json"
    if cache_file.exists():
        cache_file.unlink()
        logger.info("Build cache cleared")