Added incremental generation of HTML output
Took a simple approach to speeding up rebuilds of the site, something I’ve been meaning to crack on with for a while. Ripped out the old output_html function and replaced it with one that natively supports incremental generation.
May tweak the state that gets hashed for each document if the cache doesn’t stay as warm as it should, but initial results show lots of cache hits, with the Generate Hypertext
build step on a typical rebuild cut from around 10 seconds to 300 milliseconds!
That’s more than half the total build time gone in most cases :)
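Under the hood, each page gets an entry in a .build_cache.json file sitting in the output directory. Per page, the record that gets written is roughly this shape (the uid, hash, and path values here are made up for illustration):

new_cache["some-page-uid"] = {
    "content_hash": "9b3f...",  # md5 hex digest of page, collection and site data
    "template_mtime": 1710000000.0,  # mtime of the Jinja template used
    "source_mtime": 1710000123.0,  # mtime of the source document
    "output_path": "public/some-page/index.html",
}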
Here’s the bulk of the code:
import json
import logging
from dataclasses import asdict
from hashlib import md5
from pathlib import Path
from typing import Dict, Any, TYPE_CHECKING

# Import types only for type hints, avoiding circular imports
if TYPE_CHECKING:
    from jinja2 import Environment
    from build import DocumentMetadata, AssetMetadata, SiteMetadata

logger = logging.getLogger(__name__)
def calculate_content_hash(
    page: "DocumentMetadata", collections: Dict, site: "SiteMetadata"
) -> str:
    """Calculate a hash representing all content that could affect this page's output."""
    # Convert page data to a plain dict so it can be serialised for hashing
    page_dict = asdict(page)

    # Drop volatile fields that change frequently but don't affect content,
    # so that backlink changes alone don't trigger a rebuild
    volatile_fields = ["links"]
    for field in volatile_fields:
        page_dict.pop(field, None)

    # Include relevant collection data (for pages that use collections)
    collection_data = ""
    if page.collection.get("include"):
        for include in page.collection["include"]:
            if include in collections:
                # Hash just the UIDs, titles and timestamps of collection items
                # to detect changes, limited to 20 items to avoid huge hashes
                collection_items = [
                    (item.uid, item.title, item.updated.isoformat())
                    for item in collections[include][:20]
                ]
                collection_data += str(collection_items)

    # Include site metadata that might affect rendering
    site_data = {
        "stylesheet_hash": site.stylesheet_hash,
        "name": site.name,
        "url": site.url,
        "baseurl": site.baseurl,
    }

    # Combine all data and hash
    combined_data = json.dumps(
        [page_dict, collection_data, site_data], sort_keys=True, default=str
    )
    return md5(combined_data.encode()).hexdigest()
def load_build_cache(output_dir: Path) -> Dict[str, Dict[str, Any]]:
    """Load the build cache from previous run."""
    cache_file = output_dir / ".build_cache.json"
    if cache_file.exists():
        try:
            with open(cache_file, "r") as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            logger.warning("Could not load build cache, rebuilding all files")
    return {}


def save_build_cache(cache: Dict[str, Dict[str, Any]], output_dir: Path):
    """Save the build cache for next run."""
    cache_file = output_dir / ".build_cache.json"
    try:
        with open(cache_file, "w") as f:
            json.dump(cache, f)
    except IOError as e:
        logger.warning(f"Could not save build cache: {e}")
def needs_rebuild(
    page: "DocumentMetadata",
    collections: Dict,
    site: "SiteMetadata",
    output_path: Path,
    cache: Dict[str, Dict[str, Any]],
    template_file: str,
    env: "Environment",
) -> bool:
    """Determine if a page needs to be rebuilt."""
    # Always rebuild if output doesn't exist
    if not output_path.exists():
        return True

    page_key = page.uid
    current_hash = calculate_content_hash(page, collections, site)

    if page_key not in cache:
        return True

    cached_data = cache[page_key]
    if cached_data.get("content_hash") != current_hash:
        return True

    # Check template modification time
    try:
        template_path = env.loader.get_source(env, template_file)[1]
        if template_path:
            template_mtime = Path(template_path).stat().st_mtime
            if cached_data.get("template_mtime", 0) < template_mtime:
                return True
    except Exception:
        # If we can't check template mtime, rebuild to be safe
        return True

    # Check source file modification time
    source_mtime = page.filepath.stat().st_mtime
    if cached_data.get("source_mtime", 0) < source_mtime:
        return True

    return False
def output_html_incremental(
    assets: Dict[str, "AssetMetadata"],
    documents: Dict[str, "DocumentMetadata"],
    collections: Dict[str, Any],
    site: "SiteMetadata",
    env: "Environment",
    output_dir: Path,
    build_page_collection_func,  # Function to build page collections
) -> None:
    logger.info("Generating Hypertext (incremental)")

    # Load build cache
    cache = load_build_cache(output_dir)
    new_cache = {}
    built_count = 0
    skipped_count = 0

    for key, page in documents.items():
        template_file = page.layout
        output_path = output_dir / page.slug / "index.html"

        # Check if rebuild is needed
        if not needs_rebuild(
            page, collections, site, output_path, cache, template_file, env
        ):
            # Copy cache entry to new cache
            new_cache[key] = cache[key]
            skipped_count += 1
            logger.debug(f" SKIP: {page.filepath}")
            continue

        # Rebuild the page
        template = env.get_template(template_file)
        collection = build_page_collection_func(page, collections)
        output = template.render(
            documents=documents,
            assets=assets,
            collections=collections,
            collection=collection,
            page=asdict(page),
            site=site,
        )

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Write the file
        with open(output_path, "w") as f:
            f.write(output)

        # Update cache
        current_hash = calculate_content_hash(page, collections, site)
        template_mtime = 0
        try:
            template_path = env.loader.get_source(env, template_file)[1]
            if template_path:
                template_mtime = Path(template_path).stat().st_mtime
        except Exception:
            pass

        new_cache[key] = {
            "content_hash": current_hash,
            "template_mtime": template_mtime,
            "source_mtime": page.filepath.stat().st_mtime,
            "output_path": str(output_path),
        }
        built_count += 1
        logger.debug(f" BUILD: {page.filepath} >> {output_path}")

    # Save updated cache
    save_build_cache(new_cache, output_dir)
    logger.info(f"Built: {built_count}, Skipped: {skipped_count}")
def clear_build_cache(output_dir: Path) -> None:
    """Clear the build cache to force a full rebuild."""
    cache_file = output_dir / ".build_cache.json"
    if cache_file.exists():
        cache_file.unlink()
        logger.info("Build cache cleared")