Add dataclasses to ratchet up the strictness

In continuing to ratchet up the strictness of the build system, I landed on dataclasses, which are new to this novice programmer, though the idea of a schema for validation is not.

Anyway, I settled on creating one dataclass for Documents and another for Assets.

# LinksDict (a TypedDict) and TEMPLATE_DEFAULT are defined elsewhere in the build script.
@dataclass
class DocumentMetadata:
    filepath: Path
    uid: str
    slug: str
    title: str
    primary: str
    secondary: str
    available: datetime.datetime
    created: datetime.datetime
    updated: datetime.datetime
    creator: str = ""
    note: str = ""
    favourite: bool = False
    parent: str = ""
    description: str = ""
    layout: str = TEMPLATE_DEFAULT
    source: Dict = field(default_factory=dict)
    via: Dict = field(default_factory=dict)
    location: Dict[str, Any] = field(
        default_factory=lambda: {
            "continent": "",
            "country": "",
            "region": "",
            "city": "",
            "note": "",
            "lat": int,
            "lng": int,
        }
    )
    collection: Dict[str, Any] = field(
        default_factory=lambda: {
            "style": "title",
            "order": "chronological",
            "include": [],
        }
    )
    attribution: Dict[str, str] = field(
        default_factory=lambda: {
            "plain": "",
            "djot": "",
            "html": "",
        }
    )
    media: str = "application/toml"
    words: Dict[str, Any] = field(
        default_factory=lambda: {
            "self": 0,
            "code": {
                "lines": 0,
                "words": 0,
            },
            "references": 0,
        }
    )
    status: str = ""
    links: LinksDict = field(
        default_factory=lambda: {
            "internal": list(),
            "external": list(),
            "backlinks": list(),
        }
    )
    options: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)
    content: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        # Validate links dictionary structure
        required_link_types = {"internal", "external", "backlinks"}
        if (
            not isinstance(self.links, dict)
            or set(self.links.keys()) != required_link_types
        ):
            raise ValueError(
                f"links must be a dictionary with exactly these keys: {required_link_types}"
            )
        # Coerce each link list to a set so later stages can add entries without duplicates
        for key in self.links:
            if not isinstance(self.links[key], set):
                self.links[key] = set(self.links[key])


@dataclass
class AssetMetadata:
    filepath: Path
    media: str
    uid: str
    slug: str
    title: str
    available: datetime.datetime
    created: datetime.datetime
    updated: datetime.datetime
    creator: str = ""
    note: str = ""
    favourite: bool = False
    source: Dict = field(default_factory=dict)
    via: Dict = field(default_factory=dict)
    hash: str = ""
    output_width: int = 0
    output_height: int = 0
    location: Dict[str, Any] = field(
        default_factory=lambda: {
            "continent": "",
            "country": "",
            "region": "",
            "city": "",
            "note": "",
            "lat": int,
            "lng": int,
        }
    )
    attribution: Dict[str, str] = field(
        default_factory=lambda: {
            "plain": "",
            "djot": "",
            "html": "",
        }
    )
    words: Dict[str, Any] = field(
        default_factory=lambda: {
            "self": 0,
            "code": {
                "lines": 0,
                "words": 0,
            },
            "references": 0,
        }
    )
    links: LinksDict = field(
        default_factory=lambda: {
            "internal": list(),
            "external": list(),
            "backlinks": list(),
        }
    )
    tags: List[str] = field(default_factory=list)
    content: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        # Validate links dictionary structure
        required_link_types = {"internal", "external", "backlinks"}
        if (
            not isinstance(self.links, dict)
            or set(self.links.keys()) != required_link_types
        ):
            raise ValueError(
                f"links must be a dictionary with exactly these keys: {required_link_types}"
            )
        # Coerce each link list to a set so later stages can add entries without duplicates
        for key in self.links:
            if not isinstance(self.links[key], set):
                self.links[key] = set(self.links[key])

This turned up a couple of oversights in the metadata of existing documents, and also encouraged me to restructure the final built metadata (see backlinks and wordcounting).
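
As a rough sketch of the kind of failure this now surfaces, here is an invented document with a malformed links table; the field names follow the dataclass above, but the values are made up.

import datetime
from pathlib import Path

try:
    DocumentMetadata(
        filepath=Path("notes/2024/example.md"),
        uid="00000000-0000-0000-0000-000000000000",
        slug="example",
        title="Example",
        primary="notes",
        secondary="miscellany",
        available=datetime.datetime(2024, 1, 1),
        created=datetime.datetime(2024, 1, 1),
        updated=datetime.datetime(2024, 1, 1),
        links={"internal": [], "external": []},  # "backlinks" key missing
    )
except ValueError as error:
    print(error)  # links must be a dictionary with exactly these keys: ...

Leaving out a required field such as title fails even earlier, with a TypeError from the dataclass machinery itself.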

Speaking of backlinks, I coupled this work with my ongoing effort to make the building of the site completely deterministic. The builds were already deterministic in every way that matters: the same inputs produce the same output. The one exception was that, because documents are built asynchronously, the order of backlink references could change between builds. This doesn't alter any functionality or correctness, but it does mean that the order of the backlinks in the hidden build summary of each page could change, which made diffing over the whole site to spot regressions noisier, as many of the differences were not really differences at all.
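
A diff of that kind can be as simple as comparing two output directories; a sketch, with the directory names invented here:

import filecmp

# Compare the previous and current build output trees and report
# every file that differs, recursing into subdirectories.
comparison = filecmp.dircmp("public_before", "public_after")
comparison.report_full_closure()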

Solving this turned out to be easy once I realised that a few pages kept seeing the order shift because they referenced pages that had the same publish (available) date, which is the key for the sort order. With that remedied, determinism is assured.
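
For the record, one general way to guarantee a stable order, sketched here against the documents mapping used in the build script, is to break ties on a unique secondary key such as the uid:

def stable_backlink_order(backlinks, documents):
    # Newest first; the uid tie-breaker means two documents that share an
    # `available` date always come out in the same order between builds.
    return sorted(
        backlinks,
        key=lambda uid: (documents[uid].available, uid),
        reverse=True,
    )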

For a glimpse at the surface area of the changes, click here to see a diff from implementing the first dataclass, though more changes have landed since.
 # Imports
 from collections import Counter
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, asdict
 from hashlib import md5
 from pathlib import Path
 from shutil import copyfile
 from subprocess import run
 from typing import List, Dict, Any, Tuple
+from typing_extensions import TypedDict
 import asyncio
 import datetime
 import logging
@@ -73,11 +74,11 @@ class SiteMetadata:
     words: Dict[str, int] = field(
         default_factory=lambda: {"drafts": 0, "references": 0, "self": 0}
     )
-    links: Dict[str, set] = field(
+    links: Dict[str, Any] = field(
         default_factory=lambda: {
-            "internal": set(),
+            "internal": list(),
+            "backlinks": list(),
             "external": set(),
-            "backlinks": set(),
         }
     )
     pagecount: int = 0
@@ -91,9 +92,15 @@ class SiteMetadata:
     slug_to_title_lookup: Dict[str, str] = field(default_factory=dict)


+class LinksDict(TypedDict):
+    internal: list[str]
+    external: list[str]
+    backlinks: list[str]
+
+
 @dataclass
 class DocumentMetadata:
-    filename: Path
+    filepath: Path
     uid: str
     slug: str
     created: datetime.datetime
@@ -102,20 +109,78 @@ class DocumentMetadata:
     title: str
     primary: str
     secondary: str
+    creator: str = ""
+    note: str = ""
+    favourite: bool = False
+    parent: str = ""
+    description: str = ""
+    layout: str = TEMPLATE_DEFAULT
+    source: Dict = field(default_factory=dict)
+    via: Dict = field(default_factory=dict)
+    location: Dict = field(default_factory=dict)
+    collection: Dict[str, Any] = field(
+        default_factory=lambda: {
+            "style": "title",
+            "order": "chronological",
+            "include": [],
+        }
+    )
+    attribution: Dict[str, str] = field(
+        default_factory=lambda: {
+            "plain": "",
+            "djot": "",
+            "html": "",
+        }
+    )
+    media: str = "application/toml"
     words: int = 0
-    links: Dict[str, set] = field(
+    status: str = ""
+    links: LinksDict = field(
         default_factory=lambda: {
-            "internal": set(),
-            "external": set(),
-            "backlinks": set(),
+            "internal": list(),
+            "external": list(),
+            "backlinks": list(),
         }
     )
     options: List[str] = field(default_factory=list)
     tags: List[str] = field(default_factory=list)
-    interlinks: List[str] = field(default_factory=list)
-    backlinks: List[str] = field(default_factory=list)
     content: Dict[str, str] = field(default_factory=dict)
-    html: str = ""
+
+    def __post_init__(self):
+        # Validate required string fields are not empty
+        # for field_name in ["uid", "slug", "title", "primary", "secondary"]:
+        #    value = getattr(self, field_name)
+        #    if not isinstance(value, str) or not value.strip():
+        #        raise ValueError(f"\n\n{self}\n\n{field_name} {value} must be a non-empty string"\n)
+
+        # Validate filepath is a Path object
+        if not isinstance(self.filepath, Path):
+            self.filepath = Path(self.filepath)
+
+        ## Validate datetime fields
+        # for field_name in ["created", "updated", "available"]:
+        #    value = getattr(self, field_name)
+        #    if not isinstance(value, datetime.datetime):
+        #        raise ValueError(f"{field_name} must be a datetime object")
+        #    # Ensure timezone is None
+        #    setattr(self, field_name, value.replace(tzinfo=None))
+
+        # Validate words is non-negative
+        if not isinstance(self.words, int) or self.words < 0:
+            raise ValueError("words must be a non-negative integer")
+
+        # Validate links dictionary structure
+        required_link_types = {"internal", "external", "backlinks"}
+        if (
+            not isinstance(self.links, dict)
+            or set(self.links.keys()) != required_link_types
+        ):
+            raise ValueError(
+                f"links must be a dictionary with exactly these keys: {required_link_types}"
+            )
+        for key in self.links:
+            if not isinstance(self.links[key], set):
+                self.links[key] = set(self.links[key])


 def init_site():
@@ -132,12 +197,47 @@ def init_site():
     )


-def load_assets():
+def preprocess_asset_metadata(
+    uid: str, asset_data: Dict[str, Any], manifest_path: Path
+) -> Dict[str, Any]:
+    """Preprocess asset metadata to ensure it meets DocumentMetadata requirements."""
+    processed = asset_data.copy()
+
+    # Handle dates
+    for date_field in ["created", "updated", "available"]:
+        if isinstance(processed.get(date_field), str):
+            processed[date_field] = _parse_date(processed[date_field])
+        elif isinstance(processed.get(date_field), datetime.datetime):
+            processed[date_field] = processed[date_field].replace(tzinfo=None)
+        else:
+            processed[date_field] = datetime.datetime.now()
+
+    # Set required fields with defaults if not present
+    processed.setdefault("uid", uid)
+    processed.setdefault("primary", "asset")
+    processed.setdefault("secondary", processed["media"])
+
+    return processed
+
+
+def load_assets() -> Dict[str, DocumentMetadata]:
+    """Load asset manifests and convert them to DocumentMetadata instances."""
     assets = {}
     asset_manifests = list(ASSET_DIR.glob("manifests/*.toml"))
+
     for manifest in asset_manifests:
         with open(manifest, "rb") as f:
-            assets.update(tomllib.load(f))
+            manifest_data = tomllib.load(f)
+
+        for uid, asset_data in manifest_data.items():
+            try:
+                processed_data = preprocess_asset_metadata(uid, asset_data, manifest)
+                processed_data["filepath"] = ASSET_DIR / processed_data["filepath"]
+                assets[uid] = DocumentMetadata(**processed_data)
+            except Exception as e:
+                print(f"Error processing asset {uid} from {manifest}: {str(e)}")
+                continue
+
     return assets


@@ -176,7 +276,7 @@ def get_files() -> List[Path]:
     return [f for f in NOTES_DIR.glob("**/*.md") if "available = " in f.read_text()]


-def extract_external_links(text: str) -> List:
+def extract_external_links(text: str, site) -> List:
     url_pattern = r"(https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^)\s]*)?)"
     matches = re.findall(url_pattern, text)

@@ -186,18 +286,21 @@ def extract_external_links(text: str) -> List:
         parsed_url = urlparse(url)
         if parsed_url.netloc.lower() != "silasjelley.com":
             external_links.add(url)
+            site.links["external"].add(url)

-    return list(external_links)
+    return sorted(external_links)


 async def process_document(
-    filename: Path, site: SiteMetadata
-) -> Tuple[str, Dict[str, Any]]:
-    with open(filename, "rb") as f:
+    filepath: Path, site: SiteMetadata
+) -> Tuple[str, DocumentMetadata]:
+    """Process a document file and return its UID and metadata."""
+
+    with open(filepath, "rb") as f:
         try:
             parsed_toml = tomllib.load(f)
         except:
-            print(filename)
+            print(filepath)
             import sys

             sys.exit(1)
@@ -205,52 +308,48 @@ async def process_document(
     # The UID is now the top-level table name
     uid = parsed_toml["uid"]

-    # Preprocess metadata (assuming this function exists and works with the new format)
-    document = preprocess_metadata(filename, parsed_toml)
+    # Process metadata into DocumentMetadata instance
+    document = preprocess_metadata(filepath, parsed_toml)

     # Calculate and update word counts
     try:
-        document["words"] = len(document["content"]["plain"].split())
+        document.words = len(document.content.get("plain", "").split())
     except:
-        document["words"] = 0
+        document.words = 0

-    if document.get("status", "") == "draft":
-        site.words["drafts"] += document["words"]
+    if document.status == "draft":
+        site.words["drafts"] += document.words
     else:
         try:
-            document["source"]["words"] = len(document["source"]["text"].split())
-            site.words["references"] += document["source"]["words"]
+            source_words = len(document.source.get("text", "").split())
+            site.words["references"] += source_words
         except KeyError:
             pass
         try:
-            site.words["self"] += document["words"]
+            site.words["self"] += document.words
         except:
             pass

     # Extract external links from the plain text content
     try:
-        plain_text = document.get("content", {}).get("plain", "") + " "
-        plain_text += document.get("source", {}).get("url", "") + " "
-        plain_text += document.get("via", {}).get("url", "") + " "
-
-        external_links = extract_external_links(plain_text)
+        plain_text = (
+            document.content.get("plain", "")
+            + " "
+            + document.source.get("url", "")
+            + " "
+            + document.via.get("url", "")
+        )

-        # Store the external links in document["links"]["external"]
-        document["links"] = {
-            "internal": set(),
-            "external": set(),
-            "backlinks": set(),
-        }
-        document["links"]["external"].update(external_links)
-        site.links["external"].update(external_links)
+        external_links = extract_external_links(plain_text, site)
+        document.links["external"] = external_links
     except KeyError:
-        print(f"KeyError while compiling external links from {document['filename']}")
+        print(f"KeyError while compiling external links from {document.filepath}")
         pass

     return uid, document


-async def ingest_documents(site: SiteMetadata) -> Dict[str, Dict[str, Any]]:
+async def ingest_documents(site: SiteMetadata) -> Dict[str, Any]:
     logger.info("Ingesting files")
     file_list = get_files()
     documents = {}
@@ -258,11 +357,9 @@ async def ingest_documents(site: SiteMetadata) -> Dict[str, Dict[str, Any]]:
     slug_to_uid_lookup = {}
     site_primaries = set()
     site_secondaries = set()
-    site_series = set()
-    tags = set()
     uuid_collision_lookup = []

-    tasks = [process_document(filename, site) for filename in file_list]
+    tasks = [process_document(filepath, site) for filepath in file_list]
     results = await asyncio.gather(*tasks)
     site.words["total"] = (
         site.words["drafts"] + site.words["references"] + site.words["self"]
@@ -270,13 +367,11 @@ async def ingest_documents(site: SiteMetadata) -> Dict[str, Dict[str, Any]]:

     for uid, doc in results:
         documents[uid] = doc
-        slug_to_title_lookup[doc["slug"]] = doc["title"]
-        slug_to_uid_lookup[doc["slug"]] = uid
-        site_primaries.add(doc["primary"])
-        site_secondaries.add(doc["secondary"])
-        if "series" in doc:
-            site_series.add(doc["series"])
-        tags.update(doc.get("tags") or [])
+        slug_to_title_lookup[doc.slug] = doc.title
+        slug_to_uid_lookup[doc.slug] = uid
+        site_primaries.add(doc.primary)
+        site_secondaries.add(doc.secondary)
+        site.tags += doc.tags
         uuid_collision_lookup.append(uid)

     site.slug_to_uid_lookup = slug_to_uid_lookup
@@ -284,7 +379,6 @@ async def ingest_documents(site: SiteMetadata) -> Dict[str, Dict[str, Any]]:
     check_uuid_collisions(uuid_collision_lookup)
     site.primaries = list(site_primaries)
     site.secondaries = list(site_secondaries)
-    site.tags = list(tags)
     site.pagecount = len(documents)

     logger.info(f"Ingested {site.pagecount} files")
@@ -373,14 +467,14 @@ def process_image_parallel(input_data: Tuple[Path, Path, int, str]) -> None:


 def process_assets(
-    assets: Dict[str, Dict[str, Any]], asset_dir: Path, output_dir: Path
+    assets: Dict[str, DocumentMetadata], asset_dir: Path, output_dir: Path
 ) -> None:
     logger.info("Processing assets")
     manifest_images = []

     for asset_identifier, asset_metadata in assets.items():
-        source_path = Path(asset_metadata["filepath"])
-        output_path = output_dir / asset_metadata["slug"]
+        source_path = Path(asset_metadata.filepath)
+        output_path = output_dir / asset_metadata.slug
         os.makedirs(output_path.parent, exist_ok=True)

         if not source_path.exists():
@@ -422,74 +516,52 @@ def _parse_date(date_str: str) -> datetime.datetime:
         return datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=None)


-def preprocess_metadata(filename: Path, metadata: Dict[str, Any]) -> Dict[str, Any]:
-    """Preprocesses metadata for a document, ensuring required fields exist and formatting data."""
-
-    metadata["filename"] = filename
-
-    # Validate required fields
-    required_fields = ["uid", "slug", "available", "created", "primary", "secondary"]
-    missing_fields = [field for field in required_fields if field not in metadata]
-    if missing_fields:
-        raise ValueError(
-            f"[ERROR] Document missing {', '.join(missing_fields)}\n  {filename}"
-        )
-
-    # Set default values
-    metadata.setdefault("updated", metadata["created"])
+def preprocess_metadata(filepath: Path, metadata: Dict[str, Any]) -> DocumentMetadata:
+    """Preprocesses metadata for a document and converts it to a DocumentMetadata instance."""
+    # Create a working copy to avoid modifying the input
+    processed = metadata.copy()

     # Parse date fields
     for date_field in ["created", "updated", "available"]:
-        if date_field in metadata:
-            if isinstance(metadata[date_field], str):
-                metadata[date_field] = _parse_date(metadata[date_field])
-            elif isinstance(metadata[date_field], datetime.datetime):
-                metadata[date_field] = metadata[date_field].replace(tzinfo=None)
-
-    # Process source information
-    if "source" in metadata:
-        if "via" in metadata:
-            metadata.update(
-                process_source_information(metadata["source"], metadata["via"])
-            )
-        else:
-            metadata.update(process_source_information(metadata["source"], {}))
+        if isinstance(processed.get(date_field), str):
+            processed[date_field] = _parse_date(processed[date_field])
+        elif isinstance(processed.get(date_field), datetime.datetime):
+            processed[date_field] = processed[date_field].replace(tzinfo=None)
+
+    # Set default updated time if not provided
+    processed.setdefault("updated", processed.get("created"))
+
+    # Process source information if present
+    if "source" in processed:
+        processed["attribution"] = process_source_information(
+            processed["source"], processed.get("via", {})
+        )
+    else:
+        processed["attribution"] = {}
+        processed["source"] = {}
+
+    if "via" not in processed:
+        processed["via"] = {}

     # Determine title
-    metadata["title"] = (
-        metadata.get("title")
-        or metadata.get("attribution", {}).get("plain")
-        or metadata["created"].strftime("%B %e, %Y %-I.%M%p")
+    processed["title"] = (
+        processed.get("title")
+        or processed.get("attribution", {}).get("plain")
+        or processed["created"].strftime("%B %e, %Y %-I.%M%p")
     )
-    # Ensure title and slug are strings
-    metadata["title"] = str(metadata["title"])
-    if metadata.get("status") == "draft":
-        metadata["slug"] = "drafts/" + metadata["uid"]
-        metadata["parent"] = "a26221ae-c742-4b73-8dc6-7f8807456a1b"
-    else:
-        metadata["slug"] = str(metadata["slug"])

-    # Initialize interlinks, backlinks, and tags
-    metadata["interlinks"] = set()
-    metadata["backlinks"] = set()
-    metadata["tags"] = metadata.get("tags") or []
+    # Handle draft status
+    if processed.get("status") == "draft":
+        processed["slug"] = f"drafts/{processed['uid']}"

-    # Strip whitespace from plain content
-    try:
-        metadata["content"]["plain"] = metadata["content"]["plain"].strip()
-    except KeyError:
-        pass
-    try:
-        metadata["source"]["text"] = metadata["source"]["text"].strip()
-    except KeyError:
-        pass
+    # Add filepath as it's required but comes from function parameter
+    processed["filepath"] = filepath

-    return metadata
+    # Create and return DocumentMetadata instance
+    return DocumentMetadata(**processed)


-def process_source_information(
-    source: Dict[str, Any], via
-) -> Dict[str, Dict[str, str]]:
+def process_source_information(source: Dict[str, Any], via) -> Dict[str, str]:
     creator = source.get("creator") or source.get("director")
     title = source.get("title") or (
         " ♫ " + str(source.get("track"))
@@ -587,13 +659,11 @@ def process_source_information(
             partsvia = f" ([via]({escape_url(via["url"])}))"

     return {
-        "attribution": {
-            "plain": f"{speaker}{', '.join(partsplain + partsshared)}",
-            "djot": f"{speaker}{', '.join(partsdjot + partsshared)}",
-            "html": format_rich_attribution(
-                " — " + f"{speaker}{', '.join(partshtml + partsshared) + partsvia}"
-            ),
-        }
+        "plain": f"{speaker}{', '.join(partsplain + partsshared)}",
+        "djot": f"{speaker}{', '.join(partsdjot + partsshared)}",
+        "html": format_rich_attribution(
+            " — " + f"{speaker}{', '.join(partshtml + partsshared) + partsvia}"
+        ),
     }


@@ -619,8 +689,8 @@ def check_uuid_collisions(uuid_list):


 def insert_substitutions(
-    documents: Dict[str, Dict[str, Any]],
-    assets: Dict[str, Dict[str, Any]],
+    documents: Dict[str, DocumentMetadata],
+    assets: Dict[str, DocumentMetadata],
     site: SiteMetadata,
 ) -> None:
     logger.info("Performing substitutions")
@@ -635,53 +705,56 @@ def insert_substitutions(
     merged_data = {**documents, **assets}

     for key, page in documents.items():
-        logger.debug(f"  {key}, {page['title'][:40]}")
+        logger.debug(f"  {key}, {page.title[:40]}")

-        text = page.get("content", {}).get("plain")
+        text = page.content.get("plain")
         if text:
             text = replace_import_references(text, REF_IMPT_RE, merged_data, key, page)
             text = replace_cite_references(text, REF_CITE_RE, merged_data)
             text = replace_title_references(text, REF_TITLE_RE, merged_data)
             text = replace_slug_references(text, REF_SLUG_RE, merged_data)
             text = process_reference_links(text, REF_LINK_RE, merged_data, key)
-            page["content"]["plain"] = text.strip()
+            page.content["plain"] = text.strip()


 def replace_slug_references(
-    text: str, regex: re.Pattern, merged_data: Dict[str, Dict[str, Any]]
+    text: str, regex: re.Pattern, merged_data: Dict[str, DocumentMetadata]
 ) -> str:
     for match in regex.finditer(text):
         ref_type, ref_short_id = match.groups()
         full_match = match.group(0)
         ref_id = next((k for k in merged_data if k.startswith(ref_short_id)), None)
         if ref_id:
-            replacement = f"/{merged_data[ref_id]['slug']}"
+            try:
+                replacement = f"/{merged_data[ref_id].slug}"
+            except AttributeError:
+                replacement = f"/{merged_data[ref_id].slug}"
             text = text.replace(full_match, replacement)
     return text


 def replace_title_references(
-    text: str, regex: re.Pattern, merged_data: Dict[str, Dict[str, Any]]
+    text: str, regex: re.Pattern, merged_data: Dict[str, DocumentMetadata]
 ) -> str:
     for match in regex.finditer(text):
         opening, ref_type, ref_short_id, comment, closing = match.groups()
         full_match = match.group(0)
         ref_id = next((k for k in merged_data if k.startswith(ref_short_id)), None)
         if ref_id:
-            replacement = merged_data[ref_id]["title"]
+            replacement = merged_data[ref_id].title
             text = text.replace(full_match, replacement)
     return text


 def replace_cite_references(
-    text: str, regex: re.Pattern, merged_data: Dict[str, Dict[str, Any]]
+    text: str, regex: re.Pattern, merged_data: Dict[str, DocumentMetadata]
 ) -> str:
     for match in regex.finditer(text):
         opening, ref_type, ref_short_id, comment, closing = match.groups()
         full_match = match.group(0)
         ref_id = next((k for k in merged_data if k.startswith(ref_short_id)), None)
         if ref_id:
-            replacement = f"[{merged_data[ref_id]["attribution"]["djot"]}] (/{merged_data[ref_id]["slug"]})"
+            replacement = f"[{merged_data[ref_id].attribution["djot"]}] (/{merged_data[ref_id].slug})"
             text = text.replace(full_match, replacement)
     return text

@@ -689,29 +762,32 @@ def replace_cite_references(
 def replace_import_references(
     text: str,
     regex: re.Pattern,
-    merged_data: Dict[str, Dict[str, Any]],
+    merged_data: Dict[str, DocumentMetadata],
     key: str,
-    page: Dict,
+    page: DocumentMetadata,
 ) -> str:
     for match in regex.finditer(text):
         opening, ref_type, ref_short_id, comment, closing = match.groups()
         full_match = match.group(0)
         ref_id = next((k for k in merged_data if k.startswith(ref_short_id)), None)
         if ref_id:
-            ref_text = merged_data[ref_id]["content"]["plain"]
+            ref_text = merged_data[ref_id].content["plain"]
             if ref_type == "import::":
                 replacement = ref_text
             elif ref_type == "aside::":
-                ref_title = merged_data[ref_id]["title"]
-                ref_slug = merged_data[ref_id]["slug"]
-                ref_location = merged_data[ref_id].get("location", "")
-                location_string = (
-                    " ⚕ "
-                    + ref_location.get("city")
-                    + ", "
-                    + ref_location.get("country")
-                    or ""
-                )
+                ref_title = merged_data[ref_id].title
+                ref_slug = merged_data[ref_id].slug
+                ref_location = merged_data[ref_id].location
+                if ref_location:
+                    location_string = (
+                        " ⚕ "
+                        + ref_location.get("city")
+                        + ", "
+                        + ref_location.get("country")
+                        or ""
+                    )
+                else:
+                    location_string = ""
                 replacement = (
                     f'{{.aside}}\n{':'*78}\n'
                     f'{ref_text}\\\n'
@@ -722,18 +798,22 @@ def replace_import_references(
                 )
             else:
                 raise ValueError(f"Unrecognised reference type: {ref_type}")
-            if not page.get("status", "") == "draft":
-                merged_data[ref_id]["backlinks"].add(key)
+            if not page.status == "draft":
+                merged_data[ref_id].links["backlinks"].add(key)
             text = text.replace(full_match, replacement)
     return text


 def process_reference_links(
-    text: str, regex: re.Pattern, merged_data: Dict[str, Dict[str, Any]], key: str
+    text: str, regex: re.Pattern, merged_data: Dict[str, DocumentMetadata], key: str
 ) -> str:
     for ref_text_match, _, ref_type, ref_short_id in regex.findall(text):
         match = f"[{ref_text_match}]({ref_type}{ref_short_id})"
-        ref_id = next((k for k in merged_data if k.startswith(ref_short_id)), None)
+        ref_id = next(
+            (k for k in merged_data.keys() if k.startswith(ref_short_id)), None
+        )
+        if ref_id is None:
+            print(f"No match found for {ref_short_id}")

         if not ref_id:
             raise ValueError(
@@ -746,16 +826,16 @@ def process_reference_links(
             )

         ref_text = get_reference_text(ref_text_match, merged_data[ref_id])
-        ref_slug = f"/{merged_data[ref_id]['slug']}"
+        ref_slug = f"/{merged_data[ref_id].slug}"

         if ref_type == "link::":
             try:
                 # Double quotes within a page title are escaped so that they don't break the HTML 'title' element
-                ref_title = f"{merged_data[ref_id]['title']} | {merged_data[ref_id]['created'].strftime('%B %Y')}".replace(
+                ref_title = f"{merged_data[ref_id].title} | {merged_data[ref_id].created.strftime('%B %Y')}".replace(
                     '"', '\\"'
                 )
             except KeyError:
-                ref_title = merged_data[ref_id]["title"].replace('"', '\\"')
+                ref_title = merged_data[ref_id].title.replace('"', '\\"')
             replacement = f'[{ref_text}]({ref_slug}){{title="{ref_title}"}}'
         elif ref_type == "img::":
             replacement = f"[{ref_text}]({ref_slug})"
@@ -778,18 +858,18 @@ def process_reference_links(
     return text


-def get_reference_text(ref_text_match: str, ref_data: Dict) -> str:
+def get_reference_text(ref_text_match: str, ref_data) -> str:
     if ref_text_match.startswith("::") or ref_text_match == "":
-        return ref_data.get("title")
+        return ref_data.title
     return ref_text_match


-def create_quote_replacement(ref_data: Dict, ref_slug: str) -> str:
-    ref_src = ref_data["attribution"]["djot"]
+def create_quote_replacement(ref_data: DocumentMetadata, ref_slug: str) -> str:
+    ref_src = ref_data.attribution["djot"]
     try:
-        ref_text = ref_data["source"]["text"].replace("\n\n", "\n> \n> ").strip()
+        ref_text = ref_data.source["text"].replace("\n\n", "\n> \n> ").strip()
     except:
-        print(f"Error creating quote replacement: {ref_data["uid"]}")
+        print(f"Error creating quote replacement: {ref_data.uid}")
         import sys

         sys.exit()
@@ -808,10 +888,10 @@ Your browser does not support the video tag.
 def generate_html(documents):
     logger.info("Generating HTML")
     for key, page in documents.items():
-        if page.get("content", {}).get("plain"):
-            page["content"]["html"] = run_jotdown(page["content"]["plain"])
-        if page.get("source", {}).get("text"):
-            page["source"]["html"] = run_jotdown(page["source"]["text"])
+        if page.content.get("plain"):
+            page.content["html"] = run_jotdown(page.content["plain"])
+        if page.source.get("text"):
+            page.source["html"] = run_jotdown(page.source["text"])


 class LedgerLexer(RegexLexer):
@@ -841,7 +921,7 @@ class LedgerLexer(RegexLexer):
     }


-def highlight_code(code: str, language: str = None) -> str:
+def highlight_code(code: str, language: str) -> str:
     """
     Highlight code using Pygments with specified or guessed language.
     """
@@ -921,20 +1001,17 @@ def build_backlinks(documents, site):
     FOOTNOTE_LINK_URL_RE = re.compile(r"\[.+?\]:\s\/(.*)", re.DOTALL)
     interlink_count = 0
     for key, page in documents.items():
-        if (
-            "nobacklinks" in page.get("options", "")
-            or page.get("status", "") == "draft"
-        ):
+        if "nobacklinks" in page.options or page.status == "draft":
             continue

-        logger.debug(page["filename"])
+        logger.debug(page.filepath)

-        text = page.get("content", {}).get("plain")
+        text = page.content.get("plain")
         # Skip if no main content
         if not text:
             continue

-        interlinks = set(documents[key]["interlinks"])
+        interlinks = set(documents[key].links["internal"])

         combined_refs = INLINE_LINK_RE.findall(text) + FOOTNOTE_LINK_URL_RE.findall(
             text
@@ -950,11 +1027,28 @@ def build_backlinks(documents, site):
                     continue
                 logger.warning(f"\nKeyError in {page['title']} ({key}): {slug}")

-        documents[key]["interlinks"] = list(interlinks)
+        documents[key].links["internal"] = interlinks
         for interlink_key in interlinks:
-            documents[interlink_key]["backlinks"].add(key)
+            documents[interlink_key].links["backlinks"].add(key)

-    return interlink_count
+    for key, page in documents.items():
+        # Sort interlinks based on published dates
+        documents[key].links["internal"] = sorted(
+            documents[key].links["internal"],
+            key=lambda x: documents[x].available,
+            reverse=True,  # Most recent first
+        )
+        # Sort backlinks based on published dates
+        documents[key].links["backlinks"] = sorted(
+            documents[key].links["backlinks"],
+            key=lambda x: documents[x].available,
+            reverse=True,  # Most recent first
+        )
+
+    """
+    TODO: REMOVE SITE.BACKLINKS in favour a 'stats' or 'count' (templates will need updating
+    """
+    site.backlinks += interlink_count


 def should_ignore_slug(slug):
@@ -966,7 +1060,7 @@ def should_ignore_slug(slug):


 def build_collections(
-    documents: Dict[str, Dict[str, Any]], site: SiteMetadata
+    documents: Dict[str, DocumentMetadata], site: SiteMetadata
 ) -> Tuple[Dict[str, List[Dict[str, Any]]], List[Dict[str, Any]]]:
     collections = {
         primary: []
@@ -978,24 +1072,24 @@ def build_collections(
     sitemap = []

     for key, page in sorted(
-        documents.items(), key=lambda k_v: k_v[1]["available"], reverse=True
+        documents.items(), key=lambda k_v: k_v[1].available, reverse=True
     ):
-        if page.get("status", "") == "draft":
+        if page.status == "draft":
             collections["cd68b918-ac5f-4d6c-abb5-a55a0318846b"].append(page)
             continue
-        elif "nofeed" in page.get("options", []):
+        elif "nofeed" in page.options:
             sitemap.append(page)
             continue
         else:
             sitemap.append(page)
             collections["everything"].append(page)
-            collections[page["primary"]].append(page)
-            collections[page["secondary"]].append(page)
+            collections[page.primary].append(page)
+            collections[page.secondary].append(page)

-            for tag in page.get("tags", []):
+            for tag in page.tags:
                 collections[tag].append(page)

-            if page["secondary"] in [
+            if page.secondary in [
                 "essays",
                 "wandering",
                 "rambling",
@@ -1008,8 +1102,8 @@ def build_collections(


 def output_html(
-    assets: Dict[str, Dict[str, Any]],
-    documents: Dict[str, Dict[str, Any]],
+    assets: Dict[str, DocumentMetadata],
+    documents: Dict[str, DocumentMetadata],
     collections: Dict[str, List[Dict[str, Any]]],
     site: SiteMetadata,
     env: Environment,
@@ -1018,7 +1112,7 @@ def output_html(
     logger.info("Generating Hypertext")

     for key, page in documents.items():
-        template_file = page.get("layout", TEMPLATE_DEFAULT)
+        template_file = page.layout
         template = env.get_template(template_file)

         collection = build_page_collection(page, collections)
@@ -1028,28 +1122,29 @@ def output_html(
             assets=assets,
             collections=collections,
             collection=collection,
-            page=page,
+            page=asdict(page),
             site=site,
         )

-        output_path = output_dir / page["slug"] / "index.html"
+        output_path = output_dir / page.slug / "index.html"
         output_path.parent.mkdir(parents=True, exist_ok=True)

         with open(output_path, "w") as f:
             f.write(output)

-        logger.debug(f"  {page['filename']} >> {output_path}")
+        logger.debug(f"  {page.filepath} >> {output_path}")


 def build_page_collection(page, collections):
     try:
         collection = [
             item
-            for include in page["collection"]["include"]
+            for include in page.collection["include"]
             for item in collections[include]
         ]
-        return sorted(collection, key=lambda x: x["available"], reverse=True)
+        return sorted(collection, key=lambda x: x.available, reverse=True)
     except KeyError:
+        print(f"Failed collection for {page.filepath}")
         return []


@@ -1126,8 +1221,7 @@ async def main():
     generate_html(documents)

     # Build backlinks and collections
-    interlink_count = build_backlinks(documents, site)
-    site.backlinks += interlink_count
+    build_backlinks(documents, site)
     collections, sitemap = build_collections(documents, site)

     # Output HTML, feeds, and sitemap
@@ -1142,7 +1236,8 @@ async def main():
     logger.info("Build complete!")
     logger.info(f"Pages: {site.pagecount}")
     logger.info(f"Words: {site.words["total"]}")
-    logger.info(f"Interlinks: {interlink_count}")
+    logger.info(f"Internal links: {site.backlinks}")
+    logger.info(f"External links: {len(site.links["external"])}")


 if __name__ == "__main__":