#!/usr/bin/env python # creator: Silas Jelley # created: 2020-08-11 09:52:32 # updated: 2024-09-22 15:38:57 # version: 3.0 # /// script # requires-python = ">=3.12" # dependencies = [ # "Pillow ", # "filelock", # "jinja2", # "urllib3", # "pygments", # "pillow_heif", # "pillow_avif-plugin", # "typing-extensions", # "beautifulsoup4", # "csscompressor", # ] # /// # Imports from collections import Counter from dataclasses import dataclass, field from hashlib import md5 from pathlib import Path from shutil import copyfile from subprocess import run from typing import List, Dict, Any, Tuple, Set, Optional from typing_extensions import TypedDict import asyncio import datetime import logging import multiprocessing import os import random import re import tomllib from csscompressor import compress from bs4 import BeautifulSoup from filelock import FileLock from jinja2 import Environment, FileSystemLoader from PIL import Image, ImageOps from pillow_heif import register_heif_opener from urllib.parse import urlparse import pillow_avif import incremental_build from shortcodes import process_shortcodes from attribution import process_source_information from code_highlight import highlight_code register_heif_opener() # Load configuration with open("config.toml", "rb") as config_file: config = tomllib.load(config_file) # Constants ASSET_DIR = Path(config["paths"]["asset_dir"]) MANIFEST_DIR = Path(config["paths"]["manifest_dir"]) TEMPLATE_DIR = Path(config["paths"]["template_dir"]) TEMPLATE_FEED = config["templates"]["feed"] TEMPLATE_FEED_XSL = config["templates"]["feed_xsl"] TEMPLATE_SITEMAP = config["templates"]["sitemap"] TEMPLATE_DEFAULT = config["templates"]["default"] OUTPUT_DIR = Path(config["paths"]["output_dir"]) NOTES_DIR = Path(config["paths"]["notes_dir"]) LINK_REPORT_PATH = Path(config["paths"]["link_report_path"]) INIT_DIR = os.getcwd() # Set up logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) # Dataclasses @dataclass class SiteMetadata: name: str created: str url: str baseurl: str uid: str description: str creator: Dict[str, str] backlinks: int = 0 words: Dict[str, Any] = field( default_factory=lambda: { "self": 0, "drafts": 0, "code": { "lines": 0, "words": 0, }, "references": 0, } ) links: Dict[str, Any] = field( default_factory=lambda: { "internal": list(), "backlinks": list(), "external": set(), } ) pagecount: int = 0 references: int = 0 categories: Set = field(default_factory=set) secondaries: Set = field(default_factory=set) tags: Set = field(default_factory=set) data: Dict[str, Any] = field(default_factory=dict) stylesheet_hash: str = "" slug_to_uid_lookup: Dict[str, str] = field(default_factory=dict) slug_to_title_lookup: Dict[str, str] = field(default_factory=dict) class LinksDict(TypedDict): internal: list[str] external: list[str] backlinks: list[str] @dataclass class DocumentMetadata: filepath: Path uid: str slug: str title: str category: str secondary: str available: datetime.datetime created: datetime.datetime updated: datetime.datetime creator: str = "" note: str = "" favourite: bool = False parent: str = "" description: str = "" layout: str = TEMPLATE_DEFAULT source: Dict = field(default_factory=dict) via: Dict = field(default_factory=dict) location: Dict[str, Any] = field( default_factory=lambda: { "continent": "", "country": "", "region": "", "city": "", "note": "", "lat": int, "lng": int, } ) collection: Dict[str, Any] = field( default_factory=lambda: { "style": "title", "order": "chronological", "include": [], } ) attribution: Dict[str, str] = field( default_factory=lambda: { "plain": "", "djot": "", "html": "", } ) media: str = "application/toml" words: Dict[str, Any] = field( default_factory=lambda: { "self": 0, "code": { "lines": 0, "words": 0, }, "references": 0, "referenced": 0, } ) status: str = "" links: LinksDict = field( default_factory=lambda: { "internal": list(), "external": list(), "backlinks": list(), } ) options: List[str] = field(default_factory=list) tags: List[str] = field(default_factory=list) styles: str = "" content: Dict[str, str] = field(default_factory=dict) def __post_init__(self): # Validate links dictionary structure required_link_types = {"internal", "external", "backlinks"} if ( not isinstance(self.links, dict) or set(self.links.keys()) != required_link_types ): raise ValueError( f"links must be a dictionary with exactly these keys: {required_link_types}" ) for key in self.links: if not isinstance(self.links[key], set): self.links[key] = set(self.links[key]) @dataclass class AssetMetadata: added: Path filepath: Path media: str uid: str slug: str title: str available: datetime.datetime created: datetime.datetime updated: datetime.datetime creator: str = "" note: str = "" source: Dict = field(default_factory=dict) via: Dict = field(default_factory=dict) filehash: str = "" output_width: int = 0 output_height: int = 0 location: Dict[str, Any] = field( default_factory=lambda: { "continent": "", "country": "", "region": "", "city": "", "note": "", "lat": int, "lng": int, } ) attribution: Dict[str, str] = field( default_factory=lambda: { "plain": "", "djot": "", "html": "", } ) words: Dict[str, Any] = field( default_factory=lambda: { "self": 0, "code": { "lines": 0, "words": 0, }, "references": 0, "referenced": 0, } ) links: LinksDict = field( default_factory=lambda: { "internal": list(), "external": list(), "backlinks": list(), } ) tags: List[str] = field(default_factory=list) def __post_init__(self): # Validate links dictionary structure required_link_types = {"internal", "external", "backlinks"} if ( not isinstance(self.links, dict) or set(self.links.keys()) != required_link_types ): raise ValueError( f"links must be a dictionary with exactly these keys: {required_link_types}" ) for key in self.links: if not isinstance(self.links[key], set): self.links[key] = set(self.links[key]) def init_site(): site_config = config["site"] return SiteMetadata( name=site_config["name"], created=site_config["created"], url=site_config["url"], baseurl=site_config["baseurl"], uid=site_config["uid"], description=site_config["description"], creator=site_config["creator"], ) def preprocess_asset_metadata( uid: str, asset_data: Dict[str, Any], manifest_path: Path ) -> Dict[str, Any]: """Preprocess asset metadata to ensure it meets AssetMetadata requirements.""" processed = asset_data.copy() # Handle dates for date_field in ["created", "updated", "available"]: if isinstance(processed.get(date_field), str): processed[date_field] = _parse_date(processed[date_field]) elif isinstance(processed.get(date_field), datetime.datetime): processed[date_field] = processed[date_field].replace(tzinfo=None) else: processed[date_field] = datetime.datetime.now() # Set required fields with defaults if not present processed.setdefault("uid", uid) return processed def load_assets() -> Dict[str, AssetMetadata]: """Load asset manifests and convert them to AssetMetadata instances.""" assets = {} asset_manifests = list(MANIFEST_DIR.glob("*.toml")) for manifest in asset_manifests: with open(manifest, "rb") as f: manifest_data = tomllib.load(f) for uid, asset_data in manifest_data.items(): try: processed_data = preprocess_asset_metadata(uid, asset_data, manifest) processed_data["filepath"] = ASSET_DIR / processed_data["filepath"] assets[uid] = AssetMetadata(**processed_data) except Exception as e: logger.error( f"Error processing asset {uid}\n{' ' * 26}{manifest}\n{' ' * 26}{str(e)}" ) continue return assets def setup_jinja_environment(): file_loader = FileSystemLoader(TEMPLATE_DIR) env = Environment(loader=file_loader) # Add custom filters env.filters["shuffle"] = lambda seq: random.sample(seq, len(seq)) env.filters["time_local"] = lambda value, format="%-I:%M%p": value.strftime( format ).lower() env.filters["year"] = lambda value, format="%Y": value.strftime(format) env.filters["month"] = lambda value, format="%m": value.strftime(format) env.filters["day"] = lambda value, format="%d": value.strftime(format) env.filters["year_month"] = lambda value, format="%Y/%m": value.strftime(format) env.filters["year_month_day"] = lambda value, format="%Y/%m/%d": value.strftime( format ) env.filters["iso_date"] = lambda value, format="%Y-%m-%d": value.strftime(format) env.filters["date_long_short_month"] = ( lambda value, format="%b %e, %Y": value.strftime(format) ) env.filters["datetime_w3c"] = ( lambda value, format="%Y-%m-%dT%H:%M:%S": value.strftime(format) ) env.filters["date_long_full_month"] = ( lambda value, format="%B %e, %Y": value.strftime(format) ) env.filters["timedate_long"] = ( lambda value, format="%-I:%M%p %B %e, %Y": value.strftime(format) ) env.filters["highlight_code"] = highlight_code return env def get_files() -> List[Path]: return [f for f in NOTES_DIR.glob("**/*.md") if "available = " in f.read_text()] def extract_markdown_links(text: str) -> List[Tuple[str, str]]: """Extract markdown links, properly handling parentheses in URLs.""" links = [] pattern = r"\[([^\]]+)\]\((https?://)" for match in re.finditer(pattern, text): link_text = match.group(1) url_start = match.end() # Count parentheses to find the real end paren_count = 1 # We've seen the opening ( pos = url_start while pos < len(text) and paren_count > 0: if text[pos] == "(": paren_count += 1 elif text[pos] == ")": paren_count -= 1 pos += 1 url = match.group(2) + text[url_start : pos - 1] links.append((link_text, url)) return links def extract_external_links( text: str, site, status ) -> Tuple[List, Optional[Tuple[str, str]]]: """Extract external links from text. Returns: Tuple of (sorted external links list, optional (link_text, url) tuple if problematic link found) """ # Characters that must be percent-encoded in URLs (RFC 3986) UNENCODED_CHARS = { "(": "%28", ")": "%29", " ": "%20", "<": "%3C", ">": "%3E", '"': "%22", "\\": "%5C", "^": "%5E", "`": "%60", "{": "%7B", "}": "%7D", "|": "%7C", } # Extract markdown links properly handling parens markdown_matches = extract_markdown_links(text) # Check for markdown links with characters that should be encoded for link_text, url in markdown_matches: for char in UNENCODED_CHARS.keys(): if char in url: return [], (link_text, url) # Pattern that matches properly encoded URLs url_pattern = ( r"https?://" # scheme r"(?:[-\w.]|(?:%[\da-fA-F]{2}))+" # domain r"(?:" # optional path and query r"/(?:[^\s\"'<>)\|\\^`{}]|(?:%[\da-fA-F]{2}))*" # path segments r")?" ) matches = re.findall(url_pattern, text) external_links = set() for url in matches: # Clean up any trailing punctuation that might have been caught url = url.rstrip(".,;:!?") # Skip URLs with unencoded problematic characters if any(char in url for char in UNENCODED_CHARS.keys()): continue parsed_url = urlparse(url) if parsed_url.netloc.lower() != "silasjelley.com": external_links.add(url) # Only add to site.links if not a draft if status != "draft": site.links["external"].add(url) return sorted(external_links), None async def process_document( filepath: Path, site: SiteMetadata ) -> Tuple[str, DocumentMetadata]: """Process a document file and return its UID and metadata.""" with open(filepath, "rb") as f: try: parsed_toml = tomllib.load(f) except: logger.error(f"Error while processing document: {filepath}") import sys sys.exit(1) # The UID is now the top-level table name uid = parsed_toml["uid"] # Process metadata into DocumentMetadata instance document = preprocess_metadata(filepath, parsed_toml) # Extract external links from the plain text content try: plain_text = ( document.content.get("plain", "") + " " + document.source.get("url", "") + " " + document.via.get("url", "") ) status = document.status if document.status else "" external_links, problematic = extract_external_links(plain_text, site, status) if problematic: link_text, url = problematic # Simply encode parentheses and other problematic characters encoded_url = url.replace("(", "%28").replace(")", "%29") # Add other character replacements as needed encoded_url = encoded_url.replace(" ", "%20") encoded_url = encoded_url.replace("<", "%3C").replace(">", "%3E") encoded_url = encoded_url.replace('"', "%22") encoded_url = encoded_url.replace("\\", "%5C") encoded_url = encoded_url.replace("^", "%5E") encoded_url = encoded_url.replace("`", "%60") encoded_url = encoded_url.replace("{", "%7B").replace("}", "%7D") encoded_url = encoded_url.replace("|", "%7C") raise ValueError( f"\n\n ERROR: Document contains URL with improperly encoded characters:\n" f" Title: {document.title}\n" f" File: {filepath}\n" f" Link: [{link_text}]({url})\n\n" f" Replace with:\n" f" [{link_text}]({encoded_url})\n\n" ) document.links["external"] = external_links except KeyError: logger.warn(f"KeyError while compiling external links from {document.filepath}") pass return uid, document async def ingest_documents(site: SiteMetadata) -> Dict[str, Any]: logger.info("Ingesting files") file_list = get_files() documents = {} slug_to_title_lookup = {} slug_to_uid_lookup = {} uuid_collision_lookup = [] tasks = [process_document(filepath, site) for filepath in file_list] results = await asyncio.gather(*tasks) for uid, doc in results: documents[uid] = doc slug_to_title_lookup[doc.slug] = doc.title slug_to_uid_lookup[doc.slug] = uid site.categories.add(doc.category) site.secondaries.add(doc.secondary) site.tags.update(doc.tags) uuid_collision_lookup.append(uid) site.slug_to_uid_lookup = slug_to_uid_lookup site.slug_to_title_lookup = slug_to_title_lookup check_uuid_collisions(uuid_collision_lookup) site.pagecount = len(documents) logger.info(f"Ingested {site.pagecount} files") return documents def process_image_parallel(input_data: Tuple[Path, Path, int, AssetMetadata]) -> None: workaround_import = pillow_avif.AvifImagePlugin input_image, output_path, output_width, asset_metadata = input_data lock_path = output_path.with_suffix(".lock") lock = FileLock(str(lock_path)) # Define AVIF output path avif_output_path = output_path.with_suffix(".avif") # Check if AVIF support is available avif_available = "AVIF" in Image.SAVE if output_path.exists() and avif_output_path.exists(): return try: with lock: os.makedirs(output_path.parent, exist_ok=True) with Image.open(input_image) as im: original_format = im.format im = ImageOps.exif_transpose(im) output_height = int(im.size[1] * (output_width / im.size[0])) asset_metadata.output_width = output_width asset_metadata.output_height = output_height logger.debug(f"Output width parameter: {output_width}") logger.debug(f"Image size before resize calculation: {im.size}") output_height = int(im.size[1] * (output_width / im.size[0])) logger.debug(f"Calculated output height: {output_height}") with im.resize( (output_width, output_height), Image.Resampling.LANCZOS ) as output_image: # Save JPEG version if ( original_format != "JPEG" and str(output_path).endswith("jpg") and output_image.mode in ("RGBA", "P") ): output_image = output_image.convert("RGB") output_image.save(output_path, quality=85, optimize=True) # Save AVIF version only if support is available if avif_available: try: if output_image.mode in ("RGBA", "P"): avif_image = output_image.convert("RGB") else: avif_image = output_image.copy() avif_image.save( avif_output_path, format="AVIF", quality=60, # Lower quality for better compression, still maintains good visual quality speed=5, # Slowest speed = best compression (0 is slowest, 10 is fastest) bits=10, # Use 10-bit color depth for better quality-to-size ratio compress_level=8, # Highest compression level (range 0-8) color_space="bt709", # Use YUV BT.709 color space chroma=0, # 4:4:4 chroma sampling (0=4:4:4, 1=4:2:0, 2=4:2:2) num_threads=0, # Use all available CPU threads for encoding ) logger.debug( f"Processed image: {input_image} -> {output_path} and {avif_output_path}" ) except Exception as e: logger.error( f"Error saving AVIF version of {input_image}: {e}" ) else: logger.error( "AVIF support not available. Skipping AVIF conversion." ) logger.debug(f"Processed image: {input_image} -> {output_path}") except OSError as e: logger.error(f"OS error processing {input_image}: {e}") except Exception as e: logger.error(f"Error processing {input_image}: {e}") finally: if lock_path.exists(): try: lock_path.unlink() except OSError: pass def process_assets( assets: Dict[str, AssetMetadata], asset_dir: Path, output_dir: Path ) -> None: logger.info("Processing assets") manifest_images = [] for asset_identifier, asset_metadata in assets.items(): source_path = Path(asset_metadata.filepath) output_path = output_dir / asset_metadata.slug os.makedirs(output_path.parent, exist_ok=True) if not source_path.exists(): raise FileNotFoundError( f"Missing asset: {asset_identifier} at {source_path}" ) if source_path.suffix == ".gpx": with open(source_path, "rb") as file_to_hash: asset_metadata.hash = md5(file_to_hash.read()).hexdigest() copyfile(source_path, output_path) elif output_path.exists(): continue elif source_path.suffix in (".jpg", ".png", ".heic", ".webp"): width = 3000 if "PANO" in str(output_path) else 1600 manifest_images.append((source_path, output_path, width, asset_metadata)) else: copyfile(source_path, output_path) for asset in asset_dir.rglob("*"): if asset.is_file(): output_path = output_dir / asset.relative_to(asset_dir) os.makedirs(output_path.parent, exist_ok=True) copyfile(asset, output_path) with multiprocessing.Pool() as pool: pool.map(process_image_parallel, manifest_images) logger.info("Finished processing assets") def _parse_date(date_str: str) -> datetime.datetime: """Parses a date string into a datetime object, handling both date and datetime inputs.""" try: return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S%z").replace( tzinfo=None ) except ValueError: return datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=None) def preprocess_metadata(filepath: Path, metadata: Dict[str, Any]) -> DocumentMetadata: """Preprocesses metadata for a document and converts it to a DocumentMetadata instance.""" # Create a working copy to avoid modifying the input processed = metadata.copy() # Parse date fields for date_field in ["available", "created", "updated"]: if isinstance(processed.get(date_field), str): processed[date_field] = _parse_date(processed[date_field]) elif isinstance(processed.get(date_field), datetime.datetime): processed[date_field] = processed[date_field].replace(tzinfo=None) # Set default updated time if not provided processed.setdefault("updated", processed.get("available")) # Process source information if present if "source" in processed: processed["attribution"] = process_source_information( processed["source"], processed.get("via", {}) ) else: processed["attribution"] = {} processed["source"] = {} if "via" not in processed: processed["via"] = {} # Handle draft status if processed.get("status") == "draft": processed["slug"] = f"drafts/{processed['uid']}" # Add filepath as it's required but comes from function parameter processed["filepath"] = filepath # Determine title processed["title"] = ( processed.get("title") or processed.get("attribution", {}).get("plain") or processed["available"].strftime("%B %e, %Y %-I.%M%p") ) # Create and return DocumentMetadata instance return DocumentMetadata(**processed) def check_uuid_collisions(uuid_list): prefixes = [uuid[:8] for uuid in uuid_list] if len(set(prefixes)) != len(prefixes): collisions = [ prefix for prefix, count in Counter(prefixes).items() if count > 1 ] raise ValueError( f"CRITICAL ERROR: UUID prefix collision for: {', '.join(collisions)}" ) def generate_html(documents): logger.info("Generating HTML") for key, page in documents.items(): if page.content.get("plain"): page.content["html"] = run_jotdown(page.content["plain"], page) if page.source.get("text"): page.source["html"] = run_jotdown(page.source["text"], page) if not page.description: html_content = page.content.get("html") or page.source.get("html") if html_content: soup = BeautifulSoup(html_content, "html.parser") text = soup.get_text(separator=" ", strip=True) # Normalize whitespace (collapse multiple spaces) text = " ".join(text.split()) page.description = text[:200] # Aggregate site-wide counts from per-page counts site.words["self"] = sum( p.words["self"] for p in documents.values() if p.status != "draft" ) site.words["drafts"] = sum( p.words["self"] for p in documents.values() if p.status == "draft" ) site.words["references"] = sum(p.words["references"] for p in documents.values()) site.words["code"]["lines"] = sum( p.words["code"]["lines"] for p in documents.values() ) site.words["code"]["words"] = sum( p.words["code"]["words"] for p in documents.values() ) site.words["total"] = ( site.words["self"] + site.words["drafts"] + site.words["references"] ) def run_jotdown(plaintext: str, page) -> str: """ Process djot formatted plaintext in HTML while also making accurate counts of words of prose, references, and code """ CODE_BLOCK_RE = re.compile( r"( *)````*(=html|\s*(?:(\w+)\n))?(.*?)( *)````*", re.DOTALL ) code_blocks = [] marker_template = "§CODE_BLOCK_{}§" def save_code_block(match): leading_space = match.group(1) raw_html_marker = match.group(2) language = match.group(3) code = match.group(4).rstrip() trailing_space = match.group(5) code_words = len(code.split()) code_lines = len(code.splitlines()) page.words["code"]["lines"] += code_lines page.words["code"]["words"] += code_words # Check if this is a raw HTML block if raw_html_marker == "=html": return f"{leading_space}```=html\n{code}\n{trailing_space}```" # For all other cases, including 'html' language, highlight the code highlighted = highlight_code(code, language) marker = marker_template.format(len(code_blocks)) code_blocks.append(highlighted) return f"{leading_space}```=html\n{marker}\n{trailing_space}```" # First, replace all code blocks with markers processed_text = CODE_BLOCK_RE.sub(save_code_block, plaintext) # Count prose words from the processed text # Remove the markers before counting text_without_markers = processed_text for i in range(len(code_blocks)): marker = marker_template.format(i) text_without_markers = text_without_markers.replace(marker, "") # Also remove raw HTML blocks for word counting RAW_HTML_BLOCK_RE = re.compile(r"```=html\n.*?\n```", re.DOTALL) text_without_markers = RAW_HTML_BLOCK_RE.sub("", text_without_markers) # Count words from the cleaned markdown prose prose_wordcount = len(text_without_markers.split()) if page.category == "references": page.words["references"] += prose_wordcount else: page.words["self"] += prose_wordcount # Run through jotdown html = run("jotdown", input=processed_text, text=True, capture_output=True).stdout # Replace markers with actual highlighted code for i, code in enumerate(code_blocks): marker = marker_template.format(i) html = html.replace(marker, code) return html def build_backlinks(documents, site): logger.info("Building backlinks") INLINE_LINK_RE = re.compile( r"\[[^\]]*(?:\[[^\]]*\][^\]]*)*\]\(\/([^)#]*)\)", re.DOTALL ) FOOTNOTE_LINK_URL_RE = re.compile(r"\[.+?\]:\s\/(.*)", re.DOTALL) interlink_count = 0 for key, page in documents.items(): if "nobacklinks" in page.options or page.status == "draft": continue logger.debug(page.filepath) text = page.content.get("plain") # Skip if no main content if not text: continue interlinks = set(documents[key].links["internal"]) combined_refs = INLINE_LINK_RE.findall(text) + FOOTNOTE_LINK_URL_RE.findall( text ) for slug in combined_refs: try: link_uid = site.slug_to_uid_lookup[slug] interlinks.add(link_uid) interlink_count += 1 except KeyError: if should_ignore_slug(slug): continue logger.warning(f"\nKeyError in {page.title} ({key}): {slug}") documents[key].links["internal"] = sorted(interlinks) for interlink_key in interlinks: documents[interlink_key].links["backlinks"].add(key) """ TODO: REMOVE SITE.BACKLINKS in favour a 'stats' or 'count' (templates will need updating """ site.backlinks += interlink_count def should_ignore_slug(slug): return ( slug.startswith(("feeds/", "images/", "$")) or slug.endswith((".jpg", ".webp", ".png", ".svg", ".pdf", ".gif", ".html")) or slug in ["publickey", "humans.txt", "build.py", "links.txt"] ) def build_collections( documents: Dict[str, DocumentMetadata], site: SiteMetadata ) -> Tuple[Dict[str, List[Dict[str, Any]]], List[Dict[str, Any]]]: logger.info("Building collections") collections = { category: [] for category in list(site.categories) + list(site.secondaries) + list(site.tags) + ["everything", "main", "cd68b918-ac5f-4d6c-abb5-a55a0318846b"] } sitemap = [] for key, page in sorted( documents.items(), key=lambda k_v: k_v[1].available, reverse=True ): if page.status == "draft": collections["cd68b918-ac5f-4d6c-abb5-a55a0318846b"].append(page) continue elif page.status == "hidden": continue elif "nofeed" in page.options: sitemap.append(page) continue else: sitemap.append(page) collections["everything"].append(page) collections[page.category].append(page) collections[page.secondary].append(page) for tag in page.tags: collections[tag].append(page) if page.secondary in [ "essays", "wandering", "rambling", "pearls", ]: collections["main"].append(page) return collections, sitemap def output_html( assets: Dict[str, AssetMetadata], documents: Dict[str, DocumentMetadata], collections: Dict[str, List[Dict[str, Any]]], site: SiteMetadata, env: Environment, output_dir: Path, ) -> None: """HTML output with incremental builds.""" incremental_build.output_html_incremental( assets=assets, documents=documents, collections=collections, site=site, env=env, output_dir=output_dir, build_page_collection_func=build_page_collection, ) def build_page_collection(page, collections): try: collection = [ item for include in page.collection["include"] for item in collections[include] ] return sorted(collection, key=lambda x: x.available, reverse=True) except KeyError: logger.error(f"Failed collection for {page.filepath}") return [] def output_feeds(collections, site, env, output_dir): logger.info("Generating Feeds") feed_list = list(site.categories) + list(site.secondaries) + ["everything", "main"] for entry in feed_list: feed = render_feed(entry, collections, site, env) write_feed(feed, output_dir) logger.debug(f" {entry} >> {feed['path']}") output_feed_stylesheet(site, env, output_dir) def render_feed(feed_name, collections, site, env): slug = f"feeds/{feed_name}" feed_path = f"{slug}/index.xml" template = env.get_template(TEMPLATE_FEED) feed_content = template.render( site=site, slug=slug, collection=feed_name, feed=collections[feed_name], ) return {"name": feed_name, "output": feed_content, "path": feed_path} def write_feed(feed, output_dir): feed_path = output_dir / feed["path"] feed_path.parent.mkdir(parents=True, exist_ok=True) feed_path.write_text(feed["output"]) def output_link_report(site, link_report_path): logger.info("Creating plaintext link files") with open(link_report_path, "w") as file: for link in sorted(site.links["external"]): file.write(f"{link}\n") logger.debug(f" {link_report_path}") def output_feed_stylesheet(site, env, output_dir): logger.info("Creating XSL Stylesheet") template = env.get_template(TEMPLATE_FEED_XSL) output_path = output_dir / "feed.xsl" output = template.render(site=site) output_path.write_text(output) logger.debug(f" {output_path}") def output_sitemap(sitemap, site, env, output_dir): logger.info("Generating Sitemap") template = env.get_template(TEMPLATE_SITEMAP) output = template.render(sitemap=sitemap, site=site) output_path = output_dir / "sitemap.xml" output_path.write_text(output) logger.debug(f" {output_path}") async def main(): # Initialize site and load assets import sys global site site = init_site() assets = load_assets() # Set up Jinja environment env = setup_jinja_environment() core_stylesheet_content = "" deferred_stylesheet_content = "" stylesheet_dir = Path(config["paths"]["stylesheet_dir"]) core_css_files = config.get("css", {}).get("core_files", []) if stylesheet_dir.is_dir(): css_files = sorted(stylesheet_dir.glob("*.css")) for css_file in css_files: if str(css_file) in core_css_files: core_stylesheet_content += css_file.read_text() + "\n" else: deferred_stylesheet_content += css_file.read_text() + "\n" # Write deferred stylesheet to output output_stylesheet_path = OUTPUT_DIR / "deferred.css" output_stylesheet_path.parent.mkdir(parents=True, exist_ok=True) output_stylesheet_path.write_text(deferred_stylesheet_content) # Assign content and hashes to site context site.data["core_stylesheet"] = core_stylesheet_content site.data["core_stylesheet_minified"] = compress(core_stylesheet_content) if deferred_stylesheet_content: site.data["deferred_stylesheet_hash"] = md5( deferred_stylesheet_content.encode("utf-8") ).hexdigest() if "--full-rebuild" in sys.argv: logger.info("Full rebuild requested, clearing cache") incremental_build.clear_build_cache(OUTPUT_DIR) # Process assets await asyncio.to_thread(process_assets, assets, ASSET_DIR, OUTPUT_DIR) # Ingest and process documents documents = await ingest_documents(site) # Calculate base per-page word counts before shortcode processing # for page in documents.values(): # page.words["self"] = len(page.content.get("plain", "").split()) # page.words["references"] = len(page.source.get("text", "").split()) logger.info("Processing shortcodes") process_shortcodes(documents, assets, site) generate_html(documents) # Build backlinks and collections build_backlinks(documents, site) collections, sitemap = build_collections(documents, site) # Attempting to make final order of 'backlinks' deterministic for key, page in documents.items(): # Sort interlinks based on published dates documents[key].links["internal"] = sorted( documents[key].links["internal"], key=lambda x: documents[x].available, reverse=True, # Most recent first ) # Sort backlinks based on published dates documents[key].links["backlinks"] = sorted( documents[key].links["backlinks"], key=lambda x: documents[x].available, reverse=True, # Most recent first ) # Output HTML, feeds, and sitemap output_html(assets, documents, collections, site, env, OUTPUT_DIR) output_link_report(site, LINK_REPORT_PATH) output_feeds(collections, site, env, OUTPUT_DIR) output_sitemap(sitemap, site, env, OUTPUT_DIR) # Change back to the initial directory os.chdir(INIT_DIR) # Print summary logger.info("Build complete!") logger.info(f"Pages: {site.pagecount}") logger.info(f"Words: {site.words['total']}") logger.info(f"Internal links: {site.backlinks}") logger.info(f"External links: {len(site.links['external'])}") if __name__ == "__main__": asyncio.run(main())