Fail build on unencoded parentheses in URLs

One of the outputs of this site is a plaintext file containing all the external links I reference across the site. I noticed recently that a couple of the Wikipedia links in there were incorrect. Wikipedia is one of the few websites that uses parentheses “()” in URLs, and my naive link-extracting regex would stop at the first closing parenthesis, taking it to mark the end of a Djot-format link rather than part of the URL.

The result: a URL like https://en.wikipedia.org/wiki/Cardinality_(data_modeling) would lose its closing parenthesis and be rendered invalid.
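
The failure is easy to reproduce with a simplified version of the pattern (not the exact regex I had, but it shares the same flaw):

import re

# Naive link pattern: the URL capture stops at the first ")"
naive = r"\[([^\]]+)\]\(([^)]+)\)"

text = "[cardinality](https://en.wikipedia.org/wiki/Cardinality_(data_modeling))"
print(re.findall(naive, text))
# [('cardinality', 'https://en.wikipedia.org/wiki/Cardinality_(data_modeling')]
# note the URL is truncated before the closing parenthesis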

I first concocted a bit of clever parsing to handle this, then decided that the most robust thing to do would be to throw away that cleverness and fail loudly if any such URL was found. Now, if I save a URL with unencoded parentheses into one of my markup files, the build raises a ValueError and prints a helpful error like so:

ERROR: Document contains URL with improperly encoded characters:
      Title: We shape our software; thereafter our software shapes us
      File:  /home/silas/notes/essays/software-shapes-us.md

      Replace with:
            [ontologies](https://en.wikipedia.org/wiki/Ontology_%28information_science%29)

(I’ve omitted the Link: line from that output, since including the unencoded URL here would itself raise the error :D)
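
The suggested replacement is plain percent-encoding: “(” becomes %28 and “)” becomes %29. In Python that’s a one-liner (urllib.parse.quote(url, safe=":/") would do the same job):

url = "https://en.wikipedia.org/wiki/Ontology_(information_science)"
encoded = url.replace("(", "%28").replace(")", "%29")
# https://en.wikipedia.org/wiki/Ontology_%28information_science%29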

This involved changes to extract_external_links() and process_document().

Here’s the code:

# Imports relevant to these two excerpts (the full module has more)
import re
import sys
import tomllib
from pathlib import Path
from typing import List, Optional, Tuple
from urllib.parse import urlparse


def extract_external_links(
    text: str, site, status
) -> Tuple[List, Optional[Tuple[str, str]]]:
    """Extract external links from text.

    Returns:
        Tuple of (sorted external links list, optional (link_text, url) tuple if problematic link found)
    """
    # Pattern that captures markdown links with URLs containing unencoded parens
    markdown_link_pattern = r"\[([^\]]+)\]\((https?://[^)]*\([^)]*\))\)"

    # Check for markdown links with unencoded parentheses in URL
    problematic_links = re.findall(markdown_link_pattern, text)
    if problematic_links:
        return [], problematic_links[0]  # Return first problematic link

    # Pattern that handles URLs (now assumes parens are encoded)
    url_pattern = (
        r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"  # domain
        r"(?:/[^\s\"'<>)]*)?"  # path (no unencoded parens allowed)
    )
    matches = re.findall(url_pattern, text)

    external_links = set()
    for url in matches:
        # Clean up any trailing punctuation that might have been caught
        url = url.rstrip(".,;:!?")

        parsed_url = urlparse(url)
        if parsed_url.netloc.lower() != "silasjelley.com":
            external_links.add(url)

            # Only add to site.links if not a draft
            if status != "draft":
                site.links["external"].add(url)

    return sorted(external_links), None


async def process_document(
    filepath: Path, site: SiteMetadata
) -> Tuple[str, DocumentMetadata]:
    """Process a document file and return its UID and metadata."""

    with open(filepath, "rb") as f:
        try:
            parsed_toml = tomllib.load(f)
        except tomllib.TOMLDecodeError:
            logger.error(f"Error while processing document: {filepath}")
            sys.exit(1)

    # The UID is now a top-level key in the TOML
    uid = parsed_toml["uid"]

    # Process metadata into DocumentMetadata instance
    document = preprocess_metadata(filepath, parsed_toml)

    # Extract external links from the plain text content
    try:
        plain_text = (
            document.content.get("plain", "")
            + " "
            + document.source.get("url", "")
            + " "
            + document.via.get("url", "")
        )

        status = document.status or ""
        external_links, problematic = extract_external_links(plain_text, site, status)

        if problematic:
            link_text, url = problematic
            encoded_url = url.replace("(", "%28").replace(")", "%29")
            raise ValueError(
                f"\n\n  URL contains unencoded parentheses in document:\n"
                f"      Title: {document.title}\n"
                f"      File:  {filepath}\n"
                f"      Link:  [{link_text}]({url})\n\n"
                f"      Replace with:\n"
                f"            [{link_text}]({encoded_url})\n\n"
            )

        document.links["external"] = external_links
    except KeyError:
        logger.warning(
            f"KeyError while compiling external links from {document.filepath}"
        )

    return uid, document
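
For illustration, here’s roughly how the failure path behaves. Since extract_external_links() returns before touching site when it finds a problematic link, passing None is fine for a quick check:

links, problematic = extract_external_links(
    "[cardinality](https://en.wikipedia.org/wiki/Cardinality_(data_modeling))",
    site=None,
    status="",
)
print(links)        # []
print(problematic)  # ('cardinality', 'https://en.wikipedia.org/wiki/Cardinality_(data_modeling)')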