Fail build on unencoded parentheses in URLs
One of the outputs of this site is a plaintext file containing all the external links I reference across the site. I noticed recently that a couple of the Wikipedia links in there were incorrect. Wikipedia is one of the few websites that uses parentheses “()” in URLs, and my naive link-extracting regex would stop matching at the first closing parenthesis, because it took that to mean it was inside a Djot format link and shouldn’t include the bracket.
The result, a URL like https://en.wikipedia.org/wiki/Cardinality_(data_modeling)
would lose its last parenthesis and be rendered invalid.
I first concocted a bit of clever parsing to handle this, and then I decided that the most robust thing to do would be to throw away that cleverness and fail loudly if any such URL was found.
Now, if I save a URL with unencoded parentheses into one of my markup files, it’ll raise a ValueError
and print a helpful error like so:
ERROR: Document contains URL with improperly encoded characters:
Title: We shape our software; thereafter our software shapes us
File: /home/silas/notes/essays/software-shapes-us.md
Replace with:
[ontlogies](https://en.wikipedia.org/wiki/Ontology_%28information_science%29)
(I’ve hidden the line of the error output that includes the offending link, else that would raise an error here :D)
This involved changes to extract_external_links() and process_document().
Click here to see the code
def extract_external_links(
    text: str, site, status
) -> Tuple[List, Optional[Tuple[str, str]]]:
    """Extract external links from text, refusing URLs with raw parentheses.

    The text is first scanned for inline ``[label](url)`` links whose URL
    contains unencoded parentheses. If one is found, nothing is extracted and
    the offending link is handed back so the caller can fail loudly.

    Otherwise every ``http(s)`` URL in ``text`` is collected, trailing
    sentence punctuation is stripped, and URLs whose host is
    ``silasjelley.com`` are dropped.

    Args:
        text: Plain text (possibly containing inline links) to scan.
        site: Object exposing ``links["external"]`` as a set-like collection;
            it is updated in place with each external URL found.
        status: Document status. When equal to ``"draft"`` the URLs are
            returned but not recorded on ``site``.

    Returns:
        Tuple of (sorted external links list, optional (link_text, url) tuple
        if problematic link found). When a problematic link is found the list
        is empty and ``site`` is left untouched.
    """
    # Inline link whose URL contains at least one unencoded "(...)" group.
    # The first two URL fragments are the historical pattern (everything up
    # to and including the first closing paren). The remainder additionally
    # accepts text and further paren groups after it, so URLs such as
    # ".../Foo_(bar)_baz" or ".../(a)(b)" are caught rather than being
    # silently truncated by the plain URL pattern below.
    markdown_link_pattern = (
        r"\[([^\]]+)\]\("
        r"(https?://[^)]*\([^)]*\)"
        r"[^()\s]*(?:\([^()\s]*\)[^()\s]*)*)"
        r"\)"
    )

    # Only the first offender is reported, so stop at the first match.
    problematic = re.search(markdown_link_pattern, text)
    if problematic:
        return [], (problematic.group(1), problematic.group(2))

    # Pattern that handles URLs (now assumes parens are encoded)
    url_pattern = (
        r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"  # domain
        r"(?:/[^\s\"'<>)]*)?"  # path (no unencoded parens allowed)
    )

    external_links = set()
    for url in re.findall(url_pattern, text):
        # Clean up any trailing punctuation that might have been caught
        url = url.rstrip(".,;:!?")
        if urlparse(url).netloc.lower() == "silasjelley.com":
            continue
        external_links.add(url)
        # Only add to site.links if not a draft
        if status != "draft":
            site.links["external"].add(url)

    return sorted(external_links), None
async def process_document(
    filepath: Path, site: SiteMetadata
) -> Tuple[str, DocumentMetadata]:
    """Process a document file and return its UID and metadata.

    The file is parsed as TOML, converted to a ``DocumentMetadata`` via
    ``preprocess_metadata``, and its external links are extracted from the
    plain-text content plus the ``source`` and ``via`` URLs.

    Args:
        filepath: Path of the TOML document to load.
        site: Site-wide metadata, passed to ``extract_external_links``.

    Returns:
        Tuple of (uid, document).

    Raises:
        SystemExit: If the file cannot be parsed as TOML (exit status 1).
        ValueError: If the document contains an inline link whose URL has
            unencoded parentheses; the message includes the encoded
            replacement.
        KeyError: If the parsed TOML has no ``"uid"`` key.
    """
    with open(filepath, "rb") as f:
        try:
            parsed_toml = tomllib.load(f)
        except Exception:
            # An unreadable document aborts the whole build. ``Exception``
            # rather than a bare ``except`` so KeyboardInterrupt/SystemExit
            # are not intercepted and misreported as a document error.
            logger.error(f"Error while processing document: {filepath}")
            import sys

            sys.exit(1)

    # The UID is read from the parsed TOML's "uid" key
    uid = parsed_toml["uid"]

    # Process metadata into DocumentMetadata instance
    document = preprocess_metadata(filepath, parsed_toml)

    # Extract external links from the plain text content
    try:
        plain_text = " ".join(
            (
                document.content.get("plain", ""),
                document.source.get("url", ""),
                document.via.get("url", ""),
            )
        )
        status = document.status or ""
        external_links, problematic = extract_external_links(plain_text, site, status)
        if problematic:
            link_text, url = problematic
            encoded_url = url.replace("(", "%28").replace(")", "%29")
            # Fail the build loudly; ValueError is not caught below.
            raise ValueError(
                f"\n\n  URL contains unencoded parentheses in document:\n"
                f"  Title: {document.title}\n"
                f"  File: {filepath}\n"
                f"  Link: [{link_text}]({url})\n\n"
                f"  Replace with:\n"
                f"  [{link_text}]({encoded_url})\n\n"
            )
        document.links["external"] = external_links
    except KeyError:
        # Best effort: a missing key leaves the document without external links.
        logger.warning(f"KeyError while compiling external links from {document.filepath}")

    return uid, document