Scraping your posts from BlueSky with Python
Inspired by my writing in public every day for a month, which often draws from things I’ve read or watched, Kyle wants to get into the habit of writing a little something about the things he reads/watches/etc.
BlueSky came up as an option for a character-limited, low-barrier, write-once platform, and while I’m broadly averse to centralised platforms[^1] I recognise the appeal of simple UX and broad reach.
POSSE (Publish (on your) Own Site, Syndicate Elsewhere) is one avenue for satisfying the tension between sovereignty and syndication, but if the goal is low-friction then perhaps the reverse — publishing elsewhere and then preserving on your own site — is a less complicated flow.
I toyed around with a simple Python program for pulling posts into my own document system, using Anthropic’s Claude 3.5 Sonnet to knock together the rough shape of the scraper, and then tweaking and expanding it to my liking.
The code is included at the end, but it may not be useful to you as-is. So, in no particular order, and with no promises of completeness, here are some of the assumptions I’ve made:

- Each post is only downloaded once. A sidecar JSON file is read and re-written on each run to keep track of the `at://` identifier of each saved post, so the script can be re-run and only new posts will be downloaded.
- Message content is stored as a TOML multi-line string, so line breaks and quotes in the post text are preserved without manual escaping.
- While I’ve begun work on preserving media, this is not currently implemented. See the `download_media` function for the work so far.
- Uses the `getAuthorFeed` API endpoint, which includes the given author’s reposts and replies, and currently naively polls for all of an author’s posts rather than specifying ‘posts since X date’ or similar. A minimal client-side filter for skipping reposts and replies is sketched after this list.
- The `created_at` datetime is converted to a localised datetime; if you’d prefer UTC, replace all instances of `local_dt` in the code with `utc_dt`.
- Metadata is added to each entry according to the needs of my document system; you can remove or change these in the `doc` metadata block from `create_toml_doc()`, shown here:

  ```python
  # Basic metadata
  doc["uid"] = str(uuid.uuid4())
  doc["creator"] = creator_name
  doc["primary"] = "journal"
  doc["secondary"] = "nonsense"
  doc["created"] = local_dt
  doc["slug"] = self.create_slug(local_dt)
  ```

- Each post is saved as a separate TOML document with the following structure:

  ```toml
  uid = "3f335802-34e4-4b9b-8a29-a26afe348344"
  creator = "Silas Jelley"
  primary = "journal"
  secondary = "nonsense"
  created = 2024-11-21T18:12:40.260000-08:00
  slug = "2024/11/21/181240"

  [content]
  plain = """
  Any fool can repeat the right answer, but who’s asking the right questions?
  """
  ```

- The script takes 2 required positional arguments and 3 optional arguments:

  ```text
  usage: import-bluesky [-h] [--creator CREATOR] [--limit LIMIT]
                        [--output-dir OUTPUT_DIR]
                        identifier password

  Scrape BlueSky posts and save as TOML files

  positional arguments:
    identifier            Your BlueSky handle or email
    password              Your BlueSky password

  options:
    -h, --help            show this help message and exit
    --creator CREATOR     Creator name for TOML files (defaults to identifier)
    --limit LIMIT         Maximum number of posts to retrieve (default: 500)
    --output-dir OUTPUT_DIR
                          Directory to store TOML files (default: posts)
  ```
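If you only want original posts, the feed items returned by `getAuthorFeed` can be filtered client-side before saving. This is a minimal sketch, not part of the script below, and it assumes reposts are marked by a `reason` field on the feed item and replies by a `reply` field on the post record; check that against the responses you actually receive:

```python
def is_original_post(item: dict) -> bool:
    """Return True for feed items that are neither reposts nor replies.

    Assumes the getAuthorFeed response shape: reposts carry a top-level
    'reason' on the feed item, and replies carry a 'reply' key on the
    post record. Verify against real responses before relying on this.
    """
    if "reason" in item:  # repost of someone else's post
        return False
    if "reply" in item["post"]["record"]:  # a reply rather than a top-level post
        return False
    return True


# e.g. inside get_posts(), after fetching a page of feed items:
# feed_items = [item for item in feed_items if is_original_post(item)]
```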
Still todo:
- Save the post URL with the document in a `[source]` block (a rough sketch of this follows the list)
- Save media included in the post
- (possibly) Save likes, replies, etc.
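For the first item, here is a rough sketch of one way it could work: derive the public bsky.app link from the `at://` URI and write both into a `[source]` table. The `at://` layout and the bsky.app URL pattern used here are assumptions on my part, so double-check them against real post URIs:

```python
import tomlkit


def add_source_block(doc: tomlkit.TOMLDocument, post_uri: str) -> None:
    """Append a [source] table holding the at:// URI and a derived web URL.

    Assumes post URIs look like at://<did>/app.bsky.feed.post/<rkey> and
    that bsky.app serves the post at /profile/<did>/post/<rkey>.
    """
    source = tomlkit.table()
    source["uri"] = post_uri
    did, _collection, rkey = post_uri.removeprefix("at://").split("/")
    source["url"] = f"https://bsky.app/profile/{did}/post/{rkey}"
    doc["source"] = source
```

If that holds, the natural place to call it is `save_posts_as_toml()`, where the post’s `at://` identifier is already available as `post_id`.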
The complete script:
```python
#!/usr/bin/env python3
import argparse
import hashlib
import json
import uuid
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
import tomlkit


class BlueSkyPostScraper:
    def __init__(self, identifier: str, password: str, output_dir: str = "posts"):
        """
        Initialize the BlueSky scraper with your credentials.
        """
        self.base_url = "https://bsky.social/xrpc"
        self.session = requests.Session()
        self.auth_token = None
        self.did = None
        self.output_dir = Path(output_dir)
        self.media_dir = self.output_dir / "media"
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.media_dir.mkdir(parents=True, exist_ok=True)
        self.tracking_file = self.output_dir / "downloaded_posts.json"
        self.downloaded_posts = self._load_tracking()
        self._login(identifier, password)
        self.username = identifier

    def download_media(self, image_info: Dict) -> Optional[Tuple[str, str]]:
        """
        Download media from BlueSky and return the local path.
        Returns (filename, mime_type) tuple if successful, None if failed.
        """
        try:
            # Extract image URL and type
            blob = image_info.get("blob", {})
            ref = blob.get("ref", {})
            link = f"https://bsky.social/xrpc/com.atproto.sync.getBlob?did={self.did}&cid={ref.get('$link', '')}"
            mime_type = blob.get("mimeType", "")
            if not mime_type.startswith("image/"):
                return None
            # Download the image
            response = self.session.get(link)
            if response.status_code != 200:
                return None
            # Create filename based on content hash
            content_hash = hashlib.sha256(response.content).hexdigest()[:16]
            ext = mime_type.split("/")[-1]
            filename = f"{content_hash}.{ext}"
            filepath = self.media_dir / filename
            # Save the image
            with open(filepath, "wb") as f:
                f.write(response.content)
            return filename, mime_type
        except Exception as e:
            print(f"Failed to download media: {e}")
            return None

    def create_toml_doc(
        self,
        post: Dict,
        creator_name: str,
        media_files: Optional[List[Tuple[str, str]]] = None,
    ) -> tomlkit.TOMLDocument:
        """Create a TOML document for a post."""
        doc = tomlkit.document()
        # Convert UTC datetime string to local datetime object
        utc_dt = datetime.fromisoformat(post["created_at"].replace("Z", "+00:00"))
        local_dt = utc_dt.astimezone()
        # Basic metadata
        doc["uid"] = str(uuid.uuid4())
        doc["creator"] = creator_name
        doc["primary"] = "journal"
        doc["secondary"] = "nonsense"
        doc["created"] = local_dt
        doc["slug"] = self.create_slug(local_dt)
        # Content section
        content = tomlkit.table()
        content["plain"] = "\n" + post["message"] + "\n"
        # Add media information if present
        if media_files:
            media = tomlkit.array()
            for filename, mime_type in media_files:
                media_item = tomlkit.inline_table()
                media_item["filename"] = filename
                media_item["type"] = mime_type
                media.append(media_item)
            content["media"] = media
        doc["content"] = content
        return doc

    def get_posts(self, limit: int = 100) -> List[Tuple[str, Dict]]:
        """Fetch your posts from BlueSky."""
        posts = []
        cursor = None
        while len(posts) < limit:
            params = {"actor": self.did, "limit": min(100, limit - len(posts))}
            if cursor:
                params["cursor"] = cursor
            response = self.session.get(
                f"{self.base_url}/app.bsky.feed.getAuthorFeed", params=params
            )
            if response.status_code != 200:
                break
            data = response.json()
            feed_items = data.get("feed", [])
            if not feed_items:
                break
            for item in feed_items:
                post = item["post"]
                post_id = post["uri"]
                # Skip if we've already downloaded this post
                if post_id in self.downloaded_posts:
                    continue
                # Extract post data including embedded media
                post_data = {
                    "message": post["record"].get("text", ""),
                    "created_at": post["record"].get("createdAt", ""),
                    "embed": post["record"].get("embed", {}),
                }
                posts.append((post_id, post_data))
            cursor = data.get("cursor")
            if not cursor:
                break
        return posts

    def save_posts_as_toml(self, posts: List[Tuple[str, Dict]], creator_name: str) -> None:
        """Save posts as individual TOML files."""
        for post_id, post_data in posts:
            # Handle media if present
            media_files = []
            embed = post_data.get("embed", {})
            # Handle images
            if embed.get("$type") == "app.bsky.embed.images":
                for image in embed.get("images", []):
                    result = self.download_media(image)
                    if result:
                        media_files.append(result)
            # Create TOML document with media information
            doc = self.create_toml_doc(post_data, creator_name, media_files)
            # Create filename based on created date
            created_dt = datetime.fromisoformat(
                post_data["created_at"].replace("Z", "+00:00")
            )
            filename = f"{created_dt.strftime('%Y%m%d_%H%M%S')}.toml"
            filepath = self.output_dir / filename
            # Save TOML file
            with open(filepath, "w", encoding="utf-8") as f:
                tomlkit.dump(doc, f)
            # Mark post as downloaded
            self.downloaded_posts.add(post_id)
        # Save tracking file
        self._save_tracking()

    def _load_tracking(self) -> set:
        """Load the set of previously downloaded post IDs."""
        if self.tracking_file.exists():
            with open(self.tracking_file, "r") as f:
                return set(json.load(f))
        return set()

    def _save_tracking(self) -> None:
        """Save the set of downloaded post IDs."""
        with open(self.tracking_file, "w") as f:
            json.dump(list(self.downloaded_posts), f)

    def _login(self, identifier: str, password: str) -> None:
        """Authenticate with BlueSky."""
        auth_response = self.session.post(
            f"{self.base_url}/com.atproto.server.createSession",
            json={"identifier": identifier, "password": password},
        )
        if auth_response.status_code != 200:
            raise Exception("Authentication failed")
        auth_data = auth_response.json()
        self.auth_token = auth_data["accessJwt"]
        self.did = auth_data["did"]
        self.session.headers.update({"Authorization": f"Bearer {self.auth_token}"})

    def create_slug(self, dt: datetime) -> str:
        """Create a slug from a datetime object."""
        return dt.strftime("%Y/%m/%d/%H%M%S")


def main():
    parser = argparse.ArgumentParser(
        description="Scrape BlueSky posts and save as TOML files"
    )
    parser.add_argument("identifier", help="Your BlueSky handle or email")
    parser.add_argument("password", help="Your BlueSky password")
    parser.add_argument(
        "--creator", help="Creator name for TOML files (defaults to identifier)"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=500,
        help="Maximum number of posts to retrieve (default: 500)",
    )
    parser.add_argument(
        "--output-dir",
        default="posts",
        help="Directory to store TOML files (default: posts)",
    )
    args = parser.parse_args()
    creator_name = args.creator if args.creator else args.identifier
    try:
        scraper = BlueSkyPostScraper(args.identifier, args.password, args.output_dir)
        posts = scraper.get_posts(limit=args.limit)
        scraper.save_posts_as_toml(posts, creator_name)
        print(f"Successfully scraped {len(posts)} new posts")
    except Exception as e:
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    main()
```
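And if you’d rather drive it from another Python script than from the command line, the same flow that `main()` performs looks roughly like this; the handle and app password below are placeholders:

```python
# The same steps main() runs, minus argparse. Handle and password are
# placeholders; an app password is a safer choice than your main one.
scraper = BlueSkyPostScraper("yourhandle.bsky.social", "xxxx-xxxx-xxxx-xxxx", output_dir="posts")
new_posts = scraper.get_posts(limit=200)
scraper.save_posts_as_toml(new_posts, creator_name="Your Name")
print(f"Saved {len(new_posts)} new posts")
```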
[^1]: And before someone pipes up with “BlueSky/AT-protocol is decentralised!”, no, it isn’t. The corporation behind it claims it is, but so far it only aspires to be. If you’re curious, you can read more here.