Coverage for scripts / ci / maintenance / ghcr_prune_untagged.py: 79%
170 statements
« prev ^ index » next coverage.py v7.13.0, created at 2026-04-03 18:53 +0000
1"""Prune untagged GHCR image versions for this repository.
3Google-style docstring.
5This script lists container package versions for the current repo on GHCR and
6deletes those that have no tags AND are older than a retention period.
8NOTE: All publish jobs now use ``provenance: false`` and ``sbom: false``,
9producing simple Docker v2 manifests with no untagged OCI child manifests.
10The min-age guard is retained as defense-in-depth but is no longer the
11primary protection mechanism.
13Requires GITHUB_TOKEN with packages:write scope in Actions.
14"""
16from __future__ import annotations
18import os
19from collections.abc import Mapping
20from dataclasses import dataclass, field
21from datetime import UTC, datetime, timedelta
22from typing import Any, Protocol, cast
24import httpx
25from loguru import logger
# Default minimum age, in days, that an untagged version must reach before it
# may be deleted. main() falls back to this value when GHCR_PRUNE_MIN_AGE_DAYS
# is unset, is not an integer, or is negative.
DEFAULT_MIN_AGE_DAYS = 7
@dataclass
class GhcrVersion:
    """Minimal subset of the metadata GHCR reports for one container version.

    Every field has a default, so ``GhcrVersion()`` yields an empty record.

    Attributes:
        id: Numeric version id.
        tags: Tags bound to this version; an empty list means untagged.
        created_at: ISO timestamp of when the version was created.
        name: The manifest digest/name for this version.
    """

    id: int = 0
    # A mutable default must go through a factory so instances never share it.
    tags: list[str] = field(default_factory=list)
    created_at: str = ""
    name: str = ""
# Structural (duck-typed) interfaces for the HTTP client and its responses, so
# tests can supply lightweight doubles instead of a real client.
class _ResponseProto(Protocol):
    """The parts of an HTTP response that this script reads."""

    headers: Mapping[str, str]
    status_code: int

    def raise_for_status(self) -> None:
        """Raise when the response carries an error status."""

    def json(self) -> Any:
        """Return the decoded JSON body."""
class GhcrClient(Protocol):
    """Structural type for the HTTP client used against the GitHub API.

    Only ``get`` and ``delete`` are required; ``main`` passes an
    ``httpx.Client`` cast to this protocol.
    """

    def get(
        self, url: str, *, headers: Mapping[str, str] | None = ...
    ) -> _ResponseProto:
        """Issue a GET request for ``url`` and return the response."""

    def delete(
        self, url: str, *, headers: Mapping[str, str] | None = ...
    ) -> _ResponseProto:
        """Issue a DELETE request for ``url`` and return the response."""
def get_repo_owner_repo() -> tuple[str, str]:
    """Return ``(owner, repo)`` parsed from the ``GITHUB_REPOSITORY`` env var.

    The value is split on its first ``/``. When the variable is unset or
    blank, ``lgtm-hq/py-lintro`` is used instead.

    Returns:
        tuple[str, str]: owner and repo.

    Raises:
        ValueError: If the value contains no ``/`` separator.
    """
    # Treat a blank value like an unset one rather than failing on it.
    repo = os.environ.get("GITHUB_REPOSITORY", "").strip() or "lgtm-hq/py-lintro"
    owner, sep, name = repo.partition("/")
    if not sep:
        # Same exception type the bare tuple-unpack used to raise, but with a
        # message that names the offending setting.
        raise ValueError(
            f"GITHUB_REPOSITORY must look like 'owner/repo', got {repo!r}",
        )
    return owner, name
def _parse_link_header(link_header: str | None) -> str | None:
    """Extract the ``rel="next"`` URL from a GitHub ``Link`` header.

    Args:
        link_header: The Link header value from a GitHub API response, in the
            form ``<url>; rel="next", <url>; rel="last"``.

    Returns:
        The URL for the next page, or None if no next page is advertised.
    """
    if not link_header:
        return None

    for segment in (raw.strip() for raw in link_header.split(",")):
        if 'rel="next"' not in segment:
            continue
        # The target URL sits between the first "<" and the first ">".
        open_idx = segment.find("<")
        close_idx = segment.find(">")
        if -1 not in (open_idx, close_idx):
            return segment[open_idx + 1 : close_idx]
    return None
def _parse_version_item(item: dict[str, Any]) -> GhcrVersion | None:
    """Parse a single version item from the API response.

    Tags are read from ``item["metadata"]["container"]["tags"]``. Any level
    that is missing, ``null`` or not an object yields an empty tag list.

    Args:
        item: Dictionary from API response.

    Returns:
        GhcrVersion if successfully parsed, None when ``id`` is missing or
        cannot be converted to an integer.
    """
    vid_raw = item.get("id")
    if vid_raw is None:
        logger.error(
            "API response missing 'id' field for item with created_at: {}",
            item.get("created_at", "unknown"),
        )
        return None
    try:
        vid = int(vid_raw)
    except (ValueError, TypeError) as e:
        logger.error(
            "Invalid 'id' value '{}' for item with created_at: {} - {}",
            vid_raw,
            item.get("created_at", "unknown"),
            e,
        )
        return None

    # ``dict.get(key, {})`` only covers a missing key: a JSON ``null`` comes
    # back as None and would break the next ``.get``. Walk the path defensively.
    node: Any = item
    for key in ("metadata", "container"):
        node = node.get(key) if isinstance(node, dict) else None
    raw_tags = node.get("tags") if isinstance(node, dict) else None
    tags = list(raw_tags or [])

    # ``or ""`` keeps a JSON ``null`` from turning into the literal "None".
    created_at = str(item.get("created_at") or "")
    name = str(item.get("name") or "")
    return GhcrVersion(id=vid, tags=tags, created_at=created_at, name=name)
def _get_owner_type(client: GhcrClient, owner: str) -> str:
    """Look up whether ``owner`` is a user or an organization.

    Queries ``https://api.github.com/users/{owner}`` and reads the ``type``
    field of the JSON body.

    Args:
        client: Authenticated HTTP client.
        owner: Repository owner name.

    Returns:
        The ``type`` value as a string (e.g. "Organization" or "User");
        "User" when the field is absent.
    """
    profile_url = f"https://api.github.com/users/{owner}"
    response = client.get(
        profile_url,
        headers={"Accept": "application/vnd.github+json"},
    )
    response.raise_for_status()
    profile: dict[str, Any] = response.json()
    return str(profile.get("type", "User"))
def list_container_versions(
    client: GhcrClient,
    owner: str,
    package_name: str = "py-lintro",
    base_path: str | None = None,
) -> list[GhcrVersion]:
    """Fetch every version of a container package, following pagination.

    Works for both user-owned and organization-owned packages. When
    ``base_path`` is not supplied, the owner type is looked up first to choose
    between the ``orgs`` and ``users`` endpoints.

    Args:
        client: Authenticated HTTP client.
        owner: Repository owner (user/org).
        package_name: Name of the container package.
        base_path: Pre-computed API base path (avoids redundant owner type lookups).

    Returns:
        list[GhcrVersion]: Version entries; items that fail to parse are skipped.
    """
    if base_path is None:
        # Fallback for direct callers that did not resolve the endpoint.
        scope = "orgs" if _get_owner_type(client, owner) == "Organization" else "users"
        base_path = f"https://api.github.com/{scope}/{owner}/packages/container"

    collected: list[GhcrVersion] = []
    next_url: str | None = f"{base_path}/{package_name}/versions?per_page=100"
    while next_url:
        page = client.get(
            next_url,
            headers={"Accept": "application/vnd.github+json"},
        )
        page.raise_for_status()
        entries: list[dict[str, Any]] = page.json()
        collected.extend(
            parsed
            for parsed in map(_parse_version_item, entries)
            if parsed is not None
        )
        # The Link header advertises the following page, if there is one.
        next_url = _parse_link_header(page.headers.get("link"))

    return collected
214def parse_iso_datetime(iso_str: str) -> datetime | None:
215 """Parse ISO 8601 datetime string to timezone-aware datetime object.
217 Args:
218 iso_str: ISO format datetime string (e.g., "2026-01-31T20:05:01Z").
220 Returns:
221 Timezone-aware datetime object in UTC, or None if parsing fails.
222 """
223 if not iso_str:
224 return None
225 try:
226 # Handle Z suffix and +00:00 formats
227 iso_str = iso_str.replace("Z", "+00:00")
228 dt = datetime.fromisoformat(iso_str)
229 # Ensure timezone awareness - attach UTC if naive
230 if dt.tzinfo is None:
231 dt = dt.replace(tzinfo=UTC)
232 return dt
233 except ValueError:
234 logger.warning("Failed to parse datetime: {}", iso_str)
235 return None
def is_older_than_days(created_at: str, min_age_days: int) -> bool:
    """Report whether ``created_at`` lies more than ``min_age_days`` in the past.

    Args:
        created_at: ISO timestamp when version was created.
        min_age_days: Minimum age in days before deletion is allowed.

    Returns:
        True if the timestamp is strictly earlier than now minus
        ``min_age_days``. False otherwise, including when the timestamp cannot
        be parsed, so unknown ages are never treated as deletable.
    """
    created = parse_iso_datetime(created_at)
    return (
        created is not None
        and created < datetime.now(UTC) - timedelta(days=min_age_days)
    )
def delete_version(
    client: GhcrClient,
    owner: str,
    version_id: int,
    package_name: str = "py-lintro",
    base_path: str | None = None,
) -> None:
    """Delete one container version by its id.

    A 204 response (deleted) and a 404 response (already gone) are both
    accepted; for any other status ``raise_for_status`` decides whether to
    raise.

    Args:
        client: Authenticated HTTP client.
        owner: Repository owner (user/org).
        version_id: GHCR version id to delete.
        package_name: Name of the container package.
        base_path: Pre-computed API base path (avoids redundant owner type lookups).
    """
    if base_path is None:
        # Fallback for direct callers that did not resolve the endpoint.
        scope = "orgs" if _get_owner_type(client, owner) == "Organization" else "users"
        base_path = f"https://api.github.com/{scope}/{owner}/packages/container"

    response = client.delete(
        f"{base_path}/{package_name}/versions/{version_id}",
        headers={"Accept": "application/vnd.github+json"},
    )
    if response.status_code in (204, 404):
        return
    response.raise_for_status()
def prune_package(
    client: GhcrClient,
    owner: str,
    package_name: str,
    *,
    dry_run: bool,
    min_age_days: int,
    keep_n: int,
) -> int:
    """Delete (or report) stale untagged versions of one container package.

    The candidates are the untagged versions older than ``min_age_days``. When
    ``keep_n`` is positive, the ``keep_n`` newest candidates are spared.

    Args:
        client: Authenticated HTTP client.
        owner: Repository owner (user/org).
        package_name: Name of the container package.
        dry_run: If True, only log what would be deleted.
        min_age_days: Minimum age in days before deletion is allowed.
        keep_n: Keep at least N most recent untagged versions.

    Returns:
        Number of versions deleted (or would be deleted in dry-run).

    Raises:
        httpx.HTTPStatusError: If API request fails (except 404 which is handled).
    """
    logger.info("Processing package: {}", package_name)

    # Resolve the owner-scoped endpoint once; listing and deleting both reuse it.
    scope = "orgs" if _get_owner_type(client, owner) == "Organization" else "users"
    base_path = f"https://api.github.com/{scope}/{owner}/packages/container"

    try:
        all_versions = list_container_versions(
            client=client,
            owner=owner,
            package_name=package_name,
            base_path=base_path,
        )
    except httpx.HTTPStatusError as exc:
        if exc.response.status_code != 404:
            raise
        logger.warning("Package {} not found, skipping", package_name)
        return 0

    # Only versions without any tag are ever considered.
    untagged = [version for version in all_versions if not version.tags]
    logger.info(
        "Found {} total versions, {} untagged for {}",
        len(all_versions),
        len(untagged),
        package_name,
    )

    # The age guard shields recently created multi-arch manifest dependencies.
    candidates = [
        version
        for version in untagged
        if is_older_than_days(version.created_at, min_age_days)
    ]
    protected_count = len(untagged) - len(candidates)
    if protected_count > 0:
        logger.info(
            "Protected {} untagged versions younger than {} days",
            protected_count,
            min_age_days,
        )

    if keep_n > 0:

        def created_key(version: GhcrVersion) -> datetime:
            # Order by the parsed instant, not the raw string; unparseable
            # timestamps sort as the oldest possible value.
            parsed = parse_iso_datetime(version.created_at)
            return parsed or datetime.min.replace(tzinfo=UTC)

        # Newest first, then spare the leading keep_n entries.
        candidates.sort(key=created_key, reverse=True)
        to_delete = candidates[keep_n:]
        if len(candidates) > len(to_delete):
            logger.info(
                "Keeping {} most recent untagged versions per keep_n setting",
                keep_n,
            )
    else:
        to_delete = candidates

    deleted = 0
    for version in to_delete:
        if not dry_run:
            delete_version(
                client=client,
                owner=owner,
                version_id=version.id,
                package_name=package_name,
                base_path=base_path,
            )
            logger.info(
                "Deleted {} version id={} created_at={}",
                package_name,
                version.id,
                version.created_at,
            )
        else:
            # Abbreviate long digests so the log line stays readable.
            if len(version.name) > 15:
                shown_name = version.name[:12] + "..."
            else:
                shown_name = version.name
            logger.info(
                "[dry-run] Would delete {} version id={} name={} created_at={}",
                package_name,
                version.id,
                shown_name,
                version.created_at,
            )
        deleted += 1

    return deleted
def _env_flag(name: str) -> bool:
    """Return True when env var ``name`` is set to 1/true/yes/on (any case)."""
    value = os.environ.get(name, "0").strip().lower()
    return value in {"1", "true", "yes", "on"}


def _env_non_negative_int(name: str, default: int) -> int:
    """Read env var ``name`` as an int >= 0, warning and using ``default`` otherwise."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        value = int(raw)
    except ValueError:
        logger.warning("Ignoring non-integer {}={!r}; using {}", name, raw, default)
        return default
    if value < 0:
        logger.warning("Ignoring negative {}={}; using {}", name, value, default)
        return default
    return value


def main() -> int:
    """Entry point: prune untagged versions of the configured GHCR packages.

    Environment variables read:
        GITHUB_TOKEN: Required bearer token for the GitHub API.
        GHCR_PRUNE_DRY_RUN: ``1``/``true``/``yes``/``on`` (case-insensitive)
            to only log what would be deleted.
        GHCR_PRUNE_MIN_AGE_DAYS: Minimum age in days before an untagged
            version may be deleted; defaults to ``DEFAULT_MIN_AGE_DAYS``.
        GHCR_PRUNE_KEEP_UNTAGGED_N: Number of most recent untagged versions
            to keep; defaults to 0.

    Invalid or negative integer settings fall back to their defaults with a
    logged warning.

    Returns:
        int: Process exit code (2 when GITHUB_TOKEN is missing, else 0).
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        logger.error("GITHUB_TOKEN is required")
        return 2

    owner, _ = get_repo_owner_repo()
    headers = {
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "py-lintro-ghcr-cleanup",
    }

    # Accept the common truthy spellings: honouring only "1" meant a value
    # such as "true" silently ran a real, destructive prune.
    dry_run = _env_flag("GHCR_PRUNE_DRY_RUN")

    # Minimum age before untagged versions can be deleted (protects multi-arch deps)
    min_age_days = _env_non_negative_int(
        "GHCR_PRUNE_MIN_AGE_DAYS",
        DEFAULT_MIN_AGE_DAYS,
    )

    # Keep at least N most recent untagged versions
    keep_n = _env_non_negative_int("GHCR_PRUNE_KEEP_UNTAGGED_N", 0)

    # Packages to clean up
    packages = ["py-lintro", "lintro-tools"]

    logger.info(
        "GHCR cleanup starting (dry_run={}, min_age_days={}, keep_n={})",
        dry_run,
        min_age_days,
        keep_n,
    )

    total_deleted = 0
    with httpx.Client(headers=headers, timeout=30) as client:
        # Cast httpx.Client to GhcrClient - they are compatible at runtime
        # but mypy can't verify this due to httpx's complex method signatures
        typed_client = cast(GhcrClient, client)
        for package_name in packages:
            total_deleted += prune_package(
                client=typed_client,
                owner=owner,
                package_name=package_name,
                dry_run=dry_run,
                min_age_days=min_age_days,
                keep_n=keep_n,
            )

    action = "Would delete" if dry_run else "Deleted"
    logger.info("{} {} untagged GHCR versions total", action, total_deleted)
    return 0
474if __name__ == "__main__":
475 raise SystemExit(main())