Files
consentos/apps/api/src/cli/seed_known_cookies.py
James Cottrill fbf26453f2 feat: initial public release
ConsentOS — a privacy-first cookie consent management platform.

Self-hosted, source-available alternative to OneTrust, Cookiebot, and
CookieYes. Full standards coverage (IAB TCF v2.2, GPP v1, Google
Consent Mode v2, GPC, Shopify Customer Privacy API), multi-tenant
architecture with role-based access, configuration cascade
(system → org → group → site → region), dark-pattern detection in
the scanner, and a tamper-evident consent record audit trail.

This is the initial public release. Prior development history is
retained internally.

See README.md for the feature list, architecture overview, and
quick-start instructions. Licensed under the Elastic Licence 2.0 —
self-host freely; do not resell as a managed service.
2026-04-14 09:18:18 +00:00

138 lines
4.8 KiB
Python

"""Seed the known_cookies table from the Open Cookie Database CSV.
Usage:
python -m src.cli.seed_known_cookies [--csv PATH] [--clear]
The Open Cookie Database is a community-maintained catalogue of more than
2,200 cookie patterns. See https://github.com/jkwakman/Open-Cookie-Database
"""
from __future__ import annotations
import argparse
import csv
import sys
import uuid
from pathlib import Path
import sqlalchemy as sa
# ---------------------------------------------------------------------------
# Open Cookie Database category label → CMP category slug
# ---------------------------------------------------------------------------
# Keys are the labels as they appear in the CSV "Category" column; values are
# the slugs looked up in the cookie_categories table.
_CATEGORY_MAP: dict[str, str] = {
    "Functional": "functional",
    "Analytics": "analytics",
    "Marketing": "marketing",
    "Personalization": "personalisation",
    # The upstream "Security" label is filed under the CMP's "necessary" slug.
    "Security": "necessary",
}

# Default CSV location: data/open-cookie-database.csv under the third
# ancestor directory of this file.
_DEFAULT_CSV = (
    Path(__file__)
    .resolve()
    .parent.parent.parent.joinpath("data", "open-cookie-database.csv")
)
def _build_sync_url(async_url: str) -> str:
    """Convert an asyncpg DSN to a synchronous DSN for one-off scripts.

    Only a leading ``postgresql+asyncpg://`` scheme is rewritten to
    ``postgresql://`` (which SQLAlchemy resolves to its default PostgreSQL
    driver, psycopg2). The rest of the URL is left untouched, so the same
    text appearing later in the string (for example inside a password or a
    query parameter) is not altered. URLs that do not start with the asyncpg
    scheme are returned unchanged.
    """
    async_scheme = "postgresql+asyncpg://"
    if not async_url.startswith(async_scheme):
        return async_url
    return "postgresql://" + async_url[len(async_scheme):]
def _field(row: dict, key: str, default: str = "") -> str:
    """Return the stripped value of *key* in a CSV row, or *default*.

    ``csv.DictReader`` fills columns that are missing from a short row with
    ``None``, so ``row.get(key, "")`` can still return ``None``; this helper
    treats both an absent key and a ``None`` value as "use the default".
    """
    value = row.get(key)
    return default if value is None else value.strip()


def seed(csv_path: Path, *, clear: bool = False) -> int:
    """Read the CSV and upsert its rows into ``known_cookies``.

    A row is skipped when its category is not in ``_CATEGORY_MAP``, when the
    mapped slug has no row in ``cookie_categories``, or when the cookie name
    is empty. The optional delete and all upserts run inside a single
    ``engine.begin()`` transaction.

    Args:
        csv_path: Path to the Open Cookie Database CSV file.
        clear: When true, delete every existing ``known_cookies`` row before
            importing.

    Returns:
        The number of CSV rows written, counting both fresh inserts and
        rows updated through ``ON CONFLICT``.
    """
    # Imported lazily, as in the original, so the settings module is only
    # loaded when a seed actually runs.
    from src.config.settings import get_settings

    settings = get_settings()
    engine = sa.create_engine(_build_sync_url(settings.database_url))

    # Built once: the statement text is identical for every row.
    upsert_stmt = sa.text(
        """
        INSERT INTO known_cookies
            (id, name_pattern, domain_pattern, category_id,
             vendor, description, is_regex, created_at, updated_at)
        VALUES
            (:id, :name_pattern, :domain_pattern, :category_id,
             :vendor, :description, :is_regex, NOW(), NOW())
        ON CONFLICT (name_pattern, domain_pattern) DO UPDATE SET
            category_id = EXCLUDED.category_id,
            vendor = EXCLUDED.vendor,
            description = EXCLUDED.description,
            is_regex = EXCLUDED.is_regex,
            updated_at = NOW()
        """
    )

    upserted = 0
    try:
        with engine.begin() as conn:
            # Build slug → category_id lookup
            rows = conn.execute(sa.text("SELECT id, slug FROM cookie_categories"))
            slug_to_id: dict[str, str] = {r[1]: str(r[0]) for r in rows}

            if clear:
                conn.execute(sa.text("DELETE FROM known_cookies"))

            with csv_path.open(newline="", encoding="utf-8") as fh:
                for row in csv.DictReader(fh):
                    slug = _CATEGORY_MAP.get(_field(row, "Category"))
                    if not slug or slug not in slug_to_id:
                        continue

                    name = _field(row, "Cookie / Data Key name")
                    if not name:
                        continue

                    # An empty domain is stored as the catch-all "*".
                    domain = _field(row, "Domain") or "*"
                    wildcard = _field(row, "Wildcard match", "0") == "1"

                    conn.execute(
                        upsert_stmt,
                        {
                            "id": str(uuid.uuid4()),
                            # Wildcard rows get a trailing * for glob matching.
                            "name_pattern": f"{name}*" if wildcard else name,
                            "domain_pattern": domain,
                            "category_id": slug_to_id[slug],
                            "vendor": _field(row, "Platform") or None,
                            "description": _field(row, "Description") or None,
                            # Patterns from this CSV are always written as non-regex.
                            "is_regex": False,
                        },
                    )
                    upserted += 1
    finally:
        # Release pooled connections even if the import fails part-way.
        engine.dispose()

    return upserted
def main() -> None:
    """Parse command-line options and seed the known_cookies table.

    Options:
        --csv PATH  CSV file to import (defaults to ``_DEFAULT_CSV``).
        --clear     Delete all existing known_cookies rows before importing.

    Prints an error to stderr and exits with status 1 when the CSV path is
    not an existing regular file; otherwise prints how many patterns were
    seeded.
    """
    parser = argparse.ArgumentParser(description="Seed known cookies from Open Cookie Database")
    parser.add_argument(
        "--csv",
        type=Path,
        default=_DEFAULT_CSV,
        help="Path to the Open Cookie Database CSV (default: bundled copy)",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Delete all existing known_cookies before importing",
    )
    args = parser.parse_args()

    # is_file() rather than exists(): a directory "exists" but would only
    # fail later with a traceback when seed() tries to open it.
    if not args.csv.is_file():
        print(f"Error: CSV not found at {args.csv}", file=sys.stderr)
        sys.exit(1)

    count = seed(args.csv, clear=args.clear)
    print(f"Seeded {count} known cookie patterns from {args.csv.name}")


if __name__ == "__main__":
    main()