Emdash source with visual editor image upload fix

Fixes: 1. media.ts: wrap placeholder generation in try-catch 2. toolbar.ts: check r.ok, display error message in popover
2026-05-03 10:44:54 +07:00
parent 78f81bebb6
commit 2d1be52177
2352 changed files with 662964 additions and 0 deletions
--- a/packages/gutenberg-to-portable-text/CHANGELOG.md
+++ b/packages/gutenberg-to-portable-text/CHANGELOG.md
@@ -0,0 +1,23 @@
+# @emdash-cms/gutenberg-to-portable-text
+
+## 0.9.0
+
+## 0.8.0
+
+## 0.7.0
+
+## 0.6.0
+
+## 0.5.0
+
+## 0.4.0
+
+## 0.3.0
+
+## 0.2.0
+
+## 0.1.0
+
+### Minor Changes
+
+- [#14](https://github.com/emdash-cms/emdash/pull/14) [`755b501`](https://github.com/emdash-cms/emdash/commit/755b5017906811f97f78f4c0b5a0b62e67b52ec4) Thanks [@ascorbic](https://github.com/ascorbic)! - First beta release
--- a/packages/gutenberg-to-portable-text/package.json
+++ b/packages/gutenberg-to-portable-text/package.json
@@ -0,0 +1,53 @@
+{
+  "name": "@emdash-cms/gutenberg-to-portable-text",
+  "version": "0.9.0",
+  "description": "Convert WordPress Gutenberg blocks to Portable Text",
+  "type": "module",
+  "main": "dist/index.mjs",
+  "types": "dist/index.d.mts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.mts",
+      "default": "./dist/index.mjs"
+    }
+  },
+  "files": [
+    "dist"
+  ],
+  "scripts": {
+    "build": "tsdown src/index.ts --format esm --dts --clean",
+    "dev": "tsdown src/index.ts --format esm --dts --watch",
+    "test": "vitest",
+    "prepublishOnly": "node --run build",
+    "check": "publint && attw --pack --ignore-rules=cjs-resolves-to-esm",
+    "typecheck": "tsgo --noEmit"
+  },
+  "dependencies": {
+    "@wordpress/block-serialization-default-parser": "^5.13.0",
+    "parse5": "^7.2.1"
+  },
+  "devDependencies": {
+    "@arethetypeswrong/cli": "catalog:",
+    "publint": "catalog:",
+    "tsdown": "catalog:",
+    "typescript": "catalog:",
+    "vitest": "catalog:"
+  },
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/emdash-cms/emdash.git",
+    "directory": "packages/gutenberg-to-portable-text"
+  },
+  "homepage": "https://github.com/emdash-cms/emdash",
+  "keywords": [
+    "wordpress",
+    "gutenberg",
+    "portable-text",
+    "migration",
+    "blocks"
+  ],
+  "author": "Matt Kane",
+  "license": "MIT",
+  "peerDependencies": {},
+  "optionalDependencies": {}
+}
--- a/packages/gutenberg-to-portable-text/src/index.ts
+++ b/packages/gutenberg-to-portable-text/src/index.ts
@@ -0,0 +1,467 @@
+/**
+ * Gutenberg to Portable Text Converter
+ *
+ * Converts WordPress Gutenberg block content to Portable Text format.
+ * Uses @wordpress/block-serialization-default-parser to parse the hybrid
+ * HTML+JSON format that WordPress uses.
+ */
+
+import { parse } from "@wordpress/block-serialization-default-parser";
+
+import { parseInlineContent } from "./inline.js";
+import { getTransformer } from "./transformers/index.js";
+import type {
+	GutenbergBlock,
+	PortableTextBlock,
+	ConvertOptions,
+	TransformContext,
+} from "./types.js";
+
+// Regex patterns for HTML parsing and conversion
+//
+// BLOCK_ELEMENT_PATTERN alternatives, in order:
+//   1. a paired block element (group 1 = tag name, group 2 = inner HTML)
+//   2. a void <hr> / <br> (group 3 = tag name)
+//   3. a bare <img ...>
+// The `(?=[\s>])` lookahead pins the end of the tag name. Without it `<p` also
+// matched the start of `<pre>` / `<picture>` and paired with a later `</p>`,
+// swallowing the code block into a paragraph.
+const BLOCK_ELEMENT_PATTERN =
+	/<(p|h[1-6]|blockquote|pre|ul|ol|figure|div|hr)(?=[\s>])[^>]*>([\s\S]*?)<\/\1>|<(hr|br)\s*\/?>|<img\s+[^>]+\/?>/gu;
+// Group 1 = link href, group 2 = raw attribute text of the wrapped <img>
+const LINKED_IMAGE_PATTERN = /<a\s+[^>]*href=["']([^"']+)["'][^>]*>\s*<img\s+([^>]+)\/?>\s*<\/a>/gu;
+const STANDALONE_IMAGE_PATTERN = /<img\s+[^>]+\/?>/gu;
+const IMG_TAG_PATTERN = /<img[^>]+>/i;
+const SRC_ATTR_PATTERN = /src=["']([^"']+)["']/i;
+const ALT_ATTR_PATTERN = /alt=["']([^"']*)["']/i;
+const LIST_ITEM_PATTERN = /<li[^>]*>([\s\S]*?)<\/li>/gu;
+const CODE_TAG_PATTERN = /<code[^>]*>([\s\S]*?)<\/code>/i;
+const HTML_TAG_PATTERN = /<[^>]+>/g;
+const FIGCAPTION_TAG_PATTERN = /<figcaption[^>]*>([\s\S]*?)<\/figcaption>/i;
+// Entity patterns used by decodeHtmlEntities / decodeUrlEntities
+const AMP_ENTITY_PATTERN = /&amp;/g;
+const LESS_THAN_ENTITY_PATTERN = /&lt;/g;
+const GREATER_THAN_ENTITY_PATTERN = /&gt;/g;
+const QUOTE_ENTITY_PATTERN = /&quot;/g;
+const APOS_ENTITY_PATTERN = /&#039;/g;
+const NUMERIC_AMP_ENTITY_PATTERN = /&#0?38;/g;
+const HEX_AMP_ENTITY_PATTERN = /&#x26;/gi;
+const NBSP_ENTITY_PATTERN = /&nbsp;/g;
+
+// Re-export types
+export type {
+	GutenbergBlock,
+	PortableTextBlock,
+	PortableTextTextBlock,
+	PortableTextImageBlock,
+	PortableTextCodeBlock,
+	PortableTextEmbedBlock,
+	PortableTextGalleryBlock,
+	PortableTextColumnsBlock,
+	PortableTextBreakBlock,
+	PortableTextHtmlBlock,
+	PortableTextButtonBlock,
+	PortableTextButtonsBlock,
+	PortableTextCoverBlock,
+	PortableTextFileBlock,
+	PortableTextPullquoteBlock,
+	PortableTextSpan,
+	PortableTextMarkDef,
+	ConvertOptions,
+	BlockTransformer,
+	TransformContext,
+} from "./types.js";
+
+// Re-export transformers for customization
+export { defaultTransformers, fallbackTransformer } from "./transformers/index.js";
+export * as coreTransformers from "./transformers/core.js";
+export * as embedTransformers from "./transformers/embed.js";
+
+// Re-export inline utilities
+export {
+	parseInlineContent,
+	extractText,
+	extractAlt,
+	extractCaption,
+	extractSrc,
+} from "./inline.js";
+
+/**
+ * Build the default key generator.
+ *
+ * Each returned generator owns its own sequence number, so keys look like
+ * `key-<n>-<random base36 suffix>` with `n` starting at 1.
+ */
+function createKeyGenerator(): () => string {
+	let sequence = 0;
+	return function nextKey(): string {
+		sequence += 1;
+		const suffix = Math.random().toString(36).substring(2, 7);
+		return `key-${sequence}-${suffix}`;
+	};
+}
+
+/**
+ * Convert the WP parser's output into our GutenbergBlock shape.
+ *
+ * The parser types `attrs` as `Record<string, any> | null`; a null value is
+ * replaced with an empty object, and nested `innerBlocks` are normalized the
+ * same way, recursively.
+ */
+function normalizeBlocks(blocks: ReturnType<typeof parse>): GutenbergBlock[] {
+	const normalized: GutenbergBlock[] = [];
+	for (const parsed of blocks) {
+		normalized.push({
+			blockName: parsed.blockName,
+			attrs: (parsed.attrs ?? {}) satisfies Record<string, unknown>,
+			innerHTML: parsed.innerHTML,
+			innerBlocks: normalizeBlocks(parsed.innerBlocks),
+			innerContent: parsed.innerContent,
+		});
+	}
+	return normalized;
+}
+
+/**
+ * Convert WordPress Gutenberg content to Portable Text
+ *
+ * Content without any `<!-- wp:` block comment is treated as classic-editor
+ * HTML and handed to `htmlToPortableText`; empty or whitespace-only content
+ * yields an empty array.
+ *
+ * @param content - WordPress post content (HTML with Gutenberg block comments)
+ * @param options - Conversion options
+ * @returns Array of Portable Text blocks
+ *
+ * @example
+ * ```ts
+ * const portableText = gutenbergToPortableText(`
+ *   <!-- wp:paragraph -->
+ *   <p>Hello <strong>world</strong>!</p>
+ *   <!-- /wp:paragraph -->
+ * `);
+ * // → [{ _type: "block", style: "normal", children: [...] }]
+ * ```
+ */
+export function gutenbergToPortableText(
+	content: string,
+	options: ConvertOptions = {},
+): PortableTextBlock[] {
+	// Nothing to convert
+	if (!content || !content.trim()) return [];
+
+	// No block comments: classic editor content, treat as plain HTML
+	if (!content.includes("<!-- wp:")) return htmlToPortableText(content, options);
+
+	const parsedBlocks = normalizeBlocks(parse(content));
+
+	// One key generator and one context are shared by the whole conversion
+	const generateKey = options.keyGenerator || createKeyGenerator();
+	const context = createTransformContext(options, generateKey);
+
+	return parsedBlocks.flatMap((parsedBlock) => transformBlock(parsedBlock, options, context));
+}
+
+/**
+ * Convert plain HTML (classic editor) to Portable Text
+ *
+ * Scans `html` for block-level elements with BLOCK_ELEMENT_PATTERN and emits
+ * one or more Portable Text blocks per match. Text found between matches (and
+ * after the last one) becomes a "normal" text block when it contains
+ * non-whitespace text.
+ *
+ * Notes on output order: images found inside a `<p>` / `<div>` are emitted
+ * before that element's remaining text, linked images first.
+ *
+ * @param html - Classic-editor HTML
+ * @param options - Conversion options; only `keyGenerator` is read here
+ * @returns Array of Portable Text blocks in document order
+ */
+export function htmlToPortableText(
+	html: string,
+	options: ConvertOptions = {},
+): PortableTextBlock[] {
+	const generateKey = options.keyGenerator || createKeyGenerator();
+	const blocks: PortableTextBlock[] = [];
+
+	// The patterns scanned with `exec` below are shared module-level /g regexes,
+	// so `exec` resumes from their `lastIndex`. A previous call that threw midway
+	// (e.g. from a custom keyGenerator) would leave a stale position behind and
+	// make this call skip content, so every scan starts with an explicit reset.
+	BLOCK_ELEMENT_PATTERN.lastIndex = 0;
+
+	// Split on block-level elements (including standalone img tags)
+	let lastIndex = 0;
+	let match;
+
+	while ((match = BLOCK_ELEMENT_PATTERN.exec(html)) !== null) {
+		const fullMatch = match[0];
+		const tag = (match[1] || match[3] || "").toLowerCase();
+		const content = match[2] || "";
+
+		// Handle text between matches
+		const between = html.slice(lastIndex, match.index).trim();
+		if (between) {
+			const { children, markDefs } = parseInlineContent(between, generateKey);
+			if (children.some((c) => c.text.trim())) {
+				blocks.push({
+					_type: "block",
+					_key: generateKey(),
+					style: "normal",
+					children,
+					markDefs: markDefs.length > 0 ? markDefs : undefined,
+				});
+			}
+		}
+		lastIndex = match.index + match[0].length;
+
+		// Check for standalone <img> tag (not wrapped in figure/p)
+		if (fullMatch.toLowerCase().startsWith("<img")) {
+			const srcMatch = fullMatch.match(SRC_ATTR_PATTERN);
+			const altMatch = fullMatch.match(ALT_ATTR_PATTERN);
+			if (srcMatch?.[1]) {
+				const imgUrl = decodeUrlEntities(srcMatch[1]);
+				blocks.push({
+					_type: "image",
+					_key: generateKey(),
+					asset: {
+						_type: "reference",
+						_ref: imgUrl,
+						url: imgUrl,
+					},
+					alt: altMatch?.[1],
+				});
+			}
+			continue;
+		}
+
+		// Transform based on tag
+		switch (tag) {
+			case "p":
+			case "div": {
+				// Extract any images first (including those wrapped in <a> tags)
+				// Match: <a...><img...></a> or standalone <img...>
+				// Track positions of linked images so we don't double-process
+				const linkedImgPositions: Array<{ start: number; end: number }> = [];
+
+				// First extract linked images
+				LINKED_IMAGE_PATTERN.lastIndex = 0;
+				let linkedMatch;
+				while ((linkedMatch = LINKED_IMAGE_PATTERN.exec(content)) !== null) {
+					const linkUrl = decodeUrlEntities(linkedMatch[1]!);
+					const imgAttrs = linkedMatch[2]!;
+					const srcMatch = imgAttrs.match(SRC_ATTR_PATTERN);
+					const altMatch = imgAttrs.match(ALT_ATTR_PATTERN);
+					if (srcMatch?.[1]) {
+						const imgUrl = decodeUrlEntities(srcMatch[1]);
+						blocks.push({
+							_type: "image",
+							_key: generateKey(),
+							asset: {
+								_type: "reference",
+								_ref: imgUrl,
+								url: imgUrl,
+							},
+							alt: altMatch?.[1],
+							link: linkUrl,
+						});
+					}
+					linkedImgPositions.push({
+						start: linkedMatch.index,
+						end: linkedMatch.index + linkedMatch[0].length,
+					});
+				}
+
+				// Then extract standalone images (not inside <a> tags)
+				STANDALONE_IMAGE_PATTERN.lastIndex = 0;
+				let imgMatch;
+				while ((imgMatch = STANDALONE_IMAGE_PATTERN.exec(content)) !== null) {
+					// Skip if this image is inside a linked image we already processed
+					const isLinked = linkedImgPositions.some(
+						(pos) => imgMatch!.index >= pos.start && imgMatch!.index < pos.end,
+					);
+					if (isLinked) continue;
+
+					const srcMatch = imgMatch[0].match(SRC_ATTR_PATTERN);
+					const altMatch = imgMatch[0].match(ALT_ATTR_PATTERN);
+					if (srcMatch?.[1]) {
+						const imgUrl = decodeUrlEntities(srcMatch[1]);
+						blocks.push({
+							_type: "image",
+							_key: generateKey(),
+							asset: {
+								_type: "reference",
+								_ref: imgUrl,
+								url: imgUrl,
+							},
+							alt: altMatch?.[1],
+						});
+					}
+				}
+
+				// Then handle the text content (with images and image links stripped)
+				const textContent = content
+					.replace(LINKED_IMAGE_PATTERN, "") // Remove linked images
+					.replace(STANDALONE_IMAGE_PATTERN, "") // Remove standalone images
+					.trim();
+				if (textContent) {
+					const { children, markDefs } = parseInlineContent(textContent, generateKey);
+					if (children.some((c) => c.text.trim())) {
+						blocks.push({
+							_type: "block",
+							_key: generateKey(),
+							style: "normal",
+							children,
+							markDefs: markDefs.length > 0 ? markDefs : undefined,
+						});
+					}
+				}
+				break;
+			}
+
+			case "h1":
+			case "h2":
+			case "h3":
+			case "h4":
+			case "h5":
+			case "h6": {
+				const { children, markDefs } = parseInlineContent(content, generateKey);
+				blocks.push({
+					_type: "block",
+					_key: generateKey(),
+					style: tag,
+					children,
+					markDefs: markDefs.length > 0 ? markDefs : undefined,
+				});
+				break;
+			}
+
+			case "blockquote": {
+				const { children, markDefs } = parseInlineContent(content, generateKey);
+				blocks.push({
+					_type: "block",
+					_key: generateKey(),
+					style: "blockquote",
+					children,
+					markDefs: markDefs.length > 0 ? markDefs : undefined,
+				});
+				break;
+			}
+
+			case "pre": {
+				// Extract code content (prefer the inner <code> element when present)
+				const codeMatch = content.match(CODE_TAG_PATTERN);
+				const code = codeMatch?.[1] || content;
+				blocks.push({
+					_type: "code",
+					_key: generateKey(),
+					code: decodeHtmlEntities(code),
+				});
+				break;
+			}
+
+			case "ul":
+			case "ol": {
+				// Every <li> becomes its own level-1 list block
+				const listItem = tag === "ol" ? "number" : "bullet";
+				LIST_ITEM_PATTERN.lastIndex = 0;
+				let liMatch;
+				while ((liMatch = LIST_ITEM_PATTERN.exec(content)) !== null) {
+					const liContent = liMatch[1] || "";
+					const { children, markDefs } = parseInlineContent(liContent, generateKey);
+					blocks.push({
+						_type: "block",
+						_key: generateKey(),
+						style: "normal",
+						listItem,
+						level: 1,
+						children,
+						markDefs: markDefs.length > 0 ? markDefs : undefined,
+					});
+				}
+				break;
+			}
+
+			case "hr": {
+				blocks.push({
+					_type: "break",
+					_key: generateKey(),
+					style: "lineBreak",
+				});
+				break;
+			}
+
+			case "figure": {
+				// Check for image; figures without an <img> produce no block
+				const imgMatch = content.match(IMG_TAG_PATTERN);
+				if (imgMatch) {
+					const srcMatch = imgMatch[0].match(SRC_ATTR_PATTERN);
+					const altMatch = imgMatch[0].match(ALT_ATTR_PATTERN);
+					const captionMatch = content.match(FIGCAPTION_TAG_PATTERN);
+					const imgUrl = srcMatch?.[1] ? decodeUrlEntities(srcMatch[1]) : "";
+
+					blocks.push({
+						_type: "image",
+						_key: generateKey(),
+						asset: {
+							_type: "reference",
+							_ref: imgUrl,
+							url: imgUrl || undefined,
+						},
+						alt: altMatch?.[1],
+						caption: captionMatch?.[1]?.replace(HTML_TAG_PATTERN, "").trim(),
+					});
+				}
+				break;
+			}
+		}
+	}
+
+	// Handle remaining text
+	const remaining = html.slice(lastIndex).trim();
+	if (remaining) {
+		const { children, markDefs } = parseInlineContent(remaining, generateKey);
+		if (children.some((c) => c.text.trim())) {
+			blocks.push({
+				_type: "block",
+				_key: generateKey(),
+				style: "normal",
+				children,
+				markDefs: markDefs.length > 0 ? markDefs : undefined,
+			});
+		}
+	}
+
+	return blocks;
+}
+
+/**
+ * Build the context object handed to every block transformer.
+ *
+ * The context refers back to itself through `transformBlocks`, which lets
+ * container transformers convert their inner blocks recursively.
+ */
+function createTransformContext(
+	options: ConvertOptions,
+	generateKey: () => string,
+): TransformContext {
+	const parseInline = (html: string) => parseInlineContent(html, generateKey);
+
+	// `context` is only read when this runs, i.e. after it has been assigned below
+	const transformAll = (blocks: GutenbergBlock[]) =>
+		blocks.flatMap((block) => transformBlock(block, options, context));
+
+	const context: TransformContext = {
+		generateKey,
+		parseInlineContent: parseInline,
+		transformBlocks: transformAll,
+	};
+	return context;
+}
+
+/**
+ * Transform a single block with whichever transformer is registered for its
+ * name (custom, default, or the fallback — see `getTransformer`).
+ */
+function transformBlock(
+	block: GutenbergBlock,
+	options: ConvertOptions,
+	context: TransformContext,
+): PortableTextBlock[] {
+	return getTransformer(block.blockName, options.customTransformers)(block, options, context);
+}
+
+/**
+ * Decode the HTML entities this converter knows about:
+ * `&lt;`, `&gt;`, `&amp;`, `&quot;`, `&#039;`, `&#038;` / `&#38;`, `&#x26;`
+ * and `&nbsp;` (decoded to a plain space). Any other entity is left untouched.
+ *
+ * Decoding happens in a single pass over the input. Chaining one `replace`
+ * per entity decoded twice: `&amp;quot;` first became `&quot;` and was then
+ * turned into `"`, corrupting code samples that show entities literally.
+ *
+ * @param html - Text containing HTML entities
+ * @returns The text with the supported entities decoded exactly once
+ */
+function decodeHtmlEntities(html: string): string {
+	const decoders: ReadonlyArray<readonly [RegExp, string]> = [
+		[LESS_THAN_ENTITY_PATTERN, "<"],
+		[GREATER_THAN_ENTITY_PATTERN, ">"],
+		[AMP_ENTITY_PATTERN, "&"],
+		[QUOTE_ENTITY_PATTERN, '"'],
+		[APOS_ENTITY_PATTERN, "'"],
+		[NUMERIC_AMP_ENTITY_PATTERN, "&"], // &#038; or &#38;
+		[HEX_AMP_ENTITY_PATTERN, "&"], // &#x26;
+		[NBSP_ENTITY_PATTERN, " "],
+	];
+
+	// Visit every entity-shaped token once; the output is never rescanned
+	return html.replace(/&(?:[a-z]+|#x?[0-9a-f]+);/gi, (entity) => {
+		for (const [pattern, decoded] of decoders) {
+			// The token is a single entity, so it equals `decoded` after the
+			// replacement only when the pattern matched the entire token.
+			if (entity.replace(pattern, decoded) === decoded) {
+				return decoded;
+			}
+		}
+		return entity;
+	});
+}
+
+/**
+ * Decode the ampersand entities (`&amp;`, `&#038;` / `&#38;`, `&#x26;`) that
+ * appear in URLs; used for image src and link href attributes.
+ */
+function decodeUrlEntities(url: string): string {
+	let decoded = url;
+	for (const ampersandEntity of [
+		AMP_ENTITY_PATTERN,
+		NUMERIC_AMP_ENTITY_PATTERN,
+		HEX_AMP_ENTITY_PATTERN,
+	]) {
+		decoded = decoded.replace(ampersandEntity, "&");
+	}
+	return decoded;
+}
+
+/**
+ * Parse Gutenberg blocks without converting to Portable Text.
+ * Useful for inspection and debugging; empty or whitespace-only content
+ * returns an empty array.
+ */
+export function parseGutenbergBlocks(content: string): GutenbergBlock[] {
+	const hasContent = Boolean(content) && content.trim().length > 0;
+	return hasContent ? normalizeBlocks(parse(content)) : [];
+}
--- a/packages/gutenberg-to-portable-text/src/inline.ts
+++ b/packages/gutenberg-to-portable-text/src/inline.ts
@@ -0,0 +1,333 @@
+/**
+ * Inline HTML to Portable Text spans converter
+ *
+ * Parses inline HTML elements (strong, em, a, code, etc.) and converts
+ * them to Portable Text spans with marks.
+ */
+
+import { parseFragment, type DefaultTreeAdapterMap } from "parse5";
+
+import type { PortableTextSpan, PortableTextMarkDef } from "./types.js";
+import { sanitizeHref } from "./url.js";
+
+// Regex patterns for inline parsing
+// NOTE: despite the name this matches a NON-whitespace character; callers
+// negate the test to detect whitespace-only input.
+const WHITESPACE_PATTERN = /\S/;
+
+// Pre-compiled block tag patterns (open anchors at the start, close at the end)
+const BLOCK_TAG_PATTERNS: Record<string, { open: RegExp; close: RegExp }> = {
+	// The lookahead stops `<p` from matching `<pre ...>` / `<picture ...>`
+	p: { open: /^<p(?=[\s>])[^>]*>/i, close: /<\/p>$/i },
+	h1: { open: /^<h1[^>]*>/i, close: /<\/h1>$/i },
+	h2: { open: /^<h2[^>]*>/i, close: /<\/h2>$/i },
+	h3: { open: /^<h3[^>]*>/i, close: /<\/h3>$/i },
+	h4: { open: /^<h4[^>]*>/i, close: /<\/h4>$/i },
+	h5: { open: /^<h5[^>]*>/i, close: /<\/h5>$/i },
+	h6: { open: /^<h6[^>]*>/i, close: /<\/h6>$/i },
+	li: { open: /^<li[^>]*>/i, close: /<\/li>$/i },
+	blockquote: { open: /^<blockquote[^>]*>/i, close: /<\/blockquote>$/i },
+	figcaption: { open: /^<figcaption[^>]*>/i, close: /<\/figcaption>$/i },
+};
+
+// Regex patterns for extracting attributes (group 1 = attribute value)
+const IMG_ALT_PATTERN = /<img[^>]+alt=["']([^"']*)["']/i;
+const FIGCAPTION_PATTERN = /<figcaption[^>]*>([\s\S]*?)<\/figcaption>/i;
+const IMG_SRC_PATTERN = /<img[^>]+src=["']([^"']*)["']/i;
+const URL_AMP_ENTITY_PATTERN = /&amp;/g;
+const URL_NUMERIC_AMP_ENTITY_PATTERN = /&#0?38;/g;
+const URL_HEX_AMP_ENTITY_PATTERN = /&#x26;/gi;
+
+type Node = DefaultTreeAdapterMap["node"];
+type TextNode = DefaultTreeAdapterMap["textNode"];
+type Element = DefaultTreeAdapterMap["element"];
+
+interface ParseResult {
+	children: PortableTextSpan[]; // spans in document order; parseInlineContent always returns at least one
+	markDefs: PortableTextMarkDef[]; // link definitions, referenced by `_key` from span marks
+}
+
+/**
+ * Parse inline HTML content into Portable Text spans
+ *
+ * @param html - Inline HTML, optionally wrapped in a single block tag
+ * @param generateKey - Produces the `_key` for every span and mark definition
+ * @returns The spans (always at least one) plus any link mark definitions
+ */
+export function parseInlineContent(html: string, generateKey: () => string): ParseResult {
+	// Whitespace-only input is returned verbatim BEFORE any stripping, because
+	// parse5 would normalize the whitespace away
+	const isWhitespaceOnly = html.length > 0 && !WHITESPACE_PATTERN.test(html);
+	if (isWhitespaceOnly) {
+		const onlySpan: PortableTextSpan = { _type: "span", _key: generateKey(), text: html };
+		return { children: [onlySpan], markDefs: [] };
+	}
+
+	const spans: PortableTextSpan[] = [];
+	const definitions: PortableTextMarkDef[] = [];
+
+	// Drop a wrapping <p>, <h1>, ... and parse what is left as a fragment
+	const { childNodes } = parseFragment(stripBlockTags(html));
+
+	// Walk the tree, collecting spans and link definitions
+	walkNodes(childNodes, [], spans, definitions, new Map<string, string>(), generateKey);
+
+	// Callers rely on there being at least one span
+	if (spans.length === 0) {
+		spans.push({ _type: "span", _key: generateKey(), text: "" });
+	}
+
+	return { children: spans, markDefs: definitions };
+}
+
+/**
+ * Remove one wrapping block-level tag pair, if the trimmed input both starts
+ * with its opening tag and ends with its closing tag. Only the first wrapper
+ * that matches (in the order listed below) is removed.
+ */
+function stripBlockTags(html: string): string {
+	const trimmed = html.trim();
+
+	const wrapperTags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote", "figcaption"];
+
+	const wrapper = wrapperTags
+		.map((tag) => BLOCK_TAG_PATTERNS[tag])
+		.find(
+			(candidate) =>
+				candidate !== undefined && candidate.open.test(trimmed) && candidate.close.test(trimmed),
+		);
+
+	if (!wrapper) {
+		return trimmed;
+	}
+	return trimmed.replace(wrapper.open, "").replace(wrapper.close, "").trim();
+}
+
+/**
+ * Recursively walk DOM nodes and build spans
+ *
+ * Text nodes become spans carrying a copy of `currentMarks`. Newlines inside
+ * text and `<br>` elements are not emitted as separate spans: a "\n" is
+ * appended to the most recent span in `children`, and a standalone "\n" span
+ * is created only when no span exists yet. Every other element contributes
+ * the mark returned by `getMarkForElement` (if any) to its descendants.
+ *
+ * @param nodes - Sibling nodes to process, in order
+ * @param currentMarks - Marks inherited from ancestor elements
+ * @param children - Output: spans are appended (and the last one mutated) here
+ * @param markDefs - Output: link definitions created while walking
+ * @param markDefMap - Cache used by `getMarkForElement` to reuse link keys
+ * @param generateKey - Produces the `_key` for new spans and mark definitions
+ */
+function walkNodes(
+	nodes: Node[],
+	currentMarks: string[],
+	children: PortableTextSpan[],
+	markDefs: PortableTextMarkDef[],
+	markDefMap: Map<string, string>,
+	generateKey: () => string,
+): void {
+	for (const node of nodes) {
+		if (isTextNode(node)) {
+			const text = node.value;
+			if (text) {
+				// Split on newlines so each line break can be attached to a span
+				const parts = text.split("\n");
+				for (let i = 0; i < parts.length; i++) {
+					const part = parts[i];
+					if (part || i > 0) { // an empty first part (text starting with "\n") is skipped entirely
+						// Add text span for non-empty parts
+						if (part) {
+							children.push({
+								_type: "span",
+								_key: generateKey(),
+								text: part,
+								marks: currentMarks.length > 0 ? [...currentMarks] : undefined,
+							});
+						}
+						// Add newline (except after last part)
+						if (i < parts.length - 1) {
+							// Append newline to previous span or create new one
+							if (children.length > 0) {
+								const lastChild = children.at(-1);
+								if (lastChild) {
+									lastChild.text += "\n";
+								}
+							} else {
+								children.push({
+									_type: "span",
+									_key: generateKey(),
+									text: "\n",
+								});
+							}
+						}
+					}
+				}
+			}
+		} else if (isElement(node)) {
+			const tagName = node.tagName.toLowerCase();
+
+			// Handle <br> as newline, using the same append-or-create rule as text newlines
+			if (tagName === "br") {
+				if (children.length > 0) {
+					const lastChild = children.at(-1);
+					if (lastChild) {
+						lastChild.text += "\n";
+					}
+				} else {
+					children.push({
+						_type: "span",
+						_key: generateKey(),
+						text: "\n",
+					});
+				}
+				continue;
+			}
+
+			// Get mark for this element (null for tags that carry no mark)
+			const markResult = getMarkForElement(node, markDefs, markDefMap, generateKey);
+			const newMarks = markResult ? [...currentMarks, markResult] : currentMarks;
+
+			// Recurse into children with the accumulated marks
+			walkNodes(node.childNodes, newMarks, children, markDefs, markDefMap, generateKey);
+		}
+	}
+}
+
+/**
+ * Get the Portable Text mark for an HTML element
+ *
+ * Decorator tags map to a fixed mark name. An `<a>` element maps to the
+ * `_key` of a link mark definition, which is created on first use and pushed
+ * onto `markDefs`.
+ *
+ * Link definitions are reused for links with the same sanitized href AND the
+ * same "open in new tab" setting. Keying the cache on href alone made a
+ * same-href link silently inherit the `blank` flag of whichever link was seen
+ * first.
+ *
+ * @param element - Element to inspect
+ * @param markDefs - Output: receives newly created link definitions
+ * @param markDefMap - Cache from link identity to mark definition key
+ * @param generateKey - Produces the `_key` for new mark definitions
+ * @returns The mark name or link key, or null when the tag carries no mark
+ */
+function getMarkForElement(
+	element: Element,
+	markDefs: PortableTextMarkDef[],
+	markDefMap: Map<string, string>,
+	generateKey: () => string,
+): string | null {
+	const tagName = element.tagName.toLowerCase();
+
+	switch (tagName) {
+		case "strong":
+		case "b":
+			return "strong";
+
+		case "em":
+		case "i":
+			return "em";
+
+		case "u":
+			return "underline";
+
+		case "s":
+		case "strike":
+		case "del":
+			return "strike-through";
+
+		case "code":
+			return "code";
+
+		case "sup":
+			return "superscript";
+
+		case "sub":
+			return "subscript";
+
+		case "a": {
+			const href = sanitizeHref(getAttr(element, "href"));
+			const opensInNewTab = getAttr(element, "target") === "_blank";
+
+			// NUL cannot occur in an href, so the suffix never collides with a real URL
+			const cacheKey = opensInNewTab ? `${href}\u0000_blank` : href;
+
+			// Check if we already have a markDef for this link
+			const existingKey = markDefMap.get(cacheKey);
+			if (existingKey) {
+				return existingKey;
+			}
+
+			// Create new mark definition
+			const key = generateKey();
+			const markDef: PortableTextMarkDef = {
+				_type: "link",
+				_key: key,
+				href,
+			};
+			if (opensInNewTab) {
+				markDef.blank = true;
+			}
+			markDefs.push(markDef);
+			markDefMap.set(cacheKey, key);
+			return key;
+		}
+
+		default:
+			// Unknown inline element - ignore the tag, process children
+			return null;
+	}
+}
+
+/**
+ * Look up an attribute on an element.
+ *
+ * Attribute names are lower-cased before comparison, so `name` must be given
+ * in lower case. Returns the first match's value, or undefined when absent.
+ */
+function getAttr(element: Element, name: string): string | undefined {
+	for (const attribute of element.attrs) {
+		if (attribute.name.toLowerCase() === name) {
+			return attribute.value;
+		}
+	}
+	return undefined;
+}
+
+/**
+ * Type guard for text nodes
+ *
+ * A node counts as text when its `nodeName` is exactly "#text".
+ */
+function isTextNode(node: Node): node is TextNode {
+	return node.nodeName === "#text";
+}
+
+/**
+ * Type guard for elements
+ *
+ * Any node that has a `tagName` property is treated as an element.
+ */
+function isElement(node: Node): node is Element {
+	return "tagName" in node;
+}
+
+/**
+ * Extract plain text from HTML (for alt text, captions)
+ *
+ * Only the final result is trimmed. Whitespace at element boundaries is
+ * significant: `<em>Photo by </em>Jane` must yield "Photo by Jane", not
+ * "Photo byJane".
+ *
+ * @param html - HTML fragment
+ * @returns Concatenated text content with leading/trailing whitespace removed
+ */
+export function extractText(html: string): string {
+	const fragment = parseFragment(html);
+	return getTextContent(fragment.childNodes).trim();
+}
+
+/**
+ * Concatenate the text of `nodes` and all their descendants, in document
+ * order and without altering whitespace. Nodes that are neither text nor
+ * elements contribute nothing.
+ */
+function getTextContent(nodes: Node[]): string {
+	let text = "";
+	for (const node of nodes) {
+		if (isTextNode(node)) {
+			text += node.value;
+		} else if (isElement(node)) {
+			text += getTextContent(node.childNodes);
+		}
+	}
+	return text;
+}
+
+/**
+ * Extract alt text from an img element in HTML.
+ *
+ * Returns undefined when no `<img ... alt="...">` is found; an explicitly
+ * empty alt attribute yields the empty string.
+ */
+export function extractAlt(html: string): string | undefined {
+	const altMatch = IMG_ALT_PATTERN.exec(html);
+	return altMatch === null ? undefined : altMatch[1];
+}
+
+/**
+ * Extract the plain-text caption from a figcaption element.
+ *
+ * Returns undefined when there is no figcaption or when it is empty.
+ */
+export function extractCaption(html: string): string | undefined {
+	const captionHtml = FIGCAPTION_PATTERN.exec(html)?.[1];
+	return captionHtml ? extractText(captionHtml) : undefined;
+}
+
+/**
+ * Extract src from an img element, with ampersand entities decoded.
+ *
+ * Returns undefined when there is no img src or when it is empty.
+ */
+export function extractSrc(html: string): string | undefined {
+	const rawSrc = IMG_SRC_PATTERN.exec(html)?.[1];
+	// Decode HTML entities in URLs
+	return rawSrc ? decodeUrlEntities(rawSrc) : undefined;
+}
+
+/**
+ * Decode the ampersand entities commonly found in URLs
+ * (`&amp;`, `&#038;` / `&#38;`, `&#x26;`).
+ */
+function decodeUrlEntities(url: string): string {
+	const ampersandEntities = [
+		URL_AMP_ENTITY_PATTERN,
+		URL_NUMERIC_AMP_ENTITY_PATTERN,
+		URL_HEX_AMP_ENTITY_PATTERN,
+	];
+	return ampersandEntities.reduce((decoded, entity) => decoded.replace(entity, "&"), url);
+}
--- a/packages/gutenberg-to-portable-text/src/transformers/core.ts
+++ b/packages/gutenberg-to-portable-text/src/transformers/core.ts
--- a/packages/gutenberg-to-portable-text/src/transformers/embed.ts
+++ b/packages/gutenberg-to-portable-text/src/transformers/embed.ts
@@ -0,0 +1,142 @@
+/**
+ * Transformers for WordPress embed blocks
+ */
+
+import type { BlockTransformer } from "../types.js";
+import { attrString } from "../types.js";
+
+// Regex patterns for embed parsing (group 1 = the quoted, non-empty src value)
+const IFRAME_SRC_PATTERN = /<iframe[^>]+src=["']([^"']+)["']/i;
+const VIDEO_SRC_PATTERN = /<video[^>]+src=["']([^"']+)["']/i;
+const VIDEO_SOURCE_PATTERN = /<source[^>]+src=["']([^"']+)["']/i; // <source> child of <video>
+const AUDIO_SRC_PATTERN = /<audio[^>]+src=["']([^"']+)["']/i;
+const AUDIO_SOURCE_PATTERN = /<source[^>]+src=["']([^"']+)["']/i; // same expression as VIDEO_SOURCE_PATTERN
+
+/**
+ * core/embed and variants → embed block
+ *
+ * The URL comes from the block's `url` attribute, falling back to the src of
+ * an iframe in the block HTML. The provider comes from `providerNameSlug`,
+ * falling back to detection from that URL.
+ */
+export const embed: BlockTransformer = (block, _options, context) => {
+	const { attrs, innerHTML } = block;
+	const declaredUrl = attrString(attrs, "url");
+	const providerSlug = attrString(attrs, "providerNameSlug");
+
+	// Extract iframe src if present
+	const iframeSrc = innerHTML.match(IFRAME_SRC_PATTERN)?.[1];
+	const resolvedUrl = declaredUrl || iframeSrc || "";
+
+	return [
+		{
+			_type: "embed",
+			_key: context.generateKey(),
+			url: resolvedUrl,
+			provider: providerSlug || detectProvider(resolvedUrl),
+			html: innerHTML.trim() || undefined,
+		},
+	];
+};
+
+/**
+ * core-embed/youtube → embed block (delegates to `embed`)
+ */
+export const youtube: BlockTransformer = (block, options, context) =>
+	embed(block, options, context);
+
+/**
+ * core-embed/twitter → embed block (delegates to `embed`)
+ */
+export const twitter: BlockTransformer = (block, options, context) =>
+	embed(block, options, context);
+
+/**
+ * core-embed/vimeo → embed block (delegates to `embed`)
+ */
+export const vimeo: BlockTransformer = (block, options, context) =>
+	embed(block, options, context);
+
+/**
+ * core/video → embed block (self-hosted video)
+ *
+ * URL precedence: the `src` attribute, then the `<video src>` in the block
+ * HTML, then the first `<source src>`; an empty string when none is found.
+ */
+export const video: BlockTransformer = (block, _options, context) => {
+	const markup = block.innerHTML;
+	const url =
+		attrString(block.attrs, "src") ||
+		markup.match(VIDEO_SRC_PATTERN)?.[1] ||
+		markup.match(VIDEO_SOURCE_PATTERN)?.[1] ||
+		"";
+
+	return [
+		{
+			_type: "embed",
+			_key: context.generateKey(),
+			url,
+			provider: "video",
+			html: markup.trim() || undefined,
+		},
+	];
+};
+
+/**
+ * core/audio → embed block (self-hosted audio)
+ *
+ * URL precedence: the `src` attribute, then the `<audio src>` in the block
+ * HTML, then the first `<source src>`; an empty string when none is found.
+ */
+export const audio: BlockTransformer = (block, _options, context) => {
+	const markup = block.innerHTML;
+	const url =
+		attrString(block.attrs, "src") ||
+		markup.match(AUDIO_SRC_PATTERN)?.[1] ||
+		markup.match(AUDIO_SOURCE_PATTERN)?.[1] ||
+		"";
+
+	return [
+		{
+			_type: "embed",
+			_key: context.generateKey(),
+			url,
+			provider: "audio",
+			html: markup.trim() || undefined,
+		},
+	];
+};
+
+/**
+ * Provider slugs and the domains that identify them, in match priority order.
+ */
+const PROVIDER_DOMAINS: ReadonlyArray<readonly [string, readonly string[]]> = [
+	["youtube", ["youtube.com", "youtu.be"]],
+	["vimeo", ["vimeo.com"]],
+	["twitter", ["twitter.com", "x.com"]],
+	["instagram", ["instagram.com"]],
+	["facebook", ["facebook.com"]],
+	["tiktok", ["tiktok.com"]],
+	["spotify", ["spotify.com"]],
+	["soundcloud", ["soundcloud.com"]],
+	["codepen", ["codepen.io"]],
+	["gist", ["gist.github.com"]],
+];
+
+// Captures the host (group 1) of an absolute (`https://host/...`),
+// protocol-relative (`//host/...`) or scheme-less (`host/...`) URL, skipping
+// any `user@` prefix and stopping before the port, path, query or fragment.
+// Expects lower-cased input.
+const URL_HOST_PATTERN = /^(?:(?:[a-z][a-z\d+.-]*:)?\/\/)?(?:[^/?#@]*@)?([^/?#:]+)/;
+
+/**
+ * Detect embed provider from URL
+ *
+ * The provider is decided by the URL's host: it must equal a known provider
+ * domain or be a subdomain of one. Searching the whole URL for the domain as
+ * a substring misclassified unrelated hosts — `netflix.com` and `dropbox.com`
+ * both contain `x.com` and were reported as "twitter".
+ *
+ * @param url - Embed URL (absolute, protocol-relative or scheme-less)
+ * @returns The provider slug, or undefined when the host is not recognized
+ */
+function detectProvider(url: string): string | undefined {
+	if (!url) return undefined;
+
+	const host = URL_HOST_PATTERN.exec(url.trim().toLowerCase())?.[1];
+	if (!host) return undefined;
+
+	for (const [provider, domains] of PROVIDER_DOMAINS) {
+		if (domains.some((domain) => host === domain || host.endsWith(`.${domain}`))) {
+			return provider;
+		}
+	}
+
+	return undefined;
+}
--- a/packages/gutenberg-to-portable-text/src/transformers/index.ts
+++ b/packages/gutenberg-to-portable-text/src/transformers/index.ts
@@ -0,0 +1,115 @@
+/**
+ * Block transformers registry
+ */
+
+import type { BlockTransformer, PortableTextBlock } from "../types.js";
+import * as core from "./core.js";
+import * as embed from "./embed.js";
+
+/**
+ * Default block transformers for core WordPress blocks
+ *
+ * Keys are full Gutenberg block names. `getTransformer` consults this table
+ * after any custom transformers and before the fallback transformer.
+ * Several names intentionally share one transformer: `core/spacer` reuses the
+ * separator transformer, and the legacy `core-embed/*` names without a
+ * dedicated export reuse the generic embed transformer.
+ */
+export const defaultTransformers: Record<string, BlockTransformer> = {
+	// Text blocks
+	"core/paragraph": core.paragraph,
+	"core/heading": core.heading,
+	"core/list": core.list,
+	"core/quote": core.quote,
+	"core/code": core.code,
+	"core/preformatted": core.preformatted,
+	"core/pullquote": core.pullquote,
+	"core/verse": core.verse,
+
+	// Media blocks
+	"core/image": core.image,
+	"core/gallery": core.gallery,
+	"core/file": core.file,
+	"core/media-text": core.mediaText,
+	"core/cover": core.cover,
+
+	// Layout blocks
+	"core/columns": core.columns,
+	"core/group": core.group,
+	"core/separator": core.separator,
+	"core/spacer": core.separator,
+	"core/table": core.table,
+	"core/buttons": core.buttons,
+	"core/button": core.button,
+
+	// Structural blocks
+	"core/more": core.more,
+	"core/nextpage": core.nextpage,
+
+	// Pass-through blocks (preserve as HTML)
+	"core/html": core.html,
+	"core/shortcode": core.shortcode,
+
+	// Embed blocks
+	"core/embed": embed.embed,
+	"core/video": embed.video,
+	"core/audio": embed.audio,
+
+	// Legacy embed block names (WP < 5.6)
+	"core-embed/youtube": embed.youtube,
+	"core-embed/twitter": embed.twitter,
+	"core-embed/vimeo": embed.vimeo,
+	"core-embed/facebook": embed.embed,
+	"core-embed/instagram": embed.embed,
+	"core-embed/soundcloud": embed.embed,
+	"core-embed/spotify": embed.embed,
+};
+
+/**
+ * Fallback transformer for unknown blocks
+ *
+ * - Blocks with inner blocks are unwrapped: only the transformed inner blocks
+ *   are returned.
+ * - Blocks with no inner blocks and no HTML produce nothing.
+ * - Anything else is stored as an `htmlBlock` holding the original HTML, block
+ *   name and (non-empty) attributes for manual review.
+ */
+export const fallbackTransformer: BlockTransformer = (
+	block,
+	_options,
+	context,
+): PortableTextBlock[] => {
+	const { innerBlocks, innerHTML } = block;
+
+	// Containers: transform what is inside
+	if (innerBlocks.length > 0) {
+		return context.transformBlocks(innerBlocks);
+	}
+
+	// Completely empty leaf block
+	if (!innerHTML.trim()) {
+		return [];
+	}
+
+	// Store as HTML fallback
+	const hasAttrs = Object.keys(block.attrs).length > 0;
+	return [
+		{
+			_type: "htmlBlock",
+			_key: context.generateKey(),
+			html: innerHTML,
+			originalBlockName: block.blockName,
+			originalAttrs: hasAttrs ? block.attrs : undefined,
+		},
+	];
+};
+
+/**
+ * Get transformer for a block
+ *
+ * Lookup order: custom transformers, then the defaults, then the fallback.
+ * Blocks without a name (freeform HTML) always get the fallback.
+ */
+export function getTransformer(
+	blockName: string | null,
+	customTransformers?: Record<string, BlockTransformer>,
+): BlockTransformer {
+	const registered = blockName
+		? customTransformers?.[blockName] || defaultTransformers[blockName]
+		: undefined;
+
+	return registered || fallbackTransformer;
+}
--- a/packages/gutenberg-to-portable-text/src/types.ts
+++ b/packages/gutenberg-to-portable-text/src/types.ts
@@ -0,0 +1,307 @@
+/**
+ * Types for Gutenberg to Portable Text conversion
+ */
+
+/**
+ * Gutenberg block as parsed by @wordpress/block-serialization-default-parser
+ *
+ * The shape is recursive: `innerBlocks` holds further `GutenbergBlock` values.
+ */
+export interface GutenbergBlock {
+	/** Block name like "core/paragraph" or null for freeform HTML */
+	blockName: string | null;
+	/**
+	 * Block attributes from the JSON comment.
+	 * Values are `unknown`; the `attr*` helpers in this module narrow them.
+	 */
+	attrs: Record<string, unknown>;
+	/** Inner HTML content */
+	innerHTML: string;
+	/** Nested blocks (for columns, groups, etc.) */
+	innerBlocks: GutenbergBlock[];
+	/**
+	 * Content parts between inner blocks.
+	 * NOTE(review): presumably a null entry marks where an inner block sits —
+	 * confirm against the parser's documentation.
+	 */
+	innerContent: Array<string | null>;
+}
+
+/**
+ * Portable Text span (inline text with marks)
+ *
+ * `marks` is optional and holds plain strings. In this package's inline tests
+ * an entry is either a decorator name such as "strong" or the `_key` of a
+ * link mark definition.
+ */
+export interface PortableTextSpan {
+	_type: "span";
+	_key: string;
+	text: string;
+	marks?: string[];
+}
+
+/**
+ * Portable Text mark definition (for links, annotations)
+ *
+ * Only `_type` and `_key` are fixed; the index signature lets a definition
+ * carry arbitrary extra fields (the inline tests show link definitions with
+ * `href` and `blank`). Spans refer to a definition by listing its `_key` in
+ * their `marks`.
+ */
+export interface PortableTextMarkDef {
+	_type: string;
+	_key: string;
+	[key: string]: unknown;
+}
+
+/**
+ * Portable Text text block
+ *
+ * Discriminated by `_type: "block"`. Only `_key` and `children` are required
+ * besides the discriminator; `style`, `listItem`, `level` and `markDefs` are
+ * optional.
+ */
+export interface PortableTextTextBlock {
+	_type: "block";
+	_key: string;
+	style?: "normal" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "blockquote";
+	listItem?: "bullet" | "number";
+	// NOTE(review): presumably the list nesting depth — confirm against the list transformer
+	level?: number;
+	children: PortableTextSpan[];
+	markDefs?: PortableTextMarkDef[];
+}
+
+/**
+ * Portable Text image block
+ *
+ * Discriminated by `_type: "image"`. The image itself is described by
+ * `asset`, a reference object whose `_ref` is required and whose `url` is
+ * optional. `alt`, `caption`, `alignment` and `link` are all optional.
+ */
+export interface PortableTextImageBlock {
+	_type: "image";
+	_key: string;
+	asset: {
+		_type: "reference";
+		_ref: string;
+		url?: string;
+	};
+	alt?: string;
+	caption?: string;
+	alignment?: "left" | "center" | "right" | "wide" | "full";
+	link?: string;
+}
+
+/**
+ * Portable Text code block
+ *
+ * Discriminated by `_type: "code"`. `code` is required; `language` is optional.
+ */
+export interface PortableTextCodeBlock {
+	_type: "code";
+	_key: string;
+	code: string;
+	language?: string;
+}
+
+/**
+ * Portable Text embed block (YouTube, Twitter, etc.)
+ *
+ * Discriminated by `_type: "embed"`. `url` is required; `provider` and `html`
+ * are optional.
+ */
+export interface PortableTextEmbedBlock {
+	_type: "embed";
+	_key: string;
+	url: string;
+	provider?: string;
+	html?: string;
+}
+
+/**
+ * Portable Text gallery block
+ *
+ * Discriminated by `_type: "gallery"`. Each entry of `images` is declared
+ * inline (not as `PortableTextImageBlock`): it has the same `asset`, `alt` and
+ * `caption` fields but no `alignment` or `link`. `columns` is optional.
+ */
+export interface PortableTextGalleryBlock {
+	_type: "gallery";
+	_key: string;
+	images: Array<{
+		_type: "image";
+		_key: string;
+		asset: { _type: "reference"; _ref: string; url?: string };
+		alt?: string;
+		caption?: string;
+	}>;
+	columns?: number;
+}
+
+/**
+ * Portable Text columns block
+ *
+ * Discriminated by `_type: "columns"`. Every column carries its own `_key`
+ * and a `content` array of `PortableTextBlock`, so columns can nest any other
+ * block type (including further columns).
+ */
+export interface PortableTextColumnsBlock {
+	_type: "columns";
+	_key: string;
+	columns: Array<{
+		_type: "column";
+		_key: string;
+		content: PortableTextBlock[];
+	}>;
+}
+
+/**
+ * Portable Text break/divider block
+ *
+ * Discriminated by `_type: "break"`. `style` is required and currently admits
+ * a single literal, "lineBreak".
+ */
+export interface PortableTextBreakBlock {
+	_type: "break";
+	_key: string;
+	style: "lineBreak";
+}
+
+/**
+ * Portable Text table block
+ *
+ * Discriminated by `_type: "table"`. The table is a list of `rows`, each a
+ * list of `cells`. A cell holds inline content only (`PortableTextSpan[]`
+ * plus optional `markDefs`), not nested blocks. `isHeader` (per cell) and
+ * `hasHeaderRow` (per table) are optional flags.
+ */
+export interface PortableTextTableBlock {
+	_type: "table";
+	_key: string;
+	rows: Array<{
+		_type: "tableRow";
+		_key: string;
+		cells: Array<{
+			_type: "tableCell";
+			_key: string;
+			content: PortableTextSpan[];
+			markDefs?: PortableTextMarkDef[];
+			isHeader?: boolean;
+		}>;
+	}>;
+	hasHeaderRow?: boolean;
+}
+
+/**
+ * Fallback HTML block for unconvertible content
+ *
+ * Discriminated by `_type: "htmlBlock"`. `html` is required. The two
+ * `original*` fields are optional provenance metadata; `originalBlockName`
+ * may also be null, matching `GutenbergBlock.blockName`.
+ */
+export interface PortableTextHtmlBlock {
+	_type: "htmlBlock";
+	_key: string;
+	html: string;
+	originalBlockName?: string | null;
+	originalAttrs?: Record<string, unknown>;
+}
+
+/**
+ * Portable Text button block
+ *
+ * Discriminated by `_type: "button"`. `text` is required; `url` and `style`
+ * are optional.
+ */
+export interface PortableTextButtonBlock {
+	_type: "button";
+	_key: string;
+	text: string;
+	url?: string;
+	style?: "default" | "outline" | "fill";
+}
+
+/**
+ * Portable Text buttons container block
+ *
+ * Discriminated by `_type: "buttons"`. Wraps a list of
+ * `PortableTextButtonBlock`; `layout` is optional.
+ */
+export interface PortableTextButtonsBlock {
+	_type: "buttons";
+	_key: string;
+	buttons: PortableTextButtonBlock[];
+	layout?: "horizontal" | "vertical";
+}
+
+/**
+ * Portable Text cover block (image/video with text overlay)
+ *
+ * Discriminated by `_type: "cover"`. Only `content` (nested
+ * `PortableTextBlock` values) is required; every background, overlay and
+ * layout field is optional. `minHeight` is a string, not a number.
+ */
+export interface PortableTextCoverBlock {
+	_type: "cover";
+	_key: string;
+	backgroundImage?: string;
+	backgroundVideo?: string;
+	overlayColor?: string;
+	overlayOpacity?: number;
+	content: PortableTextBlock[];
+	minHeight?: string;
+	alignment?: "left" | "center" | "right";
+}
+
+/**
+ * Portable Text file download block
+ *
+ * Discriminated by `_type: "file"`. `url` is required; `filename` and
+ * `showDownloadButton` are optional.
+ */
+export interface PortableTextFileBlock {
+	_type: "file";
+	_key: string;
+	url: string;
+	filename?: string;
+	showDownloadButton?: boolean;
+}
+
+/**
+ * Portable Text pullquote block
+ *
+ * Discriminated by `_type: "pullquote"`. `text` is a plain string (no spans
+ * or marks); `citation` is optional.
+ */
+export interface PortableTextPullquoteBlock {
+	_type: "pullquote";
+	_key: string;
+	text: string;
+	citation?: string;
+}
+
+/**
+ * Union of all Portable Text block types
+ *
+ * Every member declares a distinct string-literal `_type` ("block", "image",
+ * "code", "embed", "gallery", "columns", "break", "table", "htmlBlock",
+ * "button", "buttons", "cover", "file", "pullquote"), so consumers can narrow
+ * the union by switching on `_type`.
+ */
+export type PortableTextBlock =
+	| PortableTextTextBlock
+	| PortableTextImageBlock
+	| PortableTextCodeBlock
+	| PortableTextEmbedBlock
+	| PortableTextGalleryBlock
+	| PortableTextColumnsBlock
+	| PortableTextBreakBlock
+	| PortableTextTableBlock
+	| PortableTextHtmlBlock
+	| PortableTextButtonBlock
+	| PortableTextButtonsBlock
+	| PortableTextCoverBlock
+	| PortableTextFileBlock
+	| PortableTextPullquoteBlock;
+
+/**
+ * Options for the conversion
+ *
+ * Every field is optional.
+ */
+export interface ConvertOptions {
+	/** Map of WordPress media IDs to EmDash media IDs/URLs */
+	mediaMap?: Map<number, string>;
+	/** Custom block transformers, keyed by block name; consulted before the defaults */
+	customTransformers?: Record<string, BlockTransformer>;
+	/** Whether to generate keys (default: true) */
+	generateKeys?: boolean;
+	/** Custom key generator */
+	keyGenerator?: () => string;
+}
+
+/**
+ * Block transformer function
+ *
+ * Receives one parsed Gutenberg block together with the conversion options
+ * and a `TransformContext`, and returns the Portable Text blocks that replace
+ * it. The result is an array, so a transformer may return zero, one or
+ * several blocks.
+ */
+export type BlockTransformer = (
+	block: GutenbergBlock,
+	options: ConvertOptions,
+	context: TransformContext,
+) => PortableTextBlock[];
+
+/**
+ * Context passed to transformers
+ *
+ * Bundles the callbacks a transformer needs so it does not have to import the
+ * converter itself.
+ */
+export interface TransformContext {
+	/** Transform child blocks recursively */
+	transformBlocks: (blocks: GutenbergBlock[]) => PortableTextBlock[];
+	/**
+	 * Parse inline HTML to spans.
+	 * Returns the spans together with the mark definitions they reference.
+	 * NOTE(review): takes only the HTML, so the key generator is presumably
+	 * bound by whoever builds the context — confirm in the converter.
+	 */
+	parseInlineContent: (html: string) => {
+		children: PortableTextSpan[];
+		markDefs: PortableTextMarkDef[];
+	};
+	/** Generate a unique key */
+	generateKey: () => string;
+}
+
+// ── Attribute accessor helpers ──────────────────────────────────────
+// Gutenberg attrs are Record<string, unknown>. These narrow safely
+// without `as` casts.
+
+/**
+ * Read `attrs[key]` as a string.
+ *
+ * @returns The value when it is a string (the empty string included);
+ * undefined when the key is absent or holds any other type
+ */
+export function attrString(attrs: Record<string, unknown>, key: string): string | undefined {
+	const candidate = attrs[key];
+	if (typeof candidate !== "string") {
+		return undefined;
+	}
+	return candidate;
+}
+
+/**
+ * Read `attrs[key]` as a number.
+ *
+ * @returns The value when `typeof` reports "number" (zero included);
+ * undefined when the key is absent or holds any other type
+ */
+export function attrNumber(attrs: Record<string, unknown>, key: string): number | undefined {
+	const candidate = attrs[key];
+	if (typeof candidate !== "number") {
+		return undefined;
+	}
+	return candidate;
+}
+
+/**
+ * Read `attrs[key]` as a boolean.
+ *
+ * @returns The value when it is `true` or `false`; undefined when the key is
+ * absent or holds any other type (truthy/falsy values are not coerced)
+ */
+export function attrBoolean(attrs: Record<string, unknown>, key: string): boolean | undefined {
+	const candidate = attrs[key];
+	if (typeof candidate !== "boolean") {
+		return undefined;
+	}
+	return candidate;
+}
+
+/** Type guard: accepts any non-null value whose `typeof` is "object" and that is not an array. */
+function isRecord(v: unknown): v is Record<string, unknown> {
+	if (v === null || Array.isArray(v)) {
+		return false;
+	}
+	return typeof v === "object";
+}
+
+/**
+ * Read `attrs[key]` as a plain object-like value.
+ *
+ * @returns The same object reference when the value is a non-null,
+ * non-array object; undefined when the key is absent or holds anything else
+ */
+export function attrObject(
+	attrs: Record<string, unknown>,
+	key: string,
+): Record<string, unknown> | undefined {
+	const candidate = attrs[key];
+	if (!isRecord(candidate)) {
+		return undefined;
+	}
+	return candidate;
+}
--- a/packages/gutenberg-to-portable-text/src/url.ts
+++ b/packages/gutenberg-to-portable-text/src/url.ts
@@ -0,0 +1,21 @@
+/**
+ * URL scheme validation for the converter pipeline (defense-in-depth).
+ *
+ * This mirrors the canonical sanitizeHref in packages/core/src/utils/url.ts.
+ * The converter is a standalone package that does not depend on core, so it
+ * carries its own copy. The render layer in core is the primary defense; this is secondary.
+ */
+
+// Allow-list, matched case-insensitively at the start of the URL:
+// http(s):, mailto:, tel:, a single leading "/" (but not "//"), or "#".
+const SAFE_URL_SCHEME_RE = /^(https?:|mailto:|tel:|\/(?!\/)|#)/i;
+
+/**
+ * Pass a URL through only when it starts with an allow-listed scheme or prefix.
+ *
+ * Anything else — including null, undefined and the empty string — becomes
+ * "". The converter layer deliberately strips rejected URLs instead of
+ * substituting "#"; that fallback belongs to the render layer.
+ *
+ * @param url - Candidate href
+ * @returns `url` unchanged when it is allowed, otherwise ""
+ */
+export function sanitizeHref(url: string | undefined | null): string {
+	if (url && SAFE_URL_SCHEME_RE.test(url)) {
+		return url;
+	}
+	return "";
+}
--- a/packages/gutenberg-to-portable-text/tests/converter.test.ts
+++ b/packages/gutenberg-to-portable-text/tests/converter.test.ts
--- a/packages/gutenberg-to-portable-text/tests/inline.test.ts
+++ b/packages/gutenberg-to-portable-text/tests/inline.test.ts
@@ -0,0 +1,407 @@
+/**
+ * Tests for inline HTML parsing
+ */
+
+import { describe, it, expect } from "vitest";
+
+import {
+	parseInlineContent,
+	extractText,
+	extractAlt,
+	extractCaption,
+	extractSrc,
+} from "../src/inline.js";
+
+// Deterministic key source shared by every test in this file: "key-1",
+// "key-2", ... The counter is never reset, so tests must not depend on
+// specific key values.
+let keyCounter = 0;
+const generateKey = () => {
+	keyCounter += 1;
+	return `key-${keyCounter}`;
+};
+
+// Global matcher used to count newline characters in joined span text
+const NEWLINE_PATTERN = /\n/g;
+
+describe("parseInlineContent", () => {
+	// Markup-free input: the text must come back in plain spans with no mark definitions needed.
+	describe("plain text", () => {
+		it("parses plain text", () => {
+			const result = parseInlineContent("Hello world", generateKey);
+
+			expect(result.children).toHaveLength(1);
+			expect(result.children[0]).toMatchObject({
+				_type: "span",
+				text: "Hello world",
+			});
+			expect(result.markDefs).toHaveLength(0);
+		});
+
+		it("handles empty string", () => {
+			const result = parseInlineContent("", generateKey);
+
+			// Empty input still yields one span (with empty text) rather than no children
+			expect(result.children).toHaveLength(1);
+			expect(result.children[0]).toMatchObject({
+				_type: "span",
+				text: "",
+			});
+		});
+
+		it("handles whitespace-only string", () => {
+			const result = parseInlineContent("   ", generateKey);
+
+			// Whitespace is kept verbatim, not trimmed
+			expect(result.children).toHaveLength(1);
+			expect(result.children[0]?.text).toBe("   ");
+		});
+
+		it("preserves newlines in text", () => {
+			const result = parseInlineContent("line1\nline2", generateKey);
+
+			// The parser may split the text into several spans, so the assertions
+			// only check that both lines survive in the joined text; they pin
+			// neither the span count nor the newline character itself.
+			expect(result.children.length).toBeGreaterThanOrEqual(1);
+			const fullText = result.children.map((c) => c.text).join("");
+			expect(fullText).toContain("line1");
+			expect(fullText).toContain("line2");
+		});
+	});
+
+	// One decorator tag per case. Each input starts with plain text, so the
+	// formatted run is always the span at index 1. Expected tag → mark mapping:
+	// strong/b → "strong", em/i → "em", u → "underline", s/del → "strike-through",
+	// code → "code", sup → "superscript", sub → "subscript".
+	describe("basic formatting", () => {
+		it("parses <strong> tags", () => {
+			const result = parseInlineContent("Hello <strong>bold</strong> world", generateKey);
+
+			// Leading text, formatted run, trailing text
+			expect(result.children).toHaveLength(3);
+			expect(result.children[0]).toMatchObject({ text: "Hello " });
+			expect(result.children[1]).toMatchObject({
+				text: "bold",
+				marks: ["strong"],
+			});
+			expect(result.children[2]).toMatchObject({ text: " world" });
+		});
+
+		it("parses <b> tags as strong", () => {
+			const result = parseInlineContent("Hello <b>bold</b> world", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "bold",
+				marks: ["strong"],
+			});
+		});
+
+		it("parses <em> tags", () => {
+			const result = parseInlineContent("Hello <em>italic</em> world", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "italic",
+				marks: ["em"],
+			});
+		});
+
+		it("parses <i> tags as em", () => {
+			const result = parseInlineContent("Hello <i>italic</i> world", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "italic",
+				marks: ["em"],
+			});
+		});
+
+		it("parses <u> tags", () => {
+			const result = parseInlineContent("Hello <u>underline</u> world", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "underline",
+				marks: ["underline"],
+			});
+		});
+
+		it("parses <s> tags as strike-through", () => {
+			const result = parseInlineContent("Hello <s>strikethrough</s> world", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "strikethrough",
+				marks: ["strike-through"],
+			});
+		});
+
+		it("parses <del> tags as strike-through", () => {
+			const result = parseInlineContent("Hello <del>deleted</del> world", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "deleted",
+				marks: ["strike-through"],
+			});
+		});
+
+		it("parses <code> tags", () => {
+			const result = parseInlineContent("Use <code>const x = 1</code> for variables", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "const x = 1",
+				marks: ["code"],
+			});
+		});
+
+		it("parses <sup> tags", () => {
+			const result = parseInlineContent("x<sup>2</sup>", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "2",
+				marks: ["superscript"],
+			});
+		});
+
+		it("parses <sub> tags", () => {
+			const result = parseInlineContent("H<sub>2</sub>O", generateKey);
+
+			expect(result.children[1]).toMatchObject({
+				text: "2",
+				marks: ["subscript"],
+			});
+		});
+	});
+
+	// Decorator tags inside one another: a span must carry the marks of every
+	// enclosing tag. Mark order is deliberately not asserted.
+	describe("nested formatting", () => {
+		it("handles nested strong and em", () => {
+			const result = parseInlineContent("<strong><em>bold italic</em></strong>", generateKey);
+
+			expect(result.children).toHaveLength(1);
+			expect(result.children[0]).toMatchObject({
+				text: "bold italic",
+				marks: expect.arrayContaining(["strong", "em"]),
+			});
+		});
+
+		it("handles deeply nested marks", () => {
+			const result = parseInlineContent("<strong><em><code>code</code></em></strong>", generateKey);
+
+			expect(result.children[0]?.marks).toContain("strong");
+			expect(result.children[0]?.marks).toContain("em");
+			expect(result.children[0]?.marks).toContain("code");
+		});
+
+		it("handles mixed content with nested marks", () => {
+			const result = parseInlineContent(
+				"Start <strong>bold <em>bold-italic</em> bold</strong> end",
+				generateKey,
+			);
+
+			// Lower bound only: the exact split of the surrounding text is not pinned
+			expect(result.children.length).toBeGreaterThanOrEqual(4);
+			// Find the bold-italic span
+			const boldItalic = result.children.find(
+				(c) => c.marks?.includes("strong") && c.marks?.includes("em"),
+			);
+			expect(boldItalic?.text).toBe("bold-italic");
+		});
+	});
+
+	// Anchors become "link" mark definitions; the linked span references the
+	// definition by listing its `_key` among the span's marks.
+	describe("links", () => {
+		it("parses simple links", () => {
+			const result = parseInlineContent(
+				'Visit <a href="https://example.com">our site</a>',
+				generateKey,
+			);
+
+			expect(result.markDefs).toHaveLength(1);
+			expect(result.markDefs[0]).toMatchObject({
+				_type: "link",
+				href: "https://example.com",
+			});
+
+			const linkSpan = result.children.find((c) =>
+				c.marks?.includes(result.markDefs[0]?._key ?? ""),
+			);
+			expect(linkSpan?.text).toBe("our site");
+		});
+
+		it("handles links with target=_blank", () => {
+			const result = parseInlineContent(
+				'<a href="https://example.com" target="_blank">link</a>',
+				generateKey,
+			);
+
+			expect(result.markDefs[0]).toMatchObject({
+				_type: "link",
+				href: "https://example.com",
+				blank: true,
+			});
+		});
+
+		it("deduplicates identical links", () => {
+			const result = parseInlineContent(
+				'<a href="https://example.com">link1</a> and <a href="https://example.com">link2</a>',
+				generateKey,
+			);
+
+			// Two anchors with the same href share a single definition
+			expect(result.markDefs).toHaveLength(1);
+
+			const linkKey = result.markDefs[0]?._key;
+			const linkSpans = result.children.filter((c) => c.marks?.includes(linkKey ?? ""));
+			expect(linkSpans).toHaveLength(2);
+		});
+
+		it("creates separate markDefs for different links", () => {
+			const result = parseInlineContent(
+				'<a href="https://a.com">link1</a> and <a href="https://b.com">link2</a>',
+				generateKey,
+			);
+
+			expect(result.markDefs).toHaveLength(2);
+			expect(result.markDefs.map((m) => m.href)).toContain("https://a.com");
+			expect(result.markDefs.map((m) => m.href)).toContain("https://b.com");
+		});
+
+		it("handles links with formatting inside", () => {
+			const result = parseInlineContent(
+				'<a href="https://example.com"><strong>bold link</strong></a>',
+				generateKey,
+			);
+
+			const span = result.children.find((c) => c.text === "bold link");
+			expect(span?.marks).toContain("strong");
+			expect(span?.marks?.length).toBe(2); // strong + link key
+		});
+
+		it("handles links with empty href", () => {
+			const result = parseInlineContent('<a href="">empty link</a>', generateKey);
+
+			// The definition is still created; only its href is empty
+			expect(result.markDefs).toHaveLength(1);
+			expect(result.markDefs[0]).toMatchObject({
+				_type: "link",
+				href: "",
+			});
+
+			const linkSpan = result.children.find((c) =>
+				c.marks?.includes(result.markDefs[0]?._key ?? ""),
+			);
+			expect(linkSpan?.text).toBe("empty link");
+		});
+
+		it("ignores unknown schemes in links", () => {
+			const result = parseInlineContent('<a href="ftp://foo.bar">bad link</a>', generateKey);
+
+			// The link mark survives, but the disallowed URL is stripped to ""
+			expect(result.markDefs).toHaveLength(1);
+			expect(result.markDefs[0]).toMatchObject({
+				_type: "link",
+				href: "",
+			});
+
+			// Same null-safe key lookup as the other link tests (no non-null assertion)
+			const linkSpan = result.children.find((c) =>
+				c.marks?.includes(result.markDefs[0]?._key ?? ""),
+			);
+			expect(linkSpan?.text).toBe("bad link");
+		});
+	});
+
+	// <br> handling: each case joins the span texts and looks for "\n", so the
+	// way the text is split into spans is not pinned.
+	describe("line breaks", () => {
+		it("handles <br> tags", () => {
+			const result = parseInlineContent("line1<br>line2", generateKey);
+
+			const fullText = result.children.map((c) => c.text).join("");
+			expect(fullText).toContain("line1");
+			expect(fullText).toContain("\n");
+			expect(fullText).toContain("line2");
+		});
+
+		it("handles self-closing <br /> tags", () => {
+			const result = parseInlineContent("line1<br />line2", generateKey);
+
+			const fullText = result.children.map((c) => c.text).join("");
+			expect(fullText).toContain("\n");
+		});
+
+		it("handles multiple consecutive <br> tags", () => {
+			const result = parseInlineContent("a<br><br>b", generateKey);
+
+			// Consecutive breaks must not collapse: expect at least two newlines
+			const fullText = result.children.map((c) => c.text).join("");
+			expect(fullText.match(NEWLINE_PATTERN)?.length).toBeGreaterThanOrEqual(2);
+		});
+	});
+
+	// A single block-level wrapper (<p>, <h2>, <li>) around the content must not
+	// leak into the span text, with or without attributes on the wrapper.
+	describe("block wrapper stripping", () => {
+		it("strips <p> wrapper", () => {
+			const result = parseInlineContent("<p>content</p>", generateKey);
+
+			expect(result.children).toHaveLength(1);
+			expect(result.children[0]?.text).toBe("content");
+		});
+
+		it("strips heading wrappers", () => {
+			const result = parseInlineContent("<h2>heading</h2>", generateKey);
+
+			expect(result.children[0]?.text).toBe("heading");
+		});
+
+		it("strips <li> wrapper", () => {
+			const result = parseInlineContent("<li>list item</li>", generateKey);
+
+			expect(result.children[0]?.text).toBe("list item");
+		});
+
+		it("preserves content when wrapper has attributes", () => {
+			const result = parseInlineContent('<p class="intro">content</p>', generateKey);
+
+			expect(result.children[0]?.text).toBe("content");
+		});
+	});
+});
+
+// extractText: returns the text content of an HTML fragment with all tags removed.
+describe("extractText", () => {
+	it("extracts plain text", () => {
+		expect(extractText("Hello world")).toBe("Hello world");
+	});
+
+	it("strips HTML tags", () => {
+		expect(extractText("<p>Hello <strong>world</strong></p>")).toBe("Hello world");
+	});
+
+	it("handles nested elements", () => {
+		expect(extractText("<div><p>Nested <em>text</em></p></div>")).toBe("Nested text");
+	});
+
+	it("handles empty string", () => {
+		expect(extractText("")).toBe("");
+	});
+});
+
+// extractAlt: reads the alt attribute of an <img>. A missing attribute
+// (undefined) is distinguished from an empty one ("").
+describe("extractAlt", () => {
+	it("extracts alt from img tag", () => {
+		expect(extractAlt('<img src="photo.jpg" alt="A photo">')).toBe("A photo");
+	});
+
+	it("handles missing alt", () => {
+		expect(extractAlt('<img src="photo.jpg">')).toBeUndefined();
+	});
+
+	it("handles empty alt", () => {
+		expect(extractAlt('<img src="photo.jpg" alt="">')).toBe("");
+	});
+
+	it("handles single quotes", () => {
+		expect(extractAlt("<img src='photo.jpg' alt='A photo'>")).toBe("A photo");
+	});
+});
+
+// extractCaption: returns the <figcaption> content as plain text (inline
+// markup removed), or undefined when the figure has no caption.
+describe("extractCaption", () => {
+	it("extracts caption from figcaption", () => {
+		expect(extractCaption("<figure><img><figcaption>My caption</figcaption></figure>")).toBe(
+			"My caption",
+		);
+	});
+
+	it("strips HTML from caption", () => {
+		expect(
+			extractCaption("<figure><figcaption>Caption with <em>formatting</em></figcaption></figure>"),
+		).toBe("Caption with formatting");
+	});
+
+	it("handles missing figcaption", () => {
+		expect(extractCaption("<figure><img></figure>")).toBeUndefined();
+	});
+});
+
+// extractSrc: reads the src attribute of an <img> as written (absolute or
+// relative), or undefined when the attribute is absent.
+describe("extractSrc", () => {
+	it("extracts src from img tag", () => {
+		expect(extractSrc('<img src="https://example.com/photo.jpg">')).toBe(
+			"https://example.com/photo.jpg",
+		);
+	});
+
+	it("handles relative URLs", () => {
+		expect(extractSrc('<img src="/uploads/photo.jpg">')).toBe("/uploads/photo.jpg");
+	});
+
+	it("handles missing src", () => {
+		expect(extractSrc("<img alt='no source'>")).toBeUndefined();
+	});
+});
--- a/packages/gutenberg-to-portable-text/tsconfig.json
+++ b/packages/gutenberg-to-portable-text/tsconfig.json
@@ -0,0 +1,21 @@
+{
+	"compilerOptions": {
+		"target": "ES2024",
+		"module": "preserve",
+		"moduleResolution": "bundler",
+		"strict": true,
+		"declaration": true,
+		"declarationMap": true,
+		"sourceMap": true,
+		"outDir": "dist",
+		"rootDir": "src",
+		"esModuleInterop": true,
+		"skipLibCheck": true,
+		"noUncheckedIndexedAccess": true,
+		"noUnusedLocals": true,
+		"noImplicitOverride": true,
+		"verbatimModuleSyntax": true
+	},
+	"include": ["src/**/*"],
+	"exclude": ["node_modules", "dist"]
+}