334 lines
8.2 KiB
TypeScript
334 lines
8.2 KiB
TypeScript
/**
|
|
* Inline HTML to Portable Text spans converter
|
|
*
|
|
* Parses inline HTML elements (strong, em, a, code, etc.) and converts
|
|
* them to Portable Text spans with marks.
|
|
*/
|
|
|
|
import { parseFragment, type DefaultTreeAdapterMap } from "parse5";
|
|
|
|
import type { PortableTextSpan, PortableTextMarkDef } from "./types.js";
|
|
import { sanitizeHref } from "./url.js";
|
|
|
|
// Regex patterns for inline parsing
|
|
const WHITESPACE_PATTERN = /\S/;
|
|
|
|
// Pre-compiled block tag patterns
|
|
const BLOCK_TAG_PATTERNS: Record<string, { open: RegExp; close: RegExp }> = {
|
|
p: { open: /^<p[^>]*>/i, close: /<\/p>$/i },
|
|
h1: { open: /^<h1[^>]*>/i, close: /<\/h1>$/i },
|
|
h2: { open: /^<h2[^>]*>/i, close: /<\/h2>$/i },
|
|
h3: { open: /^<h3[^>]*>/i, close: /<\/h3>$/i },
|
|
h4: { open: /^<h4[^>]*>/i, close: /<\/h4>$/i },
|
|
h5: { open: /^<h5[^>]*>/i, close: /<\/h5>$/i },
|
|
h6: { open: /^<h6[^>]*>/i, close: /<\/h6>$/i },
|
|
li: { open: /^<li[^>]*>/i, close: /<\/li>$/i },
|
|
blockquote: { open: /^<blockquote[^>]*>/i, close: /<\/blockquote>$/i },
|
|
figcaption: { open: /^<figcaption[^>]*>/i, close: /<\/figcaption>$/i },
|
|
};
|
|
|
|
// Regex patterns for extracting attributes
|
|
const IMG_ALT_PATTERN = /<img[^>]+alt=["']([^"']*)["']/i;
|
|
const FIGCAPTION_PATTERN = /<figcaption[^>]*>([\s\S]*?)<\/figcaption>/i;
|
|
const IMG_SRC_PATTERN = /<img[^>]+src=["']([^"']*)["']/i;
|
|
const URL_AMP_ENTITY_PATTERN = /&/g;
|
|
const URL_NUMERIC_AMP_ENTITY_PATTERN = /�?38;/g;
|
|
const URL_HEX_AMP_ENTITY_PATTERN = /&/gi;
|
|
|
|
type Node = DefaultTreeAdapterMap["node"];
|
|
type TextNode = DefaultTreeAdapterMap["textNode"];
|
|
type Element = DefaultTreeAdapterMap["element"];
|
|
|
|
interface ParseResult {
|
|
children: PortableTextSpan[];
|
|
markDefs: PortableTextMarkDef[];
|
|
}
|
|
|
|
/**
|
|
* Parse inline HTML content into Portable Text spans
|
|
*/
|
|
export function parseInlineContent(html: string, generateKey: () => string): ParseResult {
|
|
const children: PortableTextSpan[] = [];
|
|
const markDefs: PortableTextMarkDef[] = [];
|
|
const markDefMap = new Map<string, string>();
|
|
|
|
// Handle whitespace-only input BEFORE stripping (parse5 normalizes whitespace away)
|
|
if (html.length > 0 && !WHITESPACE_PATTERN.test(html)) {
|
|
return {
|
|
children: [{ _type: "span", _key: generateKey(), text: html }],
|
|
markDefs: [],
|
|
};
|
|
}
|
|
|
|
// Strip wrapping tags like <p>, <h1>, etc.
|
|
const strippedHtml = stripBlockTags(html);
|
|
|
|
// Parse HTML fragment
|
|
const fragment = parseFragment(strippedHtml);
|
|
|
|
// Walk the tree and build spans
|
|
walkNodes(fragment.childNodes, [], children, markDefs, markDefMap, generateKey);
|
|
|
|
// Ensure at least one span exists
|
|
if (children.length === 0) {
|
|
children.push({
|
|
_type: "span",
|
|
_key: generateKey(),
|
|
text: "",
|
|
});
|
|
}
|
|
|
|
return { children, markDefs };
|
|
}
|
|
|
|
/**
|
|
* Strip common block-level wrapper tags
|
|
*/
|
|
function stripBlockTags(html: string): string {
|
|
// Remove leading/trailing whitespace
|
|
let stripped = html.trim();
|
|
|
|
// Strip common block wrappers
|
|
const blockTags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote", "figcaption"];
|
|
|
|
for (const tag of blockTags) {
|
|
const patterns = BLOCK_TAG_PATTERNS[tag];
|
|
if (patterns && patterns.open.test(stripped) && patterns.close.test(stripped)) {
|
|
stripped = stripped.replace(patterns.open, "").replace(patterns.close, "").trim();
|
|
break;
|
|
}
|
|
}
|
|
|
|
return stripped;
|
|
}
|
|
|
|
/**
|
|
* Recursively walk DOM nodes and build spans
|
|
*/
|
|
function walkNodes(
|
|
nodes: Node[],
|
|
currentMarks: string[],
|
|
children: PortableTextSpan[],
|
|
markDefs: PortableTextMarkDef[],
|
|
markDefMap: Map<string, string>,
|
|
generateKey: () => string,
|
|
): void {
|
|
for (const node of nodes) {
|
|
if (isTextNode(node)) {
|
|
const text = node.value;
|
|
if (text) {
|
|
// Handle line breaks in text
|
|
const parts = text.split("\n");
|
|
for (let i = 0; i < parts.length; i++) {
|
|
const part = parts[i];
|
|
if (part || i > 0) {
|
|
// Add text span
|
|
if (part) {
|
|
children.push({
|
|
_type: "span",
|
|
_key: generateKey(),
|
|
text: part,
|
|
marks: currentMarks.length > 0 ? [...currentMarks] : undefined,
|
|
});
|
|
}
|
|
// Add newline (except after last part)
|
|
if (i < parts.length - 1) {
|
|
// Append newline to previous span or create new one
|
|
if (children.length > 0) {
|
|
const lastChild = children.at(-1);
|
|
if (lastChild) {
|
|
lastChild.text += "\n";
|
|
}
|
|
} else {
|
|
children.push({
|
|
_type: "span",
|
|
_key: generateKey(),
|
|
text: "\n",
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else if (isElement(node)) {
|
|
const tagName = node.tagName.toLowerCase();
|
|
|
|
// Handle <br> as newline
|
|
if (tagName === "br") {
|
|
if (children.length > 0) {
|
|
const lastChild = children.at(-1);
|
|
if (lastChild) {
|
|
lastChild.text += "\n";
|
|
}
|
|
} else {
|
|
children.push({
|
|
_type: "span",
|
|
_key: generateKey(),
|
|
text: "\n",
|
|
});
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Get mark for this element
|
|
const markResult = getMarkForElement(node, markDefs, markDefMap, generateKey);
|
|
const newMarks = markResult ? [...currentMarks, markResult] : currentMarks;
|
|
|
|
// Recurse into children
|
|
walkNodes(node.childNodes, newMarks, children, markDefs, markDefMap, generateKey);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get the Portable Text mark for an HTML element
|
|
*/
|
|
function getMarkForElement(
|
|
element: Element,
|
|
markDefs: PortableTextMarkDef[],
|
|
markDefMap: Map<string, string>,
|
|
generateKey: () => string,
|
|
): string | null {
|
|
const tagName = element.tagName.toLowerCase();
|
|
|
|
switch (tagName) {
|
|
case "strong":
|
|
case "b":
|
|
return "strong";
|
|
|
|
case "em":
|
|
case "i":
|
|
return "em";
|
|
|
|
case "u":
|
|
return "underline";
|
|
|
|
case "s":
|
|
case "strike":
|
|
case "del":
|
|
return "strike-through";
|
|
|
|
case "code":
|
|
return "code";
|
|
|
|
case "sup":
|
|
return "superscript";
|
|
|
|
case "sub":
|
|
return "subscript";
|
|
|
|
case "a": {
|
|
const href = sanitizeHref(getAttr(element, "href"));
|
|
const target = getAttr(element, "target");
|
|
|
|
// Check if we already have a markDef for this href
|
|
const existingKey = markDefMap.get(href);
|
|
if (existingKey) {
|
|
return existingKey;
|
|
}
|
|
|
|
// Create new mark definition
|
|
const key = generateKey();
|
|
const markDef: PortableTextMarkDef = {
|
|
_type: "link",
|
|
_key: key,
|
|
href,
|
|
};
|
|
if (target === "_blank") {
|
|
markDef.blank = true;
|
|
}
|
|
markDefs.push(markDef);
|
|
markDefMap.set(href, key);
|
|
return key;
|
|
}
|
|
|
|
default:
|
|
// Unknown inline element - ignore the tag, process children
|
|
return null;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get attribute value from element
|
|
*/
|
|
function getAttr(element: Element, name: string): string | undefined {
|
|
const attr = element.attrs.find((a) => a.name.toLowerCase() === name);
|
|
return attr?.value;
|
|
}
|
|
|
|
/**
|
|
* Type guard for text nodes
|
|
*/
|
|
function isTextNode(node: Node): node is TextNode {
|
|
return node.nodeName === "#text";
|
|
}
|
|
|
|
/**
|
|
* Type guard for elements
|
|
*/
|
|
function isElement(node: Node): node is Element {
|
|
return "tagName" in node;
|
|
}
|
|
|
|
/**
|
|
* Extract plain text from HTML (for alt text, captions)
|
|
*/
|
|
export function extractText(html: string): string {
|
|
const fragment = parseFragment(html);
|
|
return getTextContent(fragment.childNodes);
|
|
}
|
|
|
|
function getTextContent(nodes: Node[]): string {
|
|
let text = "";
|
|
for (const node of nodes) {
|
|
if (isTextNode(node)) {
|
|
text += node.value;
|
|
} else if (isElement(node)) {
|
|
text += getTextContent(node.childNodes);
|
|
}
|
|
}
|
|
return text.trim();
|
|
}
|
|
|
|
/**
|
|
* Extract alt text from an img element in HTML
|
|
*/
|
|
export function extractAlt(html: string): string | undefined {
|
|
const match = html.match(IMG_ALT_PATTERN);
|
|
if (match) {
|
|
return match[1]; // Can be empty string ""
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
/**
|
|
* Extract caption from a figcaption element
|
|
*/
|
|
export function extractCaption(html: string): string | undefined {
|
|
const match = html.match(FIGCAPTION_PATTERN);
|
|
if (match?.[1]) {
|
|
return extractText(match[1]);
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
/**
|
|
* Extract src from an img element
|
|
*/
|
|
export function extractSrc(html: string): string | undefined {
|
|
const match = html.match(IMG_SRC_PATTERN);
|
|
if (!match?.[1]) return undefined;
|
|
// Decode HTML entities in URLs
|
|
return decodeUrlEntities(match[1]);
|
|
}
|
|
|
|
/**
|
|
* Decode HTML entities commonly found in URLs
|
|
*/
|
|
function decodeUrlEntities(url: string): string {
|
|
return url
|
|
.replace(URL_AMP_ENTITY_PATTERN, "&")
|
|
.replace(URL_NUMERIC_AMP_ENTITY_PATTERN, "&")
|
|
.replace(URL_HEX_AMP_ENTITY_PATTERN, "&");
|
|
}
|