Emdash source with visual editor image upload fix

Fixes:
1. media.ts: wrap placeholder generation in try-catch
2. toolbar.ts: check r.ok, display error message in popover
This commit is contained in:
2026-05-03 10:44:54 +07:00
parent 78f81bebb6
commit 2d1be52177
2352 changed files with 662964 additions and 0 deletions

View File

@@ -0,0 +1,90 @@
/**
* Import system
*
* Provides a pluggable system for importing content from various sources.
*/
// Core types
export type {
ImportSource,
ImportAnalysis,
ImportContext,
SourceInput,
FileInput,
UrlInput,
OAuthInput,
SourceProbeResult,
ProbeResult,
SourceAuth,
SourceCapabilities,
SuggestedAction,
PostTypeAnalysis,
ImportFieldDef,
FieldCompatibility,
CollectionSchemaStatus,
AttachmentInfo,
NormalizedItem,
ImportConfig,
ImportResult,
FetchOptions,
PostTypeMapping,
NavMenuAnalysis,
TaxonomyAnalysis,
} from "./types.js";
// Menu import
export {
importMenusFromWxr,
importMenusFromPlugin,
type MenuImportResult,
type PluginMenu,
type PluginMenuItem,
} from "./menus.js";
// Sections import
export { importReusableBlocksAsSections, type SectionsImportResult } from "./sections.js";
// Site settings import
export {
importSiteSettings,
parseSiteSettingsFromPlugin,
type SiteSettingsAnalysis,
type SettingsImportResult,
type WidgetAreaAnalysis,
} from "./settings.js";
// Registry
export {
registerSource,
getSource,
getAllSources,
getFileSources,
getUrlSources,
probeUrl,
clearSources,
} from "./registry.js";
// SSRF protection
export { validateExternalUrl, ssrfSafeFetch, SsrfError } from "./ssrf.js";
// Sources
export { wxrSource, parseWxrDate } from "./sources/wxr.js";
export { wordpressRestSource } from "./sources/wordpress-rest.js";
export {
wordpressPluginSource,
createBasicAuthToken,
fetchPluginMedia,
fetchPluginTaxonomies,
} from "./sources/wordpress-plugin.js";
// Auto-register built-in sources
import { registerSource } from "./registry.js";
import { wordpressPluginSource } from "./sources/wordpress-plugin.js";
import { wordpressRestSource } from "./sources/wordpress-rest.js";
import { wxrSource } from "./sources/wxr.js";
// Built-in sources are registered in priority order, most specific first.
// The plugin source leads so that a site running our exporter plugin is
// handled by it.
for (const builtInSource of [wordpressPluginSource, wordpressRestSource, wxrSource]) {
  registerSource(builtInSource);
}

View File

@@ -0,0 +1,436 @@
/**
* Menu import functions
*
* Import navigation menus from WordPress WXR exports or plugin API.
*/
import type { Kysely } from "kysely";
import { ulid } from "ulidx";
import type { WxrNavMenu, WxrNavMenuItem } from "../cli/wxr/parser.js";
import type { Database } from "../database/types.js";
import type { MenuItemType } from "../menus/types.js";
/**
 * Result of a menu import operation.
 *
 * Returned by both `importMenusFromWxr` and `importMenusFromPlugin`.
 */
export interface MenuImportResult {
/** Number of menus created (menus that already existed are not counted) */
menusCreated: number;
/** Number of menu items created across all newly created menus */
itemsCreated: number;
/** Mapping from WP menu slug to EmDash menu ID; also filled for menus that already existed */
menuIdMap: Map<string, string>;
/** Errors encountered during import, one entry per menu that failed */
errors: Array<{ menu: string; error: string }>;
}
/**
 * Plugin API menu format (matches /emdash/v1/menus response)
 */
export interface PluginMenu {
/** Menu ID on the source site (presumably the WordPress term ID — confirm against the plugin) */
id: number;
/** Menu slug; used as the `name` of the EmDash menu and as the key in `menuIdMap` */
name: string; // slug
/** Human-readable menu label */
label: string;
/** Items belonging to this menu; hierarchy is expressed through `parent_id` */
items: PluginMenuItem[];
}
/**
 * Single menu item in the plugin API menu format.
 */
export interface PluginMenuItem {
/** Item ID on the source site; other items point at it through `parent_id` */
id: number;
/** Source-site ID of the parent item; null (or 0) is treated as "no parent" by the importer */
parent_id: number | null;
/** Position of the item; items are imported in ascending `sort_order` */
sort_order: number;
/** Kind of link: a free-form URL, a reference to a post-like object, or a taxonomy term */
type: "custom" | "post_type" | "taxonomy";
/** WordPress object type the item points at */
object: string | null; // 'page', 'post', 'category'
/** Source-site ID of the referenced object; looked up in the content ID map for `post_type` items */
object_id: number | null;
/** Link URL; used as the custom URL when the item is not imported as a content reference */
url: string;
/** Label shown for the item */
title: string;
/** Link target attribute, if any */
target: string | null;
/** CSS classes attached to the item, if any */
classes: string | null;
}
/**
 * Create EmDash navigation menus from menus parsed out of a WXR export.
 *
 * A menu whose name already exists in `_emdash_menus` is left untouched, but
 * its existing ID is still recorded in the returned `menuIdMap`. A failure
 * while importing one menu is recorded in `errors` and does not stop the rest.
 *
 * @param menus - Parsed navigation menus from WXR
 * @param db - Database connection
 * @param contentIdMap - Map from WP post ID to EmDash content ID (for resolving references)
 * @returns Import result with counts and ID mapping
 */
export async function importMenusFromWxr(
  menus: WxrNavMenu[],
  db: Kysely<Database>,
  contentIdMap: Map<number, string>,
): Promise<MenuImportResult> {
  const summary: MenuImportResult = {
    menusCreated: 0,
    itemsCreated: 0,
    menuIdMap: new Map(),
    errors: [],
  };

  for (const wxrMenu of menus) {
    try {
      const alreadyThere = await db
        .selectFrom("_emdash_menus")
        .select("id")
        .where("name", "=", wxrMenu.name)
        .executeTakeFirst();

      if (!alreadyThere) {
        // New menu: insert the row first, then its items.
        const newMenuId = ulid();
        await db
          .insertInto("_emdash_menus")
          .values({ id: newMenuId, name: wxrMenu.name, label: wxrMenu.label })
          .execute();
        summary.menusCreated += 1;
        summary.menuIdMap.set(wxrMenu.name, newMenuId);

        const createdItems = await importWxrMenuItems(wxrMenu.items, newMenuId, db, contentIdMap);
        summary.itemsCreated += createdItems;
      } else {
        // Existing menus are skipped; only their ID is reported back.
        summary.menuIdMap.set(wxrMenu.name, alreadyThere.id);
      }
    } catch (cause) {
      const message = cause instanceof Error ? cause.message : String(cause);
      summary.errors.push({ menu: wxrMenu.name, error: message });
    }
  }

  return summary;
}
/**
 * Create EmDash navigation menus from menus delivered by the plugin API.
 *
 * Menus that already exist (matched by name) are skipped and only contribute
 * their existing ID to `menuIdMap`. Errors are collected per menu so one bad
 * menu does not abort the whole import.
 *
 * @param menus - Menus from plugin API
 * @param db - Database connection
 * @param contentIdMap - Map from WP post ID to EmDash content ID
 * @returns Import result with counts and ID mapping
 */
export async function importMenusFromPlugin(
  menus: PluginMenu[],
  db: Kysely<Database>,
  contentIdMap: Map<number, string>,
): Promise<MenuImportResult> {
  const outcome: MenuImportResult = {
    menusCreated: 0,
    itemsCreated: 0,
    menuIdMap: new Map(),
    errors: [],
  };

  for (const pluginMenu of menus) {
    try {
      const match = await db
        .selectFrom("_emdash_menus")
        .select("id")
        .where("name", "=", pluginMenu.name)
        .executeTakeFirst();

      if (match) {
        outcome.menuIdMap.set(pluginMenu.name, match.id);
        continue;
      }

      const freshMenuId = ulid();
      const menuRow = { id: freshMenuId, name: pluginMenu.name, label: pluginMenu.label };
      await db.insertInto("_emdash_menus").values(menuRow).execute();
      outcome.menusCreated += 1;
      outcome.menuIdMap.set(pluginMenu.name, freshMenuId);

      // Items are only imported for menus created in this run.
      const insertedItems = await importPluginMenuItems(
        pluginMenu.items,
        freshMenuId,
        db,
        contentIdMap,
      );
      outcome.itemsCreated += insertedItems;
    } catch (failure) {
      outcome.errors.push({
        menu: pluginMenu.name,
        error: failure instanceof Error ? failure.message : String(failure),
      });
    }
  }

  return outcome;
}
/**
 * Insert the items of one WXR menu into `_emdash_menu_items`.
 *
 * Works in two passes: every item is first inserted without a parent while a
 * WP-ID → EmDash-ID map is built, then parent links are filled in for items
 * whose parent was part of the same menu.
 *
 * @returns Number of items inserted
 */
async function importWxrMenuItems(
  items: WxrNavMenuItem[],
  menuId: string,
  db: Kysely<Database>,
  contentIdMap: Map<number, string>,
): Promise<number> {
  const emdashIdByWpId = new Map<number, string>();
  const ordered = items.toSorted((left, right) => left.sortOrder - right.sortOrder);

  // Pass 1: insert rows; parents may not exist yet, so parent_id stays null.
  for (const wpItem of ordered) {
    const newItemId = ulid();
    emdashIdByWpId.set(wpItem.id, newItemId);

    const mapped = mapWxrMenuItem(wpItem, contentIdMap);
    await db
      .insertInto("_emdash_menu_items")
      .values({
        id: newItemId,
        menu_id: menuId,
        parent_id: null,
        sort_order: wpItem.sortOrder,
        type: mapped.type,
        reference_collection: mapped.collection,
        reference_id: mapped.referenceId,
        custom_url: mapped.customUrl,
        label: wpItem.title,
        title_attr: null,
        target: wpItem.target || null,
        css_classes: wpItem.classes || null,
      })
      .execute();
  }

  // Pass 2: link children to parents now that every item has an EmDash ID.
  for (const wpItem of ordered) {
    if (!wpItem.parentId) continue;

    const childId = emdashIdByWpId.get(wpItem.id);
    const resolvedParentId = emdashIdByWpId.get(wpItem.parentId);
    if (!childId || !resolvedParentId) continue;

    await db
      .updateTable("_emdash_menu_items")
      .set({ parent_id: resolvedParentId })
      .where("id", "=", childId)
      .execute();
  }

  // Every item in `ordered` was inserted if we got this far.
  return ordered.length;
}
/**
 * Insert the items of one plugin-API menu into `_emdash_menu_items`.
 *
 * Items are inserted in `sort_order` without a parent; a second pass then
 * sets `parent_id` for items whose parent belongs to the same menu.
 *
 * @returns Number of items inserted
 */
async function importPluginMenuItems(
  items: PluginMenuItem[],
  menuId: string,
  db: Kysely<Database>,
  contentIdMap: Map<number, string>,
): Promise<number> {
  const localIdBySourceId = new Map<number, string>();
  const bySortOrder = items.toSorted((first, second) => first.sort_order - second.sort_order);
  let inserted = 0;

  // Pass 1: create all rows and remember which EmDash ID each source ID got.
  for (const source of bySortOrder) {
    const rowId = ulid();
    localIdBySourceId.set(source.id, rowId);

    const target = mapPluginMenuItem(source, contentIdMap);
    const row = {
      id: rowId,
      menu_id: menuId,
      parent_id: null,
      sort_order: source.sort_order,
      type: target.type,
      reference_collection: target.collection,
      reference_id: target.referenceId,
      custom_url: target.customUrl,
      label: source.title,
      title_attr: null,
      target: source.target || null,
      css_classes: source.classes || null,
    };
    await db.insertInto("_emdash_menu_items").values(row).execute();
    inserted += 1;
  }

  // Pass 2: resolve parents through the map built above.
  for (const source of bySortOrder) {
    if (!source.parent_id) continue;

    const ownId = localIdBySourceId.get(source.id);
    const parentRowId = localIdBySourceId.get(source.parent_id);
    if (ownId && parentRowId) {
      await db
        .updateTable("_emdash_menu_items")
        .set({ parent_id: parentRowId })
        .where("id", "=", ownId)
        .execute();
    }
  }

  return inserted;
}
/**
 * Map a WXR menu item to the EmDash menu item shape.
 *
 * Only `post_type` items whose target was imported (its WP ID is present in
 * `contentIdMap`) become content references. Everything else — custom links,
 * taxonomy items (no taxonomy support in menus yet), unknown types, and
 * `post_type` items whose target cannot be resolved — becomes a custom link
 * to `item.url`, or to "#" when the item has no URL.
 *
 * Previously an unresolved `post_type` item without a URL was returned as a
 * "page"/"post" item with a null reference and a null URL, i.e. a menu entry
 * pointing nowhere. It now falls back to "#" like every other branch.
 *
 * @param item - Menu item parsed from WXR
 * @param contentIdMap - Map from WP post ID to EmDash content ID
 * @returns Type, collection, reference ID and custom URL for the new item
 */
function mapWxrMenuItem(
  item: WxrNavMenuItem,
  contentIdMap: Map<number, string>,
): {
  type: MenuItemType;
  collection: string | null;
  referenceId: string | null;
  customUrl: string | null;
} {
  const customLink: ReturnType<typeof mapWxrMenuItem> = {
    type: "custom",
    collection: null,
    referenceId: null,
    customUrl: item.url || "#",
  };

  if (item.type !== "post_type") {
    return customLink;
  }

  const referenceId = item.objectId ? contentIdMap.get(item.objectId) || null : null;
  if (!referenceId) {
    // Target was not imported: keep the item usable as a plain link.
    return customLink;
  }

  // Map WordPress object type to collection
  const collection = mapObjectToCollection(item.objectType);
  return {
    type: collection === "pages" ? "page" : "post",
    collection,
    referenceId,
    customUrl: null,
  };
}
/**
 * Map a plugin-API menu item to the EmDash menu item shape.
 *
 * A `post_type` item becomes a content reference only when its `object_id`
 * is present in `contentIdMap`. Custom links, taxonomy items, unknown types
 * and unresolved `post_type` items all become custom links to `item.url`,
 * or to "#" when the URL is empty.
 *
 * Previously an unresolved `post_type` item with an empty URL was returned as
 * a "page"/"post" item with a null reference and a null URL. It now falls
 * back to "#" like every other branch.
 *
 * @param item - Menu item from the plugin API
 * @param contentIdMap - Map from WP post ID to EmDash content ID
 * @returns Type, collection, reference ID and custom URL for the new item
 */
function mapPluginMenuItem(
  item: PluginMenuItem,
  contentIdMap: Map<number, string>,
): {
  type: MenuItemType;
  collection: string | null;
  referenceId: string | null;
  customUrl: string | null;
} {
  const customLink: ReturnType<typeof mapPluginMenuItem> = {
    type: "custom",
    collection: null,
    referenceId: null,
    customUrl: item.url || "#",
  };

  if (item.type !== "post_type") {
    return customLink;
  }

  const referenceId = item.object_id ? contentIdMap.get(item.object_id) || null : null;
  if (!referenceId) {
    // Target was not imported: keep the item usable as a plain link.
    return customLink;
  }

  const collection = mapObjectToCollection(item.object);
  return {
    type: collection === "pages" ? "page" : "post",
    collection,
    referenceId,
    customUrl: null,
  };
}
/**
 * Known WordPress object types and the EmDash collection each one maps to.
 *
 * A Map is used instead of an object literal so that names which happen to
 * exist on `Object.prototype` ("constructor", "__proto__", ...) are not
 * mistaken for known types.
 */
const WP_OBJECT_TYPE_COLLECTIONS: ReadonlyMap<string, string> = new Map([
  ["post", "posts"],
  ["page", "pages"],
  ["product", "products"],
  ["portfolio", "portfolio"],
]);

/**
 * Map WordPress object type to EmDash collection name.
 *
 * @param objectType - WordPress object type (e.g. "post", "page")
 * @returns The mapped collection name; "posts" when no type is given; the
 *   object type itself when it is not one of the known types
 */
function mapObjectToCollection(objectType: string | undefined | null): string {
  if (!objectType) return "posts";
  return WP_OBJECT_TYPE_COLLECTIONS.get(objectType) ?? objectType;
}

View File

@@ -0,0 +1,112 @@
/**
* Import source registry
*
* Manages available import sources and provides URL probing.
*/
import { resolveAndValidateExternalUrl } from "./ssrf.js";
import type { ImportSource, ProbeResult, SourceProbeResult } from "./types.js";
// Matches one or more "/" at the very end of a string. probeUrl uses it to
// strip trailing slashes so the same site always normalizes to the same URL.
const TRAILING_SLASHES_PATTERN = /\/+$/;
/**
 * Registered import sources, keyed by source ID.
 *
 * A Map iterates in insertion order, so `getAllSources()` returns sources in
 * the order they were registered.
 */
const sources = new Map<string, ImportSource>();
/**
 * Add an import source to the registry under its own `id`.
 *
 * Registering another source with the same `id` replaces the earlier one.
 */
export function registerSource(source: ImportSource): void {
  const { id: sourceId } = source;
  sources.set(sourceId, source);
}
/**
 * Look up a registered source.
 *
 * @returns The source registered under `id`, or `undefined` if there is none
 */
export function getSource(id: string): ImportSource | undefined {
  const registered = sources.get(id);
  return registered;
}
/**
 * List every registered source, in registration order.
 *
 * @returns A new array; mutating it does not affect the registry
 */
export function getAllSources(): ImportSource[] {
  return Array.from(sources.values());
}
/**
 * List the registered sources that work from an uploaded file
 * (those with a truthy `requiresFile`).
 */
export function getFileSources(): ImportSource[] {
  const fileBased: ImportSource[] = [];
  for (const candidate of getAllSources()) {
    if (candidate.requiresFile) {
      fileBased.push(candidate);
    }
  }
  return fileBased;
}
/**
 * List the registered sources that are able to probe a URL
 * (those with a truthy `canProbe`).
 */
export function getUrlSources(): ImportSource[] {
  const probing: ImportSource[] = [];
  for (const candidate of getAllSources()) {
    if (candidate.canProbe) {
      probing.push(candidate);
    }
  }
  return probing;
}
/**
 * Probe a URL against all registered sources
 *
 * The URL is normalized first: surrounding whitespace is trimmed, `https://`
 * is prepended when no http(s) scheme is present, and trailing slashes are
 * removed. The normalized URL is then validated against SSRF targets before
 * any source is asked to probe it.
 *
 * @param url - Site URL as entered by the user (scheme optional)
 * @returns Probe results sorted by confidence (definite > likely > possible)
 * @throws Whatever `resolveAndValidateExternalUrl` throws for a rejected URL
 */
export async function probeUrl(url: string): Promise<ProbeResult> {
  // Normalize URL
  let normalizedUrl = url.trim();
  // Look for an actual "http://" or "https://" prefix, case-insensitively.
  // A plain startsWith("http") check wrongly treats hosts such as
  // "httpbin.org" as already having a scheme, and prefixes "HTTP://..." twice.
  if (!/^https?:\/\//i.test(normalizedUrl)) {
    normalizedUrl = `https://${normalizedUrl}`;
  }
  // Remove trailing slash for consistency
  normalizedUrl = normalizedUrl.replace(TRAILING_SLASHES_PATTERN, "");

  // SSRF: reject internal/private network targets. DNS resolution
  // catches hostnames that resolve to private addresses.
  await resolveAndValidateExternalUrl(normalizedUrl);

  const results: SourceProbeResult[] = [];
  const urlSources = getUrlSources();

  // Probe all sources in parallel
  const probePromises = urlSources.map(async (source) => {
    try {
      const result = await source.probe?.(normalizedUrl);
      if (result) {
        return result;
      }
    } catch (error) {
      // Probe failed, skip this source
      console.debug(`Probe failed for ${source.id}:`, error);
    }
    return null;
  });
  const probeResults = await Promise.allSettled(probePromises);
  for (const result of probeResults) {
    if (result.status === "fulfilled" && result.value) {
      results.push(result.value);
    }
  }

  // Sort by confidence; Array.prototype.sort is stable, so sources with equal
  // confidence keep their registration order.
  const confidenceOrder = { definite: 0, likely: 1, possible: 2 };
  results.sort((a, b) => confidenceOrder[a.confidence] - confidenceOrder[b.confidence]);

  return {
    url: normalizedUrl,
    isWordPress: results.length > 0,
    bestMatch: results[0] ?? null,
    allMatches: results,
  };
}
/**
 * Empty the source registry.
 *
 * Intended for tests that need to start from a registry without the
 * built-in sources.
 */
export function clearSources(): void {
  sources.clear();
}

View File

@@ -0,0 +1,103 @@
/**
* Sections import functions
*
* Import reusable blocks from WordPress WXR exports as EmDash sections.
*/
import type { PortableTextBlock } from "@emdash-cms/gutenberg-to-portable-text";
import { gutenbergToPortableText } from "@emdash-cms/gutenberg-to-portable-text";
import type { Kysely } from "kysely";
import { ulid } from "ulidx";
import type { WxrPost } from "../cli/wxr/parser.js";
import type { Database } from "../database/types.js";
import { slugify } from "../utils/slugify.js";
/**
 * Result of a sections import operation, as returned by
 * `importReusableBlocksAsSections`.
 */
export interface SectionsImportResult {
/** Number of sections created */
sectionsCreated: number;
/** Number of sections skipped because a section with the same slug already exists */
sectionsSkipped: number;
/** Errors encountered during import, one entry per block that failed */
errors: Array<{ title: string; error: string }>;
}
/**
 * Turn WordPress reusable blocks (`wp_block` posts) from a WXR export into
 * EmDash sections.
 *
 * Each block's slug is its post name, or a slug derived from its title (or
 * from `block-<id>` / `block-<timestamp>` when it has no title). Blocks whose
 * slug already exists in `_emdash_sections` are counted as skipped. A failure
 * on one block is recorded in `errors` and the import continues.
 *
 * @param posts - All posts from WXR (only `wp_block` posts are used)
 * @param db - Database connection
 * @returns Import result with counts
 */
export async function importReusableBlocksAsSections(
  posts: WxrPost[],
  db: Kysely<Database>,
): Promise<SectionsImportResult> {
  const summary: SectionsImportResult = {
    sectionsCreated: 0,
    sectionsSkipped: 0,
    errors: [],
  };

  for (const post of posts) {
    // Only reusable blocks become sections.
    if (post.postType !== "wp_block") continue;

    try {
      let slug = post.postName;
      if (!slug) {
        const slugBasis = post.title || `block-${post.id || Date.now()}`;
        slug = slugify(slugBasis);
      }

      const duplicate = await db
        .selectFrom("_emdash_sections")
        .select("id")
        .where("slug", "=", slug)
        .executeTakeFirst();

      if (duplicate) {
        summary.sectionsSkipped += 1;
      } else {
        // Gutenberg markup is stored as serialized Portable Text.
        let portableText: PortableTextBlock[] = [];
        if (post.content) {
          portableText = gutenbergToPortableText(post.content);
        }

        const sectionId = ulid();
        const timestamp = new Date().toISOString();
        await db
          .insertInto("_emdash_sections")
          .values({
            id: sectionId,
            slug,
            title: post.title || "Untitled Block",
            description: null,
            keywords: null,
            content: JSON.stringify(portableText),
            preview_media_id: null,
            source: "import",
            theme_id: null,
            created_at: timestamp,
            updated_at: timestamp,
          })
          .execute();
        summary.sectionsCreated += 1;
      }
    } catch (cause) {
      const message = cause instanceof Error ? cause.message : String(cause);
      summary.errors.push({ title: post.title || "Untitled Block", error: message });
    }
  }

  return summary;
}

View File

@@ -0,0 +1,281 @@
/**
* Site settings import functions
*
* Import site settings from WordPress (title, tagline, logo, favicon, etc.)
*/
import type { Kysely } from "kysely";
import type { Database } from "../database/types.js";
/**
 * Site settings analysis from import source.
 *
 * Every field is optional; `importSiteSettings` only writes the settings
 * that are present (and, for strings, non-empty).
 */
export interface SiteSettingsAnalysis {
/** Site title */
title?: string;
/** Site tagline/description */
tagline?: string;
/** Custom logo: its URL and, when known, the attachment ID on the source site */
logo?: { url: string; id?: number };
/** Favicon/site icon: its URL and, when known, the attachment ID on the source site */
favicon?: { url: string; id?: number };
/** Front page settings: latest posts, or a static page identified by its source-site ID */
frontPage?: { type: "posts" | "page"; pageId?: number };
/** SEO settings (Yoast, RankMath, etc.); stored as a JSON blob on import */
seo?: Record<string, unknown>;
}
/**
 * Widget area analysis.
 *
 * NOTE(review): not referenced by any function in this file; presumably
 * consumed by callers elsewhere.
 */
export interface WidgetAreaAnalysis {
/** Widget area ID */
id: string;
/** Widget area name */
name: string;
/** Widget area label */
label: string;
/** Number of widgets */
widgetCount: number;
/** Widget summaries: the widget type and, when available, its title */
widgets: Array<{ type: string; title?: string }>;
}
/**
 * Result of site settings import.
 *
 * `applied` and `skipped` hold option names (e.g. "site_title"). `errors`
 * uses the same names, except that both front page options report failures
 * under the single label "front_page".
 */
export interface SettingsImportResult {
/** Settings that were applied */
applied: string[];
/** Settings that were skipped (already set and overwrite was off) */
skipped: string[];
/** Errors encountered */
errors: Array<{ setting: string; error: string }>;
}
/**
 * Write analyzed site settings into the `options` table.
 *
 * Settings are processed in a fixed order: title, tagline, logo URL, favicon
 * URL, front page type (+ page ID), SEO blob. Each one ends up in exactly one
 * of `applied`, `skipped` or `errors`. A failure on one setting does not
 * prevent the others from being imported. Only the logo/favicon URLs are
 * stored here; the media files themselves are imported separately.
 *
 * @param settings - Site settings analysis
 * @param db - Database connection
 * @param overwrite - Whether to overwrite existing settings
 * @returns Import result
 */
export async function importSiteSettings(
  settings: SiteSettingsAnalysis,
  db: Kysely<Database>,
  overwrite = false,
): Promise<SettingsImportResult> {
  const result: SettingsImportResult = {
    applied: [],
    skipped: [],
    errors: [],
  };

  // Files an option name under "applied" or "skipped".
  const record = (optionName: string, wasApplied: boolean): void => {
    const bucket = wasApplied ? result.applied : result.skipped;
    bucket.push(optionName);
  };
  // Records a failure under the given setting label.
  const fail = (setting: string, cause: unknown): void => {
    result.errors.push({
      setting,
      error: cause instanceof Error ? cause.message : String(cause),
    });
  };

  // Plain string settings: imported only when present and non-empty.
  const stringSettings: Array<[string, string | undefined]> = [
    ["site_title", settings.title],
    ["site_tagline", settings.tagline],
    ["site_logo_url", settings.logo?.url],
    ["site_favicon_url", settings.favicon?.url],
  ];
  for (const [optionName, optionValue] of stringSettings) {
    if (!optionValue) continue;
    try {
      record(optionName, await setOption(db, optionName, optionValue, overwrite));
    } catch (cause) {
      fail(optionName, cause);
    }
  }

  // Front page: type and page ID share one error label.
  const { frontPage } = settings;
  if (frontPage) {
    try {
      record(
        "front_page_type",
        await setOption(db, "front_page_type", frontPage.type, overwrite),
      );
      if (frontPage.pageId) {
        record(
          "front_page_id",
          await setOption(db, "front_page_id", String(frontPage.pageId), overwrite),
        );
      }
    } catch (cause) {
      fail("front_page", cause);
    }
  }

  // SEO settings are stored as one JSON blob; serialization errors are
  // reported like any other failure.
  const { seo } = settings;
  if (seo && Object.keys(seo).length > 0) {
    try {
      record("seo_settings", await setOption(db, "seo_settings", JSON.stringify(seo), overwrite));
    } catch (cause) {
      fail("seo_settings", cause);
    }
  }

  return result;
}
/**
 * Store one option in the `options` table.
 *
 * Inserts the option when it does not exist yet. When it does exist it is
 * updated only if `overwrite` is true.
 *
 * @param db - Database connection
 * @param key - Option name
 * @param value - Option value
 * @param overwrite - Whether an existing value may be replaced
 * @returns true if the option was set, false if skipped (already exists and !overwrite)
 */
async function setOption(
  db: Kysely<Database>,
  key: string,
  value: string,
  overwrite: boolean,
): Promise<boolean> {
  const current = await db
    .selectFrom("options")
    .select("value")
    .where("name", "=", key)
    .executeTakeFirst();

  if (!current) {
    await db.insertInto("options").values({ name: key, value }).execute();
    return true;
  }

  if (!overwrite) {
    return false;
  }

  await db.updateTable("options").set({ value }).where("name", "=", key).execute();
  return true;
}
/**
 * Build a `SiteSettingsAnalysis` from the raw options object returned by the
 * WordPress plugin.
 *
 * Values of an unexpected type are ignored. `frontPage` is always set: to a
 * static page when `show_on_front` is "page", otherwise to the posts listing.
 * Yoast option groups that are non-null objects are collected under `seo`.
 *
 * @param options - Raw WordPress options keyed by option name
 * @returns The settings that could be recognized
 */
export function parseSiteSettingsFromPlugin(
  options: Record<string, unknown>,
): SiteSettingsAnalysis {
  const parsed: SiteSettingsAnalysis = {};
  const numberOrUndefined = (candidate: unknown): number | undefined =>
    typeof candidate === "number" ? candidate : undefined;

  const { blogname, blogdescription, custom_logo_url, site_icon_url } = options;

  // Basic settings
  if (typeof blogname === "string") parsed.title = blogname;
  if (typeof blogdescription === "string") parsed.tagline = blogdescription;

  // Logo and favicon: a URL is required, the attachment ID is optional
  if (typeof custom_logo_url === "string") {
    parsed.logo = { url: custom_logo_url, id: numberOrUndefined(options.custom_logo) };
  }
  if (typeof site_icon_url === "string") {
    parsed.favicon = { url: site_icon_url, id: numberOrUndefined(options.site_icon) };
  }

  // Front page settings
  parsed.frontPage =
    options.show_on_front === "page"
      ? { type: "page", pageId: numberOrUndefined(options.page_on_front) }
      : { type: "posts" };

  // SEO settings (Yoast): source option name -> key inside the seo blob
  const yoastGroups: Array<[string, string]> = [
    ["wpseo", "yoast"],
    ["wpseo_titles", "yoast_titles"],
    ["wpseo_social", "yoast_social"],
  ];
  const seo: Record<string, unknown> = {};
  for (const [optionName, seoKey] of yoastGroups) {
    const group = options[optionName];
    if (typeof group === "object" && group !== null) {
      seo[seoKey] = group;
    }
  }
  if (Object.keys(seo).length > 0) {
    parsed.seo = seo;
  }

  return parsed;
}

View File

@@ -0,0 +1,641 @@
/**
* WordPress Plugin (EmDash Exporter) import source
*
* Connects to self-hosted WordPress sites running the EmDash Exporter plugin.
* Provides full access to all content including drafts, custom post types, and ACF fields.
*/
import { gutenbergToPortableText } from "@emdash-cms/gutenberg-to-portable-text";
import { encodeBase64 } from "../../utils/base64.js";
import { ssrfSafeFetch, validateExternalUrl } from "../ssrf.js";
import type {
ImportSource,
ImportAnalysis,
ImportContext,
SourceInput,
SourceProbeResult,
I18nDetection,
FetchOptions,
NormalizedItem,
PostTypeAnalysis,
AttachmentInfo,
} from "../types.js";
import {
BASE_REQUIRED_FIELDS,
FEATURED_IMAGE_FIELD,
mapPostTypeToCollection,
mapWpStatus,
normalizeUrl,
checkSchemaCompatibility,
} from "../utils.js";
// =============================================================================
// API Response Types
// =============================================================================
/**
 * Detected i18n plugin info from the WordPress site.
 *
 * This is the snake_case wire format; `pluginI18nToDetection` converts it to
 * the shared `I18nDetection` shape.
 */
interface PluginI18nInfo {
/** Which multilingual plugin is active */
plugin: "wpml" | "polylang";
/** BCP 47 default locale */
default_locale: string;
/** All configured locales */
locales: string[];
}
/** Probe response from /emdash/v1/probe */
interface PluginProbeResponse {
/** Must be truthy for probe() to accept the response as coming from the exporter plugin (presumably the plugin version — confirm) */
emdash_exporter: string;
/** WordPress version; reported as the detected platform version */
wordpress_version: string;
/** Basic site information; `title` and `url` are surfaced in the probe result */
site: {
title: string;
description: string;
url: string;
home: string;
language: string;
timezone: string;
};
/** Features available on the source site */
capabilities: {
/** When true, the probe result asks for password auth and passes on `auth_instructions` */
application_passwords: boolean;
acf: boolean;
yoast: boolean;
rankmath: boolean;
};
/** Post types with item counts; the "post" and "page" counts feed the probe preview */
post_types: Array<{
name: string;
label: string;
count: number;
}>;
/** Number of media items; shown in the probe preview */
media_count: number;
/** Endpoint map advertised by the plugin (not read in the visible code) */
endpoints: Record<string, string>;
/** How the user should authenticate; `instructions` is shown to the user */
auth_instructions: {
method: string;
instructions: string;
url?: string;
};
/** Detected multilingual plugin (WPML or Polylang). Absent when neither is active. */
i18n?: PluginI18nInfo;
}
/** Analyze response from /emdash/v1/analyze */
interface PluginAnalyzeResponse {
/** Site title and URL, copied into the import analysis */
site: {
title: string;
url: string;
};
/** Per-post-type statistics; types with `total` of 0 are left out of the analysis */
post_types: Array<{
name: string;
label: string;
label_singular: string;
/** Number of items of this type */
total: number;
by_status: Record<string, number>;
/** Supported features; the presence of a "thumbnail" key adds the featured image field */
supports: Record<string, unknown>;
taxonomies: string[];
custom_fields: Array<{
key: string;
count: number;
inferred_type: string;
sample: string | null;
}>;
hierarchical: boolean;
has_archive: boolean;
}>;
/** Taxonomies; `term_count` of "category" and "post_tag" become the category and tag counts */
taxonomies: Array<{
name: string;
label: string;
hierarchical: boolean;
term_count: number;
object_types: string[];
}>;
/** Authors on the source site, mapped to camelCase in the analysis */
authors: Array<{
id: number;
login: string;
email: string;
display_name: string;
post_count: number;
}>;
/** Attachment totals; a `count` above 0 triggers a fetch of the media list */
attachments: {
count: number;
by_type: Record<string, number>;
};
/** ACF field groups, when present in the response (not read in the visible code) */
acf?: Array<{
key: string;
title: string;
fields: Array<{
key: string;
name: string;
label: string;
type: string;
required: boolean;
}>;
}>;
/** Detected multilingual plugin (WPML or Polylang). Absent when neither is active. */
i18n?: PluginI18nInfo;
}
/** Content response from /emdash/v1/content (one page of results) */
interface PluginContentResponse {
/** Posts on this page */
items: PluginPost[];
/** Total number of matching items (presumably across all pages — confirm) */
total: number;
/** Total number of pages; fetchContent keeps requesting until this page number is reached */
pages: number;
/** Page number of this response */
page: number;
/** Page size used for this response */
per_page: number;
}
/**
 * Single post from plugin API.
 *
 * Converted to a `NormalizedItem` by `pluginPostToNormalizedItem`, which is
 * outside the visible part of this file; field notes below that go beyond
 * the declared types are therefore assumptions.
 */
interface PluginPost {
id: number;
post_type: string;
status: string;
slug: string;
title: string;
content: string;
excerpt: string;
// Dates are plain strings; presumably WordPress-formatted local and GMT timestamps — confirm
date: string;
date_gmt: string;
modified: string;
modified_gmt: string;
/** Post author, or null when the post has none */
author: {
id: number;
login: string;
email: string;
display_name: string;
} | null;
/** ID of the parent post, or null */
parent: number | null;
menu_order: number;
/** Assigned terms, keyed by taxonomy name */
taxonomies: Record<string, Array<{ id: number; name: string; slug: string }>>;
/** Featured image attachment, when the response includes one */
featured_image?: {
id: number;
url: string;
filename: string;
mime_type: string;
alt: string;
title: string;
caption: string;
width: number | null;
height: number | null;
};
/** Raw post meta */
meta: Record<string, unknown>;
/** ACF field values, when present */
acf?: Record<string, unknown>;
/** Yoast SEO values, when present */
yoast?: Record<string, string>;
/** Rank Math SEO values, when present */
rankmath?: Record<string, string>;
/** BCP 47 locale from WPML/Polylang (when detected) */
locale?: string;
/** Translation group ID from WPML trid or Polylang (when detected) */
translation_group?: string;
}
/** Media response from /emdash/v1/media (one page of results) */
interface PluginMediaResponse {
/** Media items on this page; analyze() copies them into the attachment list */
items: PluginMediaItem[];
/** Total number of media items (presumably across all pages — confirm) */
total: number;
/** Total number of pages */
pages: number;
/** Page number of this response */
page: number;
/** Page size used for this response */
per_page: number;
}
/** Single media item from /emdash/v1/media */
interface PluginMediaItem {
/** Attachment ID on the source site */
id: number;
/** URL of the media file */
url: string;
filename: string;
mime_type: string;
title: string;
alt: string;
caption: string;
/** Description; not copied into the attachment info built by analyze() */
description: string;
/** Pixel dimensions, when present */
width?: number;
height?: number;
/** File size, when present (unit not shown here; presumably bytes — confirm) */
filesize?: number;
}
// =============================================================================
// Constants
// =============================================================================
/**
 * Matches every whitespace character (global flag), for stripping the spaces
 * out of application passwords. Not referenced in the visible part of this
 * file; presumably used by the auth token helper further down.
 */
const SPACE_PATTERN = /\s/g;
// =============================================================================
// Import Source
// =============================================================================
export const wordpressPluginSource: ImportSource = {
id: "wordpress-plugin",
name: "WordPress (EmDash Exporter)",
description: "Import from WordPress sites with the EmDash Exporter plugin installed",
icon: "plug",
requiresFile: false,
canProbe: true,
async probe(url: string): Promise<SourceProbeResult | null> {
try {
const siteUrl = normalizeUrl(url);
// SSRF protection: validate URL before any outbound requests
validateExternalUrl(siteUrl);
const probeUrl = `${siteUrl}/wp-json/emdash/v1/probe`;
const response = await ssrfSafeFetch(probeUrl, {
headers: { Accept: "application/json" },
signal: AbortSignal.timeout(10000),
});
if (!response.ok) {
return null;
}
const data: PluginProbeResponse = await response.json();
// Verify it's actually our plugin
if (!data.emdash_exporter) {
return null;
}
return {
sourceId: "wordpress-plugin",
confidence: "definite",
detected: {
platform: "wordpress",
version: data.wordpress_version,
siteTitle: data.site.title,
siteUrl: data.site.url,
},
capabilities: {
publicContent: true,
privateContent: true, // Full access with auth
customPostTypes: true,
allMeta: true,
mediaStream: true,
},
auth: data.capabilities.application_passwords
? {
type: "password",
instructions: data.auth_instructions.instructions,
}
: undefined,
preview: {
posts: data.post_types.find((p) => p.name === "post")?.count,
pages: data.post_types.find((p) => p.name === "page")?.count,
media: data.media_count,
},
suggestedAction: {
type: "proceed",
},
i18n: pluginI18nToDetection(data.i18n),
};
} catch {
return null;
}
},
async analyze(input: SourceInput, context: ImportContext): Promise<ImportAnalysis> {
const { siteUrl, headers } = getRequestConfig(input);
const response = await ssrfSafeFetch(`${siteUrl}/wp-json/emdash/v1/analyze`, {
headers,
signal: AbortSignal.timeout(30000),
});
if (!response.ok) {
const error = await response.json().catch(() => ({}));
throw new Error(error.message || `Failed to analyze site: ${response.statusText}`);
}
const data: PluginAnalyzeResponse = await response.json();
// Get existing collections for schema check
const existingCollections = context.getExistingCollections
? await context.getExistingCollections()
: new Map();
// Build post type analysis
const postTypes: PostTypeAnalysis[] = data.post_types
.filter((pt) => pt.total > 0)
.map((pt) => {
const suggestedCollection = mapPostTypeToCollection(pt.name);
const existingCollection = existingCollections.get(suggestedCollection);
// Include featured_image if post type supports thumbnails
const supportsThumbnail = pt.supports && "thumbnail" in pt.supports;
const requiredFields = supportsThumbnail
? [...BASE_REQUIRED_FIELDS, FEATURED_IMAGE_FIELD]
: [...BASE_REQUIRED_FIELDS];
return {
name: pt.name,
count: pt.total,
suggestedCollection,
requiredFields,
schemaStatus: checkSchemaCompatibility(requiredFields, existingCollection),
};
});
// Fetch media list for attachment info
const attachments: AttachmentInfo[] = [];
if (data.attachments.count > 0) {
try {
// Fetch first page of media to populate attachment info
const mediaResponse = await ssrfSafeFetch(
`${siteUrl}/wp-json/emdash/v1/media?per_page=500`,
{
headers,
signal: AbortSignal.timeout(30000),
},
);
if (mediaResponse.ok) {
const mediaData: PluginMediaResponse = await mediaResponse.json();
for (const item of mediaData.items) {
attachments.push({
id: item.id,
url: item.url,
filename: item.filename,
mimeType: item.mime_type,
title: item.title,
alt: item.alt,
caption: item.caption,
width: item.width,
height: item.height,
});
}
}
} catch (e) {
console.warn("Failed to fetch media list:", e);
}
}
// Count categories and tags
const categoryTaxonomy = data.taxonomies.find((t) => t.name === "category");
const tagTaxonomy = data.taxonomies.find((t) => t.name === "post_tag");
return {
sourceId: "wordpress-plugin",
site: {
title: data.site.title,
url: data.site.url,
},
postTypes,
attachments: {
count: data.attachments.count,
items: attachments,
},
categories: categoryTaxonomy?.term_count ?? 0,
tags: tagTaxonomy?.term_count ?? 0,
authors: data.authors.map((a) => ({
id: a.id,
login: a.login,
email: a.email,
displayName: a.display_name,
postCount: a.post_count,
})),
i18n: pluginI18nToDetection(data.i18n),
};
},
/**
 * Stream content from the plugin API, one requested post type at a time,
 * following the API's pagination until every page has been read.
 *
 * `options.limit` caps the total number of items yielded across all
 * requested post types; once it is reached the generator finishes.
 * Only published content is requested unless `options.includeDrafts` is set.
 *
 * @throws Error when a page request returns a non-2xx status.
 */
async *fetchContent(input: SourceInput, options: FetchOptions): AsyncGenerator<NormalizedItem> {
	const { siteUrl, headers } = getRequestConfig(input);
	const status = options.includeDrafts ? "any" : "publish";
	// Counted across post types so `limit` is one overall cap. A per-type
	// counter combined with the `return` below made the effective limit
	// depend on the order and size of the requested post types.
	let yielded = 0;
	for (const postType of options.postTypes) {
		let page = 1;
		// Corrected from the first response; 1 guarantees at least one request.
		let totalPages = 1;
		while (page <= totalPages) {
			// Encode the post type so unusual names cannot alter the query string.
			const url = `${siteUrl}/wp-json/emdash/v1/content?post_type=${encodeURIComponent(postType)}&status=${status}&per_page=100&page=${page}`;
			const response = await ssrfSafeFetch(url, {
				headers,
				signal: AbortSignal.timeout(60000),
			});
			if (!response.ok) {
				throw new Error(`Failed to fetch ${postType}: ${response.statusText}`);
			}
			const data: PluginContentResponse = await response.json();
			totalPages = data.pages;
			for (const post of data.items) {
				yield pluginPostToNormalizedItem(post);
				yielded++;
				if (options.limit && yielded >= options.limit) {
					return;
				}
			}
			page++;
		}
	}
},
/**
 * Download one media file and return its body as a Blob.
 *
 * The request is sent without the source's auth headers.
 *
 * @throws Error when the response status is not 2xx.
 */
async fetchMedia(url: string, _input: SourceInput): Promise<Blob> {
	// SSRF protection: reject the media URL up front if it targets an internal address
	validateExternalUrl(url);
	// WP media URLs are publicly accessible (ssrfSafeFetch validates redirects)
	const mediaResponse = await ssrfSafeFetch(url);
	if (mediaResponse.ok) {
		return mediaResponse.blob();
	}
	throw new Error(`Failed to fetch media: ${mediaResponse.statusText}`);
},
};
// =============================================================================
// Helper Functions
// =============================================================================
/**
 * Map the plugin's snake_case i18n payload onto the shared I18nDetection shape.
 *
 * @returns The converted detection info, or undefined when the plugin sent
 * no i18n data (no multilingual plugin detected).
 */
function pluginI18nToDetection(i18n: PluginI18nInfo | undefined): I18nDetection | undefined {
	if (i18n) {
		const { plugin, default_locale: defaultLocale, locales } = i18n;
		return { plugin, defaultLocale, locales };
	}
	return undefined;
}
/**
 * Derive the normalized site URL and request headers from a source input.
 *
 * - "url" input: optional Basic auth, where `token` is already the base64
 *   encoded "username:password" pair.
 * - "oauth" input: Bearer auth using the access token.
 *
 * The site URL is SSRF-validated before it is returned.
 *
 * @throws Error for any other input type.
 */
function getRequestConfig(input: SourceInput): {
	siteUrl: string;
	headers: HeadersInit;
} {
	switch (input.type) {
		case "url": {
			const siteUrl = normalizeUrl(input.url);
			// SSRF protection: validate URL before any outbound requests
			validateExternalUrl(siteUrl);
			const headers: HeadersInit = input.token
				? { Accept: "application/json", Authorization: `Basic ${input.token}` }
				: { Accept: "application/json" };
			return { siteUrl, headers };
		}
		case "oauth": {
			const siteUrl = normalizeUrl(input.url);
			// SSRF protection: validate URL before any outbound requests
			validateExternalUrl(siteUrl);
			const headers: HeadersInit = {
				Accept: "application/json",
				Authorization: `Bearer ${input.accessToken}`,
			};
			return { siteUrl, headers };
		}
		default:
			throw new Error("WordPress plugin source requires URL or OAuth input");
	}
}
/**
 * Translate one post from the plugin API into the importer's NormalizedItem.
 *
 * Post content is converted to Portable Text. Category and tag slugs are read
 * from either taxonomy key spelling the payload may use. ACF, Yoast and
 * RankMath payloads are kept under reserved `_`-prefixed meta keys.
 */
function pluginPostToNormalizedItem(post: PluginPost): NormalizedItem {
	const taxonomies = post.taxonomies;
	// Prefer the WordPress-native keys, then the pluralized aliases.
	const categorySource = taxonomies?.category ?? taxonomies?.categories ?? [];
	const tagSource = taxonomies?.post_tag ?? taxonomies?.tags ?? [];

	// Start from the post's own meta, then attach the optional extras.
	const meta: Record<string, unknown> = {
		...post.meta,
		...(post.acf ? { _acf: post.acf } : {}),
		...(post.yoast ? { _yoast: post.yoast } : {}),
		...(post.rankmath ? { _rankmath: post.rankmath } : {}),
	};

	return {
		sourceId: post.id,
		postType: post.post_type,
		status: mapWpStatus(post.status),
		slug: post.slug,
		title: post.title,
		content: post.content ? gutenbergToPortableText(post.content) : [],
		excerpt: post.excerpt || undefined,
		// GMT timestamps are preferred; site-local ones are the fallback.
		date: new Date(post.date_gmt || post.date),
		modified: new Date(post.modified_gmt || post.modified),
		author: post.author?.login,
		categories: categorySource.map((category) => category.slug),
		tags: tagSource.map((tag) => tag.slug),
		meta,
		featuredImage: post.featured_image?.url,
		locale: post.locale,
		translationGroup: post.translation_group,
	};
}
// =============================================================================
// Utility Functions for External Use
// =============================================================================
/**
 * Build the base64 credential used in an HTTP Basic `Authorization` header.
 *
 * WordPress displays application passwords in space-separated groups; the
 * spaces are stripped before encoding.
 *
 * @returns base64 of "username:password" (without the "Basic " prefix).
 */
export function createBasicAuthToken(username: string, password: string): string {
	const credentials = [username, password.replace(SPACE_PATTERN, "")].join(":");
	return encodeBase64(credentials);
}
/**
 * Fetch one page of the media list from the plugin API.
 *
 * @param siteUrl - Site address; normalized and SSRF-validated before use.
 * @param authToken - Base64 Basic-auth credential (see createBasicAuthToken).
 * @param page - 1-based page number.
 * @param perPage - Number of items requested per page.
 * @throws Error when the response status is not 2xx.
 */
export async function fetchPluginMedia(
	siteUrl: string,
	authToken: string,
	page = 1,
	perPage = 100,
): Promise<PluginMediaResponse> {
	const baseUrl = normalizeUrl(siteUrl);
	// SSRF protection: validate URL before any outbound requests
	validateExternalUrl(baseUrl);
	const requestHeaders = {
		Accept: "application/json",
		Authorization: `Basic ${authToken}`,
	};
	const mediaResponse = await ssrfSafeFetch(
		`${baseUrl}/wp-json/emdash/v1/media?per_page=${perPage}&page=${page}`,
		{ headers: requestHeaders },
	);
	if (mediaResponse.ok) {
		return mediaResponse.json();
	}
	throw new Error(`Failed to fetch media: ${mediaResponse.statusText}`);
}
/**
 * Fetch every taxonomy, with its terms, from the plugin API.
 *
 * @param siteUrl - Site address; normalized and SSRF-validated before use.
 * @param authToken - Base64 Basic-auth credential (see createBasicAuthToken).
 * @throws Error when the response status is not 2xx.
 */
export async function fetchPluginTaxonomies(
	siteUrl: string,
	authToken: string,
): Promise<
	Array<{
		name: string;
		label: string;
		hierarchical: boolean;
		terms: Array<{
			id: number;
			name: string;
			slug: string;
			description: string;
			parent: number | null;
			count: number;
		}>;
	}>
> {
	const baseUrl = normalizeUrl(siteUrl);
	// SSRF protection: validate URL before any outbound requests
	validateExternalUrl(baseUrl);
	const requestHeaders = {
		Accept: "application/json",
		Authorization: `Basic ${authToken}`,
	};
	const taxonomyResponse = await ssrfSafeFetch(`${baseUrl}/wp-json/emdash/v1/taxonomies`, {
		headers: requestHeaders,
	});
	if (taxonomyResponse.ok) {
		return taxonomyResponse.json();
	}
	throw new Error(`Failed to fetch taxonomies: ${taxonomyResponse.statusText}`);
}

View File

@@ -0,0 +1,191 @@
/**
* WordPress REST API probe
*
* Probes self-hosted WordPress sites to detect capabilities.
* This source is probe-only - it tells users what's available
* and suggests next steps (usually: upload WXR file).
*/
import { ssrfSafeFetch, validateExternalUrl } from "../ssrf.js";
import type {
ImportSource,
ImportAnalysis,
ImportContext,
SourceInput,
SourceProbeResult,
FetchOptions,
NormalizedItem,
} from "../types.js";
/** Matches one or more slashes at the very end of a string. */
const TRAILING_SLASHES = /\/+$/;
/** Matches a trailing "/wp-json" path segment, with or without a final slash. */
const WP_JSON_SUFFIX = /\/wp-json\/?$/;
/**
* WordPress REST API discovery response
*
* Every field is optional; the probe only trusts what it finds. Fields
* without a note below are declared but not read by this source.
*/
interface WpApiDiscovery {
/** Reported to the user as the detected site title. */
name?: string;
description?: string;
/** First choice for the detected site URL. */
url?: string;
/** Second choice for the detected site URL when `url` is empty. */
home?: string;
gmt_offset?: number;
timezone_string?: string;
/** Must contain "wp/v2" for the probe to treat the site as WordPress. */
namespaces?: string[];
/** An "application-passwords" key here makes the probe offer password auth. */
authentication?: Record<string, unknown>;
routes?: Record<string, unknown>;
}
/**
 * Probe-only import source for self-hosted WordPress sites.
 *
 * `probe` detects a WordPress REST API and reports what is publicly
 * available; `analyze` and `fetchContent` always throw, directing the user
 * to upload a WXR export instead.
 */
export const wordpressRestSource: ImportSource = {
	id: "wordpress-rest",
	name: "WordPress Site",
	description: "Connect to a self-hosted WordPress site",
	icon: "globe",
	requiresFile: false,
	canProbe: true,

	/**
	 * Check whether `url` points at a WordPress site with a reachable REST API.
	 *
	 * @returns A probe result when the API root advertises the "wp/v2"
	 * namespace; null when the site is unreachable, blocked by SSRF
	 * validation, or does not look like WordPress.
	 */
	async probe(url: string): Promise<SourceProbeResult | null> {
		try {
			const siteUrl = normalizeUrl(url);
			// SSRF protection: validate URL before any outbound requests
			validateExternalUrl(siteUrl);

			// Try to fetch the WP REST API root
			let response = await ssrfSafeFetch(`${siteUrl}/wp-json/`, {
				headers: { Accept: "application/json" },
				signal: AbortSignal.timeout(10000),
			});
			if (!response.ok) {
				// Try alternate location (some sites use different prefix).
				// The fallback replaces `response` so the discovery document
				// below is read from whichever endpoint actually answered.
				response = await ssrfSafeFetch(`${siteUrl}/?rest_route=/`, {
					headers: { Accept: "application/json" },
					signal: AbortSignal.timeout(10000),
				});
				if (!response.ok) {
					return null;
				}
			}
			const data: WpApiDiscovery = await response.json();

			// Check if this looks like WordPress
			if (!data.namespaces?.includes("wp/v2")) {
				return null;
			}

			// Get content counts (unauthenticated - published only)
			const preview = await getPublicContentCounts(siteUrl);

			// Check for authentication methods
			const hasAppPasswords = !!data.authentication?.["application-passwords"];

			return {
				sourceId: "wordpress-rest",
				confidence: "definite",
				detected: {
					platform: "wordpress",
					siteTitle: data.name,
					siteUrl: data.url || data.home || siteUrl,
				},
				capabilities: {
					publicContent: true,
					privateContent: false, // Would need auth
					customPostTypes: false, // Only if show_in_rest: true
					allMeta: false, // Only if registered for REST
					mediaStream: true,
				},
				auth: hasAppPasswords
					? {
							type: "password",
							instructions:
								"To import drafts and private content, create an Application Password in WordPress → Users → Your Profile → Application Passwords",
						}
					: undefined,
				preview,
				suggestedAction: {
					type: "upload",
					instructions:
						"For a complete import including drafts, custom post types, and all metadata, export your content from WordPress (Tools → Export) and upload the file here.",
				},
			};
		} catch {
			// Probe failed - not a WordPress site or not accessible
			return null;
		}
	},

	/** Always throws: this source is probe-only. */
	async analyze(_input: SourceInput, _context: ImportContext): Promise<ImportAnalysis> {
		// REST-only import not implemented - we use this for probe only
		// and suggest WXR upload for actual import
		throw new Error("Direct REST API import not implemented. Please upload a WXR export file.");
	},

	/** Always throws on first iteration: this source is probe-only. */
	// eslint-disable-next-line require-yield
	async *fetchContent(_input: SourceInput, _options: FetchOptions): AsyncGenerator<NormalizedItem> {
		throw new Error("Direct REST API import not implemented. Please upload a WXR export file.");
	},
};
/** Matches an explicit http:// or https:// scheme prefix, in any letter case. */
const HTTP_SCHEME_PREFIX = /^https?:\/\//i;
/**
 * Normalize a user-supplied site address into a base URL for API requests.
 *
 * Steps, in order: trim whitespace, prepend "https://" when no http(s)
 * scheme is present, drop trailing slashes, drop a trailing "/wp-json".
 *
 * The scheme test looks for a real "http://" / "https://" prefix rather than
 * the bare letters "http", so hosts such as "httpbin.org" still get a scheme
 * and upper-case schemes such as "HTTP://" are recognized.
 */
function normalizeUrl(url: string): string {
	let normalized = url.trim();
	// Add protocol if missing
	if (!HTTP_SCHEME_PREFIX.test(normalized)) {
		normalized = `https://${normalized}`;
	}
	// Remove trailing slash
	normalized = normalized.replace(TRAILING_SLASHES, "");
	// Remove /wp-json if included
	normalized = normalized.replace(WP_JSON_SUFFIX, "");
	return normalized;
}
/**
 * Get public content counts from the REST API.
 *
 * Requests one item from each of the posts, pages and media collections in
 * parallel and reads the collection size from the `X-WP-Total` response
 * header. Counts are best-effort: a failed request, a non-2xx status, or a
 * missing or non-numeric header leaves that key unset.
 */
async function getPublicContentCounts(
	siteUrl: string,
): Promise<{ posts?: number; pages?: number; media?: number }> {
	const result: { posts?: number; pages?: number; media?: number } = {};
	const kinds = ["posts", "pages", "media"] as const;
	try {
		// Fetch with per_page=1 to get total from headers
		const settled = await Promise.allSettled(
			kinds.map((kind) =>
				ssrfSafeFetch(`${siteUrl}/wp-json/wp/v2/${kind}?per_page=1`, {
					signal: AbortSignal.timeout(5000),
				}),
			),
		);
		for (const [index, kind] of kinds.entries()) {
			const outcome = settled[index];
			if (outcome?.status !== "fulfilled" || !outcome.value.ok) continue;
			const total = outcome.value.headers.get("X-WP-Total");
			if (!total) continue;
			const count = parseInt(total, 10);
			// A malformed header must not surface as a NaN count.
			if (!Number.isNaN(count)) {
				result[kind] = count;
			}
		}
	} catch {
		// Counts are optional, continue without them
	}
	return result;
}

View File

@@ -0,0 +1,373 @@
/**
* WXR (WordPress eXtended RSS) import source
*
* Handles WordPress export file uploads (.xml).
* This wraps the existing WXR parsing and analysis logic.
*/
import { gutenbergToPortableText } from "@emdash-cms/gutenberg-to-portable-text";
import { parseWxrString, type WxrData, type WxrPost } from "../../cli/wxr/parser.js";
import type {
ImportSource,
ImportAnalysis,
ImportContext,
SourceInput,
FetchOptions,
NormalizedItem,
PostTypeAnalysis,
AttachmentInfo,
NavMenuAnalysis,
TaxonomyAnalysis,
ReusableBlockAnalysis,
} from "../types.js";
import {
BASE_REQUIRED_FIELDS,
FEATURED_IMAGE_FIELD,
isInternalPostType,
isInternalMetaKey,
mapWpStatus,
mapPostTypeToCollection,
mapMetaKeyToField,
inferMetaType,
slugify,
buildAttachmentMap,
getFilenameFromUrl,
guessMimeType,
checkSchemaCompatibility,
} from "../utils.js";
/**
 * Import source backed by an uploaded WordPress export (WXR) file.
 *
 * Both operations read and parse the uploaded file themselves, so each one
 * can be called independently.
 */
export const wxrSource: ImportSource = {
	id: "wxr",
	name: "WordPress Export File",
	description: "Upload a WordPress export file (.xml)",
	icon: "upload",
	requiresFile: true,
	canProbe: false,

	/**
	 * Parse the uploaded file and summarize its contents.
	 *
	 * @throws Error when the input is not a file.
	 */
	async analyze(input: SourceInput, context: ImportContext): Promise<ImportAnalysis> {
		if (input.type !== "file") {
			throw new Error("WXR source requires a file input");
		}
		const wxr = await parseWxrString(await input.file.text());
		// Existing collections drive the schema compatibility check; without a
		// lookup on the context the analysis runs against an empty set.
		const existingCollections = context.getExistingCollections
			? await context.getExistingCollections()
			: new Map();
		return analyzeWxrData(wxr, existingCollections);
	},

	/**
	 * Yield the posts of the uploaded file that match `options`.
	 *
	 * A post is emitted when its type was requested, is not a WordPress
	 * internal type, and it is published (or drafts were requested).
	 * Iteration stops once `options.limit` items have been emitted.
	 *
	 * @throws Error when the input is not a file.
	 */
	async *fetchContent(input: SourceInput, options: FetchOptions): AsyncGenerator<NormalizedItem> {
		if (input.type !== "file") {
			throw new Error("WXR source requires a file input");
		}
		const wxr = await parseWxrString(await input.file.text());
		// Attachment ID -> URL, used to resolve featured images
		const attachmentMap = buildAttachmentMap(wxr.attachments);
		let emitted = 0;
		for (const post of wxr.posts) {
			const postType = post.postType || "post";
			const wanted =
				options.postTypes.includes(postType) &&
				!isInternalPostType(postType) &&
				(options.includeDrafts || post.status === "publish");
			if (!wanted) {
				continue;
			}
			yield wxrPostToNormalizedItem(post, attachmentMap);
			emitted += 1;
			if (options.limit && emitted >= options.limit) {
				return;
			}
		}
	},
};
/**
 * Summarize parsed WXR data as a normalized ImportAnalysis.
 *
 * One pass over the posts gathers per-post-type counts, per-author counts,
 * which post types use featured images, and usage statistics for every meta
 * key. Those feed the post type, custom field, attachment, menu, taxonomy
 * and reusable block sections of the result. Lists that carry a count are
 * sorted with the largest count first.
 *
 * @param wxr - Parsed export data.
 * @param existingCollections - Collections already present, keyed by slug,
 * used for the schema compatibility check of each post type.
 */
function analyzeWxrData(
	wxr: WxrData,
	existingCollections: Map<string, { slug: string; fields: Map<string, { type: string }> }>,
): ImportAnalysis {
	const countsByPostType = new Map<string, number>();
	const typesWithThumbnail = new Set<string>();
	const metaStats = new Map<string, { count: number; samples: string[]; isInternal: boolean }>();
	const postsByAuthor = new Map<string, number>();

	for (const post of wxr.posts) {
		const postType = post.postType || "post";
		countsByPostType.set(postType, (countsByPostType.get(postType) || 0) + 1);

		// Posts per author, keyed by login
		if (post.creator) {
			postsByAuthor.set(post.creator, (postsByAuthor.get(post.creator) || 0) + 1);
		}

		// A _thumbnail_id meta entry means this post type uses featured images
		if (post.meta.has("_thumbnail_id")) {
			typesWithThumbnail.add(postType);
		}

		// Meta key usage: occurrence count plus up to three truncated samples
		for (const [metaKey, metaValue] of post.meta) {
			let stats = metaStats.get(metaKey);
			if (!stats) {
				stats = { count: 0, samples: [], isInternal: isInternalMetaKey(metaKey) };
				metaStats.set(metaKey, stats);
			}
			stats.count += 1;
			if (metaValue && stats.samples.length < 3) {
				stats.samples.push(metaValue.slice(0, 100));
			}
		}
	}

	// Custom fields: every non-internal meta key, most used first
	const customFields = Array.from(metaStats)
		.filter(([, stats]) => !stats.isInternal)
		.map(([key, { count, samples, isInternal }]) => ({
			key,
			count,
			samples,
			suggestedField: mapMetaKeyToField(key),
			suggestedType: inferMetaType(key, samples[0]),
			isInternal,
		}))
		.toSorted((left, right) => right.count - left.count);

	// Post types with their suggested collection and schema compatibility
	const unsortedPostTypes: PostTypeAnalysis[] = [];
	for (const [name, count] of countsByPostType) {
		if (isInternalPostType(name)) {
			continue;
		}
		const suggestedCollection = mapPostTypeToCollection(name);
		// featured_image is only required when posts of this type have thumbnails
		const requiredFields = typesWithThumbnail.has(name)
			? [...BASE_REQUIRED_FIELDS, FEATURED_IMAGE_FIELD]
			: [...BASE_REQUIRED_FIELDS];
		unsortedPostTypes.push({
			name,
			count,
			suggestedCollection,
			requiredFields,
			schemaStatus: checkSchemaCompatibility(
				requiredFields,
				existingCollections.get(suggestedCollection),
			),
		});
	}
	const postTypes = unsortedPostTypes.toSorted((left, right) => right.count - left.count);

	// Attachments: filename and MIME type are derived from the URL when present
	const attachmentItems: AttachmentInfo[] = wxr.attachments.map(({ id, title, url }) => {
		const filename = url ? getFilenameFromUrl(url) : undefined;
		return {
			id,
			title,
			url,
			filename,
			mimeType: filename ? guessMimeType(filename) : undefined,
		};
	});

	// Navigation menus
	const navMenus: NavMenuAnalysis[] = wxr.navMenus.map(({ name, label, items }) => ({
		name,
		label,
		itemCount: items.length,
	}));

	// Custom taxonomies: wp:term entries outside category/post_tag/nav_menu
	const builtInTaxonomies = new Set(["category", "post_tag", "nav_menu"]);
	const taxonomyStats = new Map<string, { count: number; samples: string[] }>();
	for (const term of wxr.terms) {
		if (builtInTaxonomies.has(term.taxonomy)) {
			continue;
		}
		const stats = taxonomyStats.get(term.taxonomy);
		if (!stats) {
			taxonomyStats.set(term.taxonomy, { count: 1, samples: [term.name] });
			continue;
		}
		stats.count += 1;
		if (stats.samples.length < 3) {
			stats.samples.push(term.name);
		}
	}
	const customTaxonomies: TaxonomyAnalysis[] = [...taxonomyStats]
		.map(([slug, { count, samples }]) => ({
			slug,
			termCount: count,
			sampleTerms: samples,
		}))
		.toSorted((left, right) => right.termCount - left.termCount);

	// Reusable blocks are stored as posts of type wp_block
	const reusableBlocks: ReusableBlockAnalysis[] = [];
	for (const post of wxr.posts) {
		if (post.postType !== "wp_block") {
			continue;
		}
		reusableBlocks.push({
			id: post.id || 0,
			title: post.title || "Untitled Block",
			slug: post.postName || slugify(post.title || `block-${post.id || Date.now()}`),
		});
	}

	const authors = wxr.authors.map((author) => ({
		id: author.id,
		login: author.login,
		email: author.email,
		displayName: author.displayName || author.login || "Unknown",
		postCount: author.login ? postsByAuthor.get(author.login) || 0 : 0,
	}));

	return {
		sourceId: "wxr",
		site: {
			title: wxr.site.title || "WordPress Site",
			url: wxr.site.link || "",
		},
		postTypes,
		attachments: {
			count: wxr.attachments.length,
			items: attachmentItems,
		},
		categories: wxr.categories.length,
		tags: wxr.tags.length,
		authors,
		// Optional sections are omitted entirely when empty
		navMenus: navMenus.length > 0 ? navMenus : undefined,
		customTaxonomies: customTaxonomies.length > 0 ? customTaxonomies : undefined,
		reusableBlocks: reusableBlocks.length > 0 ? reusableBlocks : undefined,
		customFields,
	};
}
/**
 * Translate one WXR post into the importer's NormalizedItem.
 *
 * @param post - Parsed WXR post.
 * @param attachmentMap - Attachment ID -> URL lookup used to turn the
 * `_thumbnail_id` meta value into a featured image URL.
 */
function wxrPostToNormalizedItem(
	post: WxrPost,
	attachmentMap: Map<string, string>,
): NormalizedItem {
	// _thumbnail_id holds the attachment ID of the featured image
	const thumbnailId = post.meta.get("_thumbnail_id");

	// Custom taxonomy assignments arrive as a Map; expose them as a plain record
	const customTaxonomies: Record<string, string[]> | undefined =
		post.customTaxonomies && post.customTaxonomies.size > 0
			? Object.fromEntries(post.customTaxonomies)
			: undefined;

	return {
		sourceId: post.id || 0,
		postType: post.postType || "post",
		status: mapWpStatus(post.status),
		slug: post.postName || slugify(post.title || `post-${post.id || Date.now()}`),
		title: post.title || "Untitled",
		content: post.content ? gutenbergToPortableText(post.content) : [],
		excerpt: post.excerpt,
		date: parseWxrDate(post.postDateGmt, post.pubDate, post.postDate) ?? new Date(),
		modified: parseWxrDate(post.postModifiedGmt, undefined, post.postModified),
		author: post.creator,
		categories: post.categories,
		tags: post.tags,
		meta: Object.fromEntries(post.meta),
		featuredImage: thumbnailId ? attachmentMap.get(String(thumbnailId)) : undefined,
		// Hierarchical content support: a missing or zero parent means "no parent"
		parentId: post.postParent || undefined,
		menuOrder: post.menuOrder,
		customTaxonomies,
	};
}
/**
 * WordPress uses "0000-00-00 00:00:00" as a sentinel for missing GMT dates
 * (e.g. unpublished drafts). This must be treated as absent.
 */
export const WXR_ZERO_DATE = "0000-00-00 00:00:00";
/**
 * Parse a WXR date with the correct fallback chain:
 * 1. GMT date (always UTC, most reliable)
 * 2. pubDate (RFC 2822, includes timezone offset)
 * 3. Site-local date (MySQL datetime without timezone, imprecise but best available)
 *
 * Each candidate is accepted only if it parses to a valid Date; a missing,
 * sentinel or malformed value falls through to the next candidate. In
 * particular a malformed GMT date no longer short-circuits the chain with an
 * Invalid Date.
 *
 * @returns The first valid date, or undefined when none of the inputs yield
 * one. Callers that need a guaranteed Date should use `?? new Date()`.
 */
export function parseWxrDate(
	gmtDate: string | undefined,
	pubDate: string | undefined,
	localDate: string | undefined,
): Date | undefined {
	const toValidDate = (text: string): Date | undefined => {
		const parsed = new Date(text);
		return isNaN(parsed.getTime()) ? undefined : parsed;
	};

	if (gmtDate && gmtDate !== WXR_ZERO_DATE) {
		// GMT dates from WordPress are "YYYY-MM-DD HH:MM:SS" in UTC.
		// Append "Z" so the JS Date constructor treats them as UTC.
		const fromGmt = toValidDate(gmtDate.replace(" ", "T") + "Z");
		if (fromGmt) return fromGmt;
	}
	if (pubDate) {
		// RFC 2822 format includes timezone offset, JS Date parses it correctly
		const fromPubDate = toValidDate(pubDate);
		if (fromPubDate) return fromPubDate;
	}
	if (localDate) {
		// Site-local time without timezone. Normalize to ISO-like form so
		// runtimes that reject "YYYY-MM-DD HH:MM:SS" can still parse it as
		// local time.
		const fromLocal = toValidDate(localDate.replace(" ", "T"));
		if (fromLocal) return fromLocal;
	}
	return undefined;
}
// Export for use in other sources
export { analyzeWxrData, wxrPostToNormalizedItem };
// Re-export shared utilities that other sources may need
export {
BASE_REQUIRED_FIELDS,
FEATURED_IMAGE_FIELD,
mapPostTypeToCollection,
isInternalPostType,
checkSchemaCompatibility,
} from "../utils.js";

View File

@@ -0,0 +1,501 @@
/**
* SSRF protection for import URLs.
*
* Validates that URLs don't target internal/private network addresses.
* Applied before any fetch() call in the import pipeline.
*/
/** IPv4-mapped IPv6 with a dotted-decimal tail, "::ffff:a.b.c.d"; captures the IPv4 part. */
const IPV4_MAPPED_IPV6_DOTTED_PATTERN = /^::ffff:(\d+\.\d+\.\d+\.\d+)$/i;
/** IPv4-mapped IPv6 in hex form, "::ffff:XXXX:XXXX"; captures the two 16-bit groups. */
const IPV4_MAPPED_IPV6_HEX_PATTERN = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i;
/** IPv4-translated form, "::ffff:0:XXXX:XXXX"; captures the two 16-bit groups. */
const IPV4_TRANSLATED_HEX_PATTERN = /^::ffff:0:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i;
/**
* Expanded IPv4-mapped form with five zero groups (each zero to four "0"
* digits) before "ffff"; captures the two trailing 16-bit groups.
*/
const IPV6_EXPANDED_MAPPED_PATTERN =
/^0{0,4}:0{0,4}:0{0,4}:0{0,4}:0{0,4}:ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i;
/**
* IPv4-compatible (deprecated) addresses: ::XXXX:XXXX
*
* The WHATWG URL parser normalizes [::127.0.0.1] to [::7f00:1] (no ffff prefix).
* These are deprecated but still parsed, and bypass the ffff-based checks.
*/
const IPV4_COMPATIBLE_HEX_PATTERN = /^::([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i;
/**
* NAT64 prefix (RFC 6052): 64:ff9b::XXXX:XXXX
*
* Used by NAT64 gateways to embed IPv4 addresses in IPv6.
* [64:ff9b::127.0.0.1] normalizes to [64:ff9b::7f00:1].
*/
const NAT64_HEX_PATTERN = /^64:ff9b::([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i;
const IPV6_BRACKET_PATTERN = /^\[|\]$/g;
/** Match fc00::/7 ULA — first byte 0xfc or 0xfd followed by any byte. */
const IPV6_ULA_FC_PATTERN = /^fc[0-9a-f]{2}:/;
const IPV6_ULA_FD_PATTERN = /^fd[0-9a-f]{2}:/;
/** Strip trailing dots from an FQDN-form hostname ("localhost." -> "localhost"). */
const TRAILING_DOT_PATTERN = /\.+$/;
/**
* Private and reserved IPv4 ranges that should never be fetched, stored as
* inclusive numeric start/end bounds built with ip4ToNum.
*
* Includes:
* - Loopback (127.0.0.0/8)
* - Private (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16)
* - Link-local (169.254.0.0/16)
* - Cloud metadata (169.254.169.254 — AWS/GCP/Azure), as part of link-local
* - Current network (0.0.0.0/8)
*
* IPv6 loopback, link-local and unique-local addresses are not listed here;
* isPrivateIp checks those separately.
*/
const BLOCKED_PATTERNS: Array<{ start: number; end: number }> = [
// 127.0.0.0/8 — loopback
{ start: ip4ToNum(127, 0, 0, 0), end: ip4ToNum(127, 255, 255, 255) },
// 10.0.0.0/8 — private
{ start: ip4ToNum(10, 0, 0, 0), end: ip4ToNum(10, 255, 255, 255) },
// 172.16.0.0/12 — private
{ start: ip4ToNum(172, 16, 0, 0), end: ip4ToNum(172, 31, 255, 255) },
// 192.168.0.0/16 — private
{ start: ip4ToNum(192, 168, 0, 0), end: ip4ToNum(192, 168, 255, 255) },
// 169.254.0.0/16 — link-local (includes cloud metadata endpoint)
{ start: ip4ToNum(169, 254, 0, 0), end: ip4ToNum(169, 254, 255, 255) },
// 0.0.0.0/8 — current network
{ start: ip4ToNum(0, 0, 0, 0), end: ip4ToNum(0, 255, 255, 255) },
];
// Bracket-stripped form is used for lookups (validateExternalUrl strips
// brackets from parsed.hostname before checking), so "::1" appears here
// without brackets. The "::1" case is already covered by isPrivateIp, but
// keeping it here makes the intent explicit and gives a clearer error
// message for the common `http://[::1]/` form.
const BLOCKED_HOSTNAMES = new Set([
"localhost",
"metadata.google.internal",
"metadata.google",
"::1",
]);
/**
* Wildcard DNS services that publicly resolve arbitrary IPs embedded in the
* hostname. Commonly used in local dev and by SSRF exploit tooling to bypass
* hostname-only blocklists (e.g. 127.0.0.1.nip.io -> 127.0.0.1).
*
* Matched case-insensitively as a suffix, so both the apex and any subdomain
* are blocked.
*/
const BLOCKED_HOSTNAME_SUFFIXES = [
"nip.io",
"sslip.io",
"xip.io",
"traefik.me",
"lvh.me",
"localtest.me",
];
/** Allowed URL schemes, in the colon-suffixed form of `URL.protocol`; any other scheme is rejected. */
const ALLOWED_SCHEMES = new Set(["http:", "https:"]);
/**
 * Pack four IPv4 octets into a single unsigned 32-bit number, first octet
 * in the most significant byte.
 */
function ip4ToNum(a: number, b: number, c: number, d: number): number {
	const packed = (a << 24) | (b << 16) | (c << 8) | d;
	// `>>> 0` reinterprets the signed 32-bit result as unsigned.
	return packed >>> 0;
}
/**
 * Parse a dotted-decimal IPv4 string into its packed numeric form.
 *
 * @returns The packed address, or null when the string does not have exactly
 * four dot-separated parts that each convert to a number in 0..255.
 */
function parseIpv4(ip: string): number | null {
	const octets = ip.split(".").map(Number);
	if (octets.length !== 4) return null;
	for (const octet of octets) {
		if (isNaN(octet) || octet < 0 || octet > 255) return null;
	}
	const [first, second, third, fourth] = octets;
	return ip4ToNum(first, second, third, fourth);
}
/**
 * Recover the IPv4 address embedded in the last 32 bits of an IPv6 literal.
 *
 * The WHATWG URL parser rewrites dotted-decimal tails to hex, e.g.
 *   [::ffff:127.0.0.1] -> [::ffff:7f00:1]
 *   [::ffff:169.254.169.254] -> [::ffff:a9fe:a9fe]
 * so the private-range check has to undo that rewrite to see the real target.
 *
 * Recognized forms, tried in this order:
 * - IPv4-mapped: ::ffff:XXXX:XXXX
 * - IPv4-translated (RFC 6052): ::ffff:0:XXXX:XXXX
 * - expanded IPv4-mapped: 0000:0000:0000:0000:0000:ffff:XXXX:XXXX
 * - IPv4-compatible (deprecated): ::XXXX:XXXX
 * - NAT64 (RFC 6052): 64:ff9b::XXXX:XXXX
 *
 * @returns The dotted-decimal IPv4 address, or null when `ip` matches none
 * of the forms.
 */
export function normalizeIPv6MappedToIPv4(ip: string): string | null {
	const embeddedForms = [
		IPV4_MAPPED_IPV6_HEX_PATTERN,
		IPV4_TRANSLATED_HEX_PATTERN,
		IPV6_EXPANDED_MAPPED_PATTERN,
		IPV4_COMPATIBLE_HEX_PATTERN,
		NAT64_HEX_PATTERN,
	];
	for (const form of embeddedForms) {
		const groups = ip.match(form);
		if (!groups) continue;
		// Each captured group is 16 bits: two IPv4 octets.
		const upper = parseInt(groups[1] ?? "", 16);
		const lower = parseInt(groups[2] ?? "", 16);
		return [(upper >> 8) & 0xff, upper & 0xff, (lower >> 8) & 0xff, lower & 0xff].join(".");
	}
	return null;
}
/** Match fe80::/10 link-local — first 16-bit group fe80 through febf. */
const IPV6_LINK_LOCAL_PATTERN = /^fe[89ab][0-9a-f]:/;
/**
 * Decide whether an IP literal points at a loopback, private, link-local or
 * otherwise internal address that must not be fetched.
 *
 * Handles dotted-decimal IPv4, IPv6 forms that embed an IPv4 address, IPv6
 * loopback (::1), the IPv6 unspecified address (::), unique-local
 * (fc00::/7) and link-local (fe80::/10) addresses.
 *
 * @returns true when the address is internal; false for public addresses
 * and for strings that are not recognized as an internal IP literal
 * (including ordinary hostnames).
 */
function isPrivateIp(ip: string): boolean {
	// Normalize IPv6 strings to lowercase. `new URL().hostname` already
	// lowercases, but resolver output (from DoH or an injected resolver) may
	// not. Without this, "FE80::1" bypasses the link-local check.
	const normalized = ip.toLowerCase();
	// Handle IPv6 loopback
	if (normalized === "::1" || normalized === "::ffff:127.0.0.1") return true;
	// The unspecified address is the IPv6 counterpart of 0.0.0.0, which the
	// IPv4 blocklist already rejects (0.0.0.0/8).
	if (normalized === "::") return true;
	// Handle IPv4-mapped IPv6 in hex form (WHATWG URL parser normalizes to this)
	// e.g. ::ffff:7f00:1 -> 127.0.0.1, ::ffff:a9fe:a9fe -> 169.254.169.254
	const hexIpv4 = normalizeIPv6MappedToIPv4(normalized);
	if (hexIpv4) return isPrivateIp(hexIpv4);
	// Handle IPv4-mapped IPv6 in dotted-decimal form
	const v4Match = normalized.match(IPV4_MAPPED_IPV6_DOTTED_PATTERN);
	const ipv4 = v4Match ? v4Match[1] : normalized;
	const num = parseIpv4(ipv4);
	if (num === null) {
		// If we can't parse it, block IPv6 addresses that look internal.
		// fc00::/7 is Unique Local (first byte 0xfc or 0xfd), fe80::/10 is
		// link-local (first group fe80..febf, not just the literal "fe80").
		// The patterns require a full four-digit first group followed by a
		// colon to avoid collisions with hypothetical non-address strings.
		return (
			IPV6_LINK_LOCAL_PATTERN.test(normalized) ||
			IPV6_ULA_FC_PATTERN.test(normalized) ||
			IPV6_ULA_FD_PATTERN.test(normalized)
		);
	}
	return BLOCKED_PATTERNS.some((range) => num >= range.start && num <= range.end);
}
/**
 * Raised when SSRF protection refuses a URL.
 *
 * Callers can recognize it by `instanceof SsrfError`, by `name`, or by the
 * stable `code` value "SSRF_BLOCKED".
 */
export class SsrfError extends Error {
	code = "SSRF_BLOCKED" as const;
	override name = "SsrfError";

	constructor(message: string) {
		super(message);
	}
}
/** Maximum number of redirects to follow in ssrfSafeFetch */
const MAX_REDIRECTS = 5;
/**
 * Validate that a URL is safe to fetch (not targeting internal networks).
 *
 * Checks:
 * 1. URL is well-formed with http/https scheme
 * 2. Hostname is not a known internal name (localhost, metadata endpoints)
 * 3. Hostname is not a wildcard DNS service (nip.io and similar) or one of
 *    its subdomains
 * 4. If hostname is an IP literal, it's not in a private range
 *
 * Note: DNS rebinding attacks are not fully mitigated (hostname could resolve
 * to a private IP). Full protection requires resolving DNS and checking the IP
 * before connecting, which needs a custom fetch implementation. This covers
 * the most common SSRF vectors.
 *
 * @param url - Absolute URL to check.
 * @returns The parsed URL when every check passes.
 * @throws SsrfError if the URL is malformed, uses a scheme other than
 * http/https, or targets an internal address
 */
export function validateExternalUrl(url: string): URL {
	let parsed: URL;
	try {
		parsed = new URL(url);
	} catch {
		throw new SsrfError("Invalid URL");
	}
	// Only allow http/https
	if (!ALLOWED_SCHEMES.has(parsed.protocol)) {
		throw new SsrfError(`Scheme '${parsed.protocol}' is not allowed`);
	}
	// Strip brackets from IPv6 hostname
	const hostname = parsed.hostname.replace(IPV6_BRACKET_PATTERN, "");
	// Normalize the hostname for blocklist matching: lowercase + strip any
	// trailing dots. WHATWG preserves trailing dots on .hostname, so without
	// this normalization "localhost." and "nip.io." bypass the checks.
	const normalizedHost = hostname.toLowerCase().replace(TRAILING_DOT_PATTERN, "");
	// Check against known internal hostnames
	if (BLOCKED_HOSTNAMES.has(normalizedHost)) {
		throw new SsrfError("URLs targeting internal hosts are not allowed");
	}
	// Check against wildcard DNS services used by SSRF tooling to bypass
	// hostname-only checks. Match the apex and any subdomain.
	for (const suffix of BLOCKED_HOSTNAME_SUFFIXES) {
		if (normalizedHost === suffix || normalizedHost.endsWith(`.${suffix}`)) {
			throw new SsrfError("URLs targeting wildcard DNS services are not allowed");
		}
	}
	// Check if hostname is an IP address in a private range. Use the
	// normalized form so "127.0.0.1.." and friends don't bypass parseIpv4
	// (which rejects extra trailing dots).
	if (isPrivateIp(normalizedHost)) {
		throw new SsrfError("URLs targeting private IP addresses are not allowed");
	}
	return parsed;
}
// ---------------------------------------------------------------------------
// DNS-aware validation
// ---------------------------------------------------------------------------
/**
* A resolver that maps a hostname to a list of IPv4/IPv6 addresses.
* Injectable so callers can swap in OS-level DNS on Node, stub it in tests,
* or point to a different DoH endpoint.
*/
export type DnsResolver = (hostname: string) => Promise<string[]>;
/**
 * Resolver used when a caller does not supply its own; null means none has
 * been installed. Tests replace it with a stub so fetch mocks don't see
 * unexpected DoH round-trips. Production code should leave it alone.
 */
let defaultResolver: DnsResolver | null = null;
/**
 * Install (or, with null, clear) the module-level default DNS resolver.
 *
 * @returns The resolver that was installed before this call, so the caller
 * can restore it afterwards.
 */
export function setDefaultDnsResolver(resolver: DnsResolver | null): DnsResolver | null {
	const replaced = defaultResolver;
	defaultResolver = resolver;
	return replaced;
}
/** Timeout for a single DoH request, in milliseconds. */
const DOH_TIMEOUT_MS = 3000;
/** Default DoH endpoint — Cloudflare's public resolver. */
const DEFAULT_DOH_URL = "https://cloudflare-dns.com/dns-query";
/** One record from a DoH JSON answer; `data` is the record value as text. */
interface DohAnswer {
	data: string;
}
/** The subset of a DoH JSON response this module reads. */
interface DohResponse {
	/** DNS response code (rcode); 0 means success. */
	Status: number;
	Answer: DohAnswer[];
}
/** Type guard: `obj` is a non-null object on which `key` is present. */
function hasProperty<K extends string>(obj: unknown, key: K): obj is Record<K, unknown> {
	if (typeof obj !== "object" || obj === null) {
		return false;
	}
	return key in obj;
}
/**
 * Narrow an unknown JSON body to a DohResponse shape we can read safely.
 *
 * A missing or non-array `Answer` becomes an empty list, and answer entries
 * without a string `data` field are dropped.
 *
 * @throws Error if `Status` is missing or not a number — a malformed body is
 * indistinguishable from a failure and must not be silently treated as empty.
 */
function parseDohResponse(raw: unknown): DohResponse {
	const hasNumericStatus = hasProperty(raw, "Status") && typeof raw.Status === "number";
	if (!hasProperty(raw, "Status") || typeof raw.Status !== "number" || !hasNumericStatus) {
		throw new Error("DoH response missing Status field");
	}
	const entries: unknown[] =
		hasProperty(raw, "Answer") && Array.isArray(raw.Answer) ? raw.Answer : [];
	const answers: DohAnswer[] = [];
	for (const entry of entries) {
		if (!hasProperty(entry, "data")) continue;
		const { data } = entry;
		if (typeof data === "string") {
			answers.push({ data });
		}
	}
	return { Status: raw.Status, Answer: answers };
}
/**
 * DnsResolver that looks a hostname up via DNS over HTTPS (Cloudflare) and
 * returns the IP literals from its A and AAAA answers. Uses only the global
 * fetch, so it works in both Workers and Node without requiring node:dns.
 *
 * Fails closed: a network error, a timeout, a non-2xx response, a malformed
 * body, or any DNS rcode other than 0 (success) and 3 (NXDOMAIN) rejects the
 * promise so the calling validator treats it as a block.
 */
export const cloudflareDohResolver: DnsResolver = async (hostname) => {
	const lookup = async (recordType: "A" | "AAAA"): Promise<string[]> => {
		const search = new URLSearchParams({ name: hostname, type: recordType });
		const abort = new AbortController();
		const timer = setTimeout(() => abort.abort(), DOH_TIMEOUT_MS);
		try {
			const response = await globalThis.fetch(`${DEFAULT_DOH_URL}?${search.toString()}`, {
				headers: { Accept: "application/dns-json" },
				signal: abort.signal,
			});
			if (!response.ok) {
				throw new Error(`DoH lookup failed: ${response.status}`);
			}
			const body = parseDohResponse(await response.json());
			// NXDOMAIN (3) is a legitimate "does not exist" — treat as empty.
			if (body.Status === 3) {
				return [];
			}
			// Any other non-zero status (SERVFAIL=2, REFUSED=5, etc.) is
			// ambiguous and could be a split-view attacker hiding records
			// from our resolver. Fail closed.
			if (body.Status !== 0) {
				throw new Error(`DoH ${recordType} lookup failed: rcode=${body.Status}`);
			}
			// Answers often carry CNAME records next to A/AAAA records, and a
			// CNAME's `data` is a hostname. Keep only IP literals so the
			// private-range check sees real addresses.
			const values = body.Answer.map((answer) => answer.data);
			return values.filter(isIpLiteral);
		} finally {
			clearTimeout(timer);
		}
	};

	// Both record types are queried in parallel; either failure rejects.
	const [ipv4Addresses, ipv6Addresses] = await Promise.all([lookup("A"), lookup("AAAA")]);
	return ipv4Addresses.concat(ipv6Addresses);
};
/**
 * Validate a URL, then resolve its hostname and check the resulting
 * addresses against the private-range blocklist. This catches
 * attacker-controlled domains that publicly resolve to private addresses, and
 * wildcard DNS services like nip.io used by exploit tooling.
 *
 * `validateExternalUrl` runs first for the cheap pre-flight checks (scheme,
 * literal IP, known-bad hostnames). A hostname that is already an IP literal
 * is returned right after that step without any DNS lookup. Otherwise the
 * hostname is resolved and the URL is rejected if ANY returned address is
 * private.
 *
 * Resolver precedence: `options.resolver`, then the resolver installed via
 * `setDefaultDnsResolver`, then `cloudflareDohResolver`.
 *
 * Fails closed: if resolution fails or returns no records, throws SsrfError.
 *
 * **Caveats.** This does NOT fully close the TOCTOU between check and
 * connect. Attacks that still work against this layer include:
 *
 * - TTL=0 rebind: authoritative server returns public IP to the check, then
 *   private IP to the subsequent fetch() a few milliseconds later.
 * - Split-view via EDNS Client Subnet or source-IP inspection: the
 *   authoritative server returns public IP to Cloudflare's DoH resolver and
 *   private IP to the victim's own resolver (used by fetch()).
 * - Host-file overrides or split-horizon corporate DNS on self-hosted Node.
 * - Attacker-controlled rebinding services the caller has allowlisted.
 *
 * The only complete defense is a network-layer egress firewall. On
 * Cloudflare Workers, the platform fetch pipeline provides most of that.
 * On self-hosted Node, operators must restrict egress themselves.
 *
 * @returns The parsed URL when every check passes.
 * @throws SsrfError on resolution failure, empty result, or a private address.
 */
export async function resolveAndValidateExternalUrl(
	url: string,
	options?: { resolver?: DnsResolver },
): Promise<URL> {
	const target = validateExternalUrl(url);
	// URL.hostname keeps the brackets around IPv6 literals; drop them.
	const host = target.hostname.replace(IPV6_BRACKET_PATTERN, "");
	if (isIpLiteral(host)) {
		// validateExternalUrl already vetted literal addresses — no DNS needed.
		return target;
	}

	const resolve = options?.resolver ?? defaultResolver ?? cloudflareDohResolver;
	let resolved: string[];
	try {
		resolved = await resolve(host);
	} catch (cause) {
		const detail = cause instanceof Error ? cause.message : String(cause);
		throw new SsrfError(`Could not resolve hostname: ${detail}`);
	}

	if (resolved.length === 0) {
		throw new SsrfError("Hostname resolved to no addresses");
	}
	if (resolved.some((address) => isPrivateIp(address))) {
		throw new SsrfError("Hostname resolves to a private IP address");
	}
	return target;
}
/**
 * True when a string looks like an IPv4 or IPv6 literal.
 *
 * IPv4 is decided by `parseIpv4`. IPv6 detection is a deliberately loose
 * heuristic: any string containing a colon qualifies, which is safe because
 * a colon is never valid in a DNS hostname.
 */
function isIpLiteral(host: string): boolean {
	return parseIpv4(host) !== null || host.includes(":");
}
/** Headers that must be stripped when a redirect crosses origins */
const CREDENTIAL_HEADERS = ["authorization", "cookie", "proxy-authorization"];

/**
 * Headers that describe a request body. Removed when a redirect turns the
 * request into a body-less GET (the Fetch Standard's request-body-header
 * names, plus Content-Length in case the caller set it by hand).
 */
const REQUEST_BODY_HEADERS = [
	"content-encoding",
	"content-language",
	"content-location",
	"content-type",
	"content-length",
];

/**
 * Fetch a URL with SSRF protection on redirects.
 *
 * Uses `redirect: "manual"` to intercept redirects and re-validate each
 * redirect target against SSRF rules before following it. This prevents
 * an attacker from setting up an allowed external URL that redirects to
 * an internal IP (e.g. 169.254.169.254 for cloud metadata).
 *
 * Redirect handling mirrors what `fetch` does when it follows redirects
 * itself:
 * - credential headers are dropped once a redirect crosses origins;
 * - a 303 (for anything but GET/HEAD) and a 301/302 answering a POST are
 *   followed with a body-less GET instead of replaying the original body;
 * - the body of each intermediate redirect response is cancelled so the
 *   underlying connection is released.
 *
 * A 3xx response without a `Location` header is returned to the caller as-is.
 *
 * @param url - Initial URL to request.
 * @param init - Standard fetch options; `redirect` is always forced to "manual".
 * @param options - Optional DNS resolver override used for validation.
 * @returns The first non-redirect response.
 * @throws SsrfError if the initial URL or any redirect target is internal,
 *   or if more than MAX_REDIRECTS redirects are encountered.
 */
export async function ssrfSafeFetch(
	url: string,
	init?: RequestInit,
	options?: { resolver?: DnsResolver },
): Promise<Response> {
	let currentUrl = url;
	let currentInit = init;
	for (let i = 0; i <= MAX_REDIRECTS; i++) {
		await resolveAndValidateExternalUrl(currentUrl, options);
		const response = await globalThis.fetch(currentUrl, {
			...currentInit,
			redirect: "manual",
		});
		// Not a redirect -- return directly
		if (response.status < 300 || response.status >= 400) {
			return response;
		}
		// Extract redirect target
		const location = response.headers.get("Location");
		if (!location) {
			return response;
		}
		// The redirect response itself is discarded; release its body so the
		// connection is not held open. Best effort: a body that is already
		// locked or closed must not abort the redirect chain.
		try {
			await response.body?.cancel();
		} catch {
			// Deliberately ignored — nothing useful can be done here.
		}
		// Resolve relative redirects against the current URL
		const previousOrigin = new URL(currentUrl).origin;
		currentUrl = new URL(location, currentUrl).href;
		const nextOrigin = new URL(currentUrl).origin;
		// Strip credential headers on cross-origin redirects
		if (previousOrigin !== nextOrigin && currentInit) {
			currentInit = stripCredentialHeaders(currentInit);
		}
		// Per the Fetch Standard, these redirects switch to GET and drop the body.
		const method = (currentInit?.method ?? "GET").toUpperCase();
		const becomesGet =
			((response.status === 301 || response.status === 302) && method === "POST") ||
			(response.status === 303 && method !== "GET" && method !== "HEAD");
		if (becomesGet && currentInit) {
			const headers = new Headers(currentInit.headers);
			for (const name of REQUEST_BODY_HEADERS) {
				headers.delete(name);
			}
			currentInit = { ...currentInit, method: "GET", body: undefined, headers };
		}
	}
	throw new SsrfError(`Too many redirects (max ${MAX_REDIRECTS})`);
}
/**
 * Produce a copy of `init` whose headers no longer contain any of the
 * CREDENTIAL_HEADERS entries. The input is never mutated; when `init` has no
 * headers at all the very same object is handed back.
 */
export function stripCredentialHeaders(init: RequestInit): RequestInit {
	const { headers: original } = init;
	if (!original) {
		return init;
	}
	const sanitized = new Headers(original);
	CREDENTIAL_HEADERS.forEach((name) => {
		sanitized.delete(name);
	});
	return { ...init, headers: sanitized };
}

View File

@@ -0,0 +1,418 @@
/**
* Import source abstraction
*
* Allows different import sources (WXR file, WordPress.com API, REST API, plugin)
* to all produce the same normalized format for the import flow.
*/
import type { PortableTextBlock } from "@emdash-cms/gutenberg-to-portable-text";
// =============================================================================
// Author Types
// =============================================================================
/**
 * Author info from WordPress.
 *
 * Every identifying field is optional; only `postCount` is always present.
 */
export interface WpAuthorInfo {
id?: number;
login?: string;
email?: string;
displayName?: string;
postCount: number;
}
// =============================================================================
// Source Input Types
// =============================================================================
/** File-based input (WXR upload) */
export interface FileInput {
/** Discriminant of the SourceInput union. */
type: "file";
file: File;
}
/** URL-based input (REST API probe) */
export interface UrlInput {
/** Discriminant of the SourceInput union. */
type: "url";
url: string;
/** Optional auth token for authenticated requests */
token?: string;
}
/** OAuth-based input (WordPress.com) */
export interface OAuthInput {
/** Discriminant of the SourceInput union. */
type: "oauth";
url: string;
accessToken: string;
/** Site ID for WordPress.com */
siteId?: string;
}
/** Any input an import source can receive; narrow on the `type` field. */
export type SourceInput = FileInput | UrlInput | OAuthInput;
// =============================================================================
// Probe Result Types
// =============================================================================
/** Auth requirements for an import source */
export interface SourceAuth {
/** Kind of authentication required; "none" means no auth is needed. */
type: "oauth" | "token" | "password" | "none";
/** OAuth provider identifier */
provider?: string;
/** OAuth authorization URL */
oauthUrl?: string;
/** Human-readable instructions */
instructions?: string;
}
/** What the source can provide. All flags are required booleans. */
export interface SourceCapabilities {
/** Can fetch published content without auth */
publicContent: boolean;
/** Can fetch drafts/private (may need auth) */
privateContent: boolean;
/** Can fetch all custom post types */
customPostTypes: boolean;
/** Can fetch all meta fields */
allMeta: boolean;
/** Can stream media directly */
mediaStream: boolean;
}
/**
 * Suggested next action after probe.
 *
 * Discriminated on `type`: "oauth" carries the authorization URL and provider,
 * while "upload" and "install-plugin" carry instructions text.
 */
export type SuggestedAction =
| { type: "proceed" }
| { type: "oauth"; url: string; provider: string }
| { type: "upload"; instructions: string }
| { type: "install-plugin"; instructions: string };
/**
 * Detected i18n/multilingual plugin info.
 *
 * Attached as the optional `i18n` field of both SourceProbeResult and
 * ImportAnalysis.
 */
export interface I18nDetection {
/** Multilingual plugin name (e.g. "wpml", "polylang") */
plugin: string;
/** BCP 47 default locale */
defaultLocale: string;
/** All configured locales */
locales: string[];
}
/**
 * Result of probing a URL for a specific source.
 *
 * This is what an ImportSource's optional `probe()` resolves to when it
 * recognises the URL (it resolves to null otherwise).
 */
export interface SourceProbeResult {
/** Which source can handle this */
sourceId: string;
/** Confidence level */
confidence: "definite" | "likely" | "possible";
/** What we detected */
detected: {
platform: string;
version?: string;
siteTitle?: string;
siteUrl?: string;
};
/** What capabilities are available */
capabilities: SourceCapabilities;
/** What auth is needed, if any */
auth?: SourceAuth;
/** Suggested next step */
suggestedAction: SuggestedAction;
/** Preview data if available (e.g., post counts from REST API) */
preview?: {
posts?: number;
pages?: number;
media?: number;
};
/** Detected multilingual plugin. Absent when none detected. */
i18n?: I18nDetection;
}
/** Combined probe result from all sources */
export interface ProbeResult {
/** URL the result refers to. */
url: string;
isWordPress: boolean;
/** Best matching source (highest confidence); null when there is none */
bestMatch: SourceProbeResult | null;
/** All matching sources */
allMatches: SourceProbeResult[];
}
// =============================================================================
// Analysis Types (normalized from all sources)
// =============================================================================
/** Field definition for import */
export interface ImportFieldDef {
/** Field identifier; used as the lookup key when comparing against an existing collection's fields. */
slug: string;
label: string;
/** Field type name, e.g. "string", "text", "portableText" or "image". */
type: string;
required: boolean;
searchable?: boolean;
}
/**
 * Field compatibility with existing schema.
 *
 * As assigned by `checkSchemaCompatibility`: "missing" when the collection has
 * no field with that slug, "compatible" when the types are compatible, and
 * "type_mismatch" otherwise.
 */
export type FieldCompatibility = "compatible" | "type_mismatch" | "missing";
/** Schema status for a collection */
export interface CollectionSchemaStatus {
/** Whether the target collection already exists. */
exists: boolean;
/** Per-field result, keyed by field slug. `existingType` is only set when the field exists. */
fieldStatus: Record<
string,
{
status: FieldCompatibility;
existingType?: string;
requiredType: string;
}
>;
/** False when at least one field was reported as "type_mismatch". */
canImport: boolean;
/** Explanation for `canImport` being false (lists the incompatible field slugs). */
reason?: string;
}
/** Analysis of a single post type */
export interface PostTypeAnalysis {
/** Post type name. */
name: string;
/** Item count for this post type (presumably items found in the source — TODO confirm). */
count: number;
/** Collection proposed as the import target. */
suggestedCollection: string;
/** Fields the target collection needs for this post type. */
requiredFields: ImportFieldDef[];
/** How `requiredFields` compare against the existing collection, if any. */
schemaStatus: CollectionSchemaStatus;
}
/** Attachment/media info. Every field is optional. */
export interface AttachmentInfo {
id?: number;
title?: string;
url?: string;
filename?: string;
mimeType?: string;
alt?: string;
caption?: string;
width?: number;
height?: number;
}
/** Navigation menu analysis; listed under `ImportAnalysis.navMenus`. */
export interface NavMenuAnalysis {
/** Menu name/slug */
name: string;
/** Menu display label */
label: string;
/** Number of items in this menu */
itemCount: number;
}
/** Custom taxonomy analysis; listed under `ImportAnalysis.customTaxonomies`. */
export interface TaxonomyAnalysis {
/** Taxonomy slug (e.g., 'genre', 'portfolio_category') */
slug: string;
/** Number of terms in this taxonomy */
termCount: number;
/** Sample term names */
sampleTerms: string[];
}
/** Reusable block analysis (wp_block post type); listed under `ImportAnalysis.reusableBlocks`. */
export interface ReusableBlockAnalysis {
/** Original WP ID */
id: number;
/** Block title */
title: string;
/** Block slug */
slug: string;
}
/**
 * Normalized analysis result - same format for all sources.
 *
 * This is what `ImportSource.analyze()` resolves to.
 */
export interface ImportAnalysis {
/** Source that produced this analysis */
sourceId: string;
site: {
title: string;
url: string;
};
postTypes: PostTypeAnalysis[];
attachments: {
count: number;
items: AttachmentInfo[];
};
/** Numeric count only (no per-category detail). */
categories: number;
/** Numeric count only (no per-tag detail). */
tags: number;
authors: WpAuthorInfo[];
/** Navigation menus found in the export */
navMenus?: NavMenuAnalysis[];
/** Custom taxonomies (beyond categories/tags) */
customTaxonomies?: TaxonomyAnalysis[];
/** Reusable blocks (wp_block post type) - will be imported as sections */
reusableBlocks?: ReusableBlockAnalysis[];
/** Source-specific custom fields analysis */
customFields?: Array<{
key: string;
count: number;
samples: string[];
suggestedField: string;
suggestedType: "string" | "number" | "boolean" | "date" | "json";
isInternal: boolean;
}>;
/** Detected multilingual plugin. Absent when none detected. */
i18n?: I18nDetection;
}
// =============================================================================
// Normalized Content Types
// =============================================================================
/**
 * Normalized content item - produced by all sources.
 *
 * Items of this shape are what `ImportSource.fetchContent()` yields.
 */
export interface NormalizedItem {
/** Original ID from source */
sourceId: string | number;
/** WordPress post type */
postType: string;
/** Content status (WordPress status names) */
status: "publish" | "draft" | "pending" | "private" | "future";
/** URL slug */
slug: string;
/** Title */
title: string;
/** Content as Portable Text (already converted) */
content: PortableTextBlock[];
/** Excerpt/summary */
excerpt?: string;
/** Publication date */
date: Date;
/** Last modified date */
modified?: Date;
/** Author identifier */
author?: string;
/** Category slugs */
categories?: string[];
/** Tag slugs */
tags?: string[];
/** Custom meta fields */
meta?: Record<string, unknown>;
/** Featured image URL */
featuredImage?: string;
/** Parent post ID (for hierarchical content like pages) */
parentId?: string | number;
/** Menu order for sorting */
menuOrder?: number;
/** Custom taxonomy assignments beyond categories/tags */
customTaxonomies?: Record<string, string[]>;
/** BCP 47 locale code. When omitted, defaults to defaultLocale. */
locale?: string;
/**
* Source-side translation group ID (opaque string from the origin system).
* Items sharing the same translationGroup are linked as translations.
* Resolved to an EmDash translation_group ULID during execute.
*/
translationGroup?: string;
}
// =============================================================================
// Import Configuration & Results
// =============================================================================
/** Post type mapping configuration; one entry per post type in `ImportConfig.postTypeMappings`. */
export interface PostTypeMapping {
enabled: boolean;
/** Target collection for this post type. */
collection: string;
}
/** Import configuration */
export interface ImportConfig {
/** Mapping per post type (presumably keyed by WordPress post type name — TODO confirm). */
postTypeMappings: Record<string, PostTypeMapping>;
skipExisting?: boolean;
}
/** Options for fetching content; second argument of `ImportSource.fetchContent()`. */
export interface FetchOptions {
/** Post types to fetch */
postTypes: string[];
/** Whether to include drafts */
includeDrafts?: boolean;
/** Limit number of items (for testing) */
limit?: number;
}
/** Import result */
export interface ImportResult {
success: boolean;
imported: number;
skipped: number;
/** One entry per failure, pairing a title with its error message. */
errors: Array<{ title: string; error: string }>;
/** Counts per collection (presumably imported items, keyed by collection name — TODO confirm). */
byCollection: Record<string, number>;
}
// =============================================================================
// Import Source Interface
// =============================================================================
/**
* An import source provides content from an external system.
* All sources produce the same normalized analysis and content format.
*
* `analyze` and `fetchContent` are mandatory; `probe` and `fetchMedia` are
* optional members a source may leave out.
*/
export interface ImportSource {
/** Unique identifier */
id: string;
/** Display name */
name: string;
/** Description for UI */
description: string;
/** Icon identifier */
icon: "upload" | "globe" | "wordpress" | "plug";
/** Whether this source requires a file upload */
requiresFile?: boolean;
/** Whether this source can probe URLs */
canProbe?: boolean;
/**
* Probe a URL to see if this source can handle it.
* Returns null if not applicable.
*/
probe?(url: string): Promise<SourceProbeResult | null>;
/**
* Analyze content from this source.
* Returns normalized ImportAnalysis.
*/
analyze(input: SourceInput, context: ImportContext): Promise<ImportAnalysis>;
/**
* Stream content items for import.
* Yields normalized content items.
*/
fetchContent(input: SourceInput, options: FetchOptions): AsyncGenerator<NormalizedItem>;
/**
* Fetch a media item's data.
* Used for media import.
*/
fetchMedia?(url: string, input: SourceInput): Promise<Blob>;
}
/** Context passed to import sources. Both members are optional. */
export interface ImportContext {
/** Database connection for schema checks (untyped here) */
db?: unknown;
/**
 * Function to check existing collections. Resolves to a map whose values
 * carry the collection slug and a map of field slug to `{ type }` — the same
 * shape `checkSchemaCompatibility` accepts as its existing collection.
 */
getExistingCollections?: () => Promise<
Map<string, { slug: string; fields: Map<string, { type: string }> }>
>;
}

View File

@@ -0,0 +1,412 @@
/**
* Shared import utilities
*
* Common constants and functions used across all WordPress import sources.
*/
import mime from "mime/lite";
import type { ImportFieldDef, CollectionSchemaStatus } from "./types.js";
// =============================================================================
// Constants
// =============================================================================
/** Internal WordPress post types that should be excluded from import */
export const INTERNAL_POST_TYPES = [
"revision",
"nav_menu_item",
"custom_css",
"customize_changeset",
"oembed_cache",
"wp_global_styles",
"wp_navigation",
"wp_template",
"wp_template_part",
"attachment", // Handled separately as media
"wp_block", // Handled separately as sections (reusable blocks)
];
/** Internal meta key prefixes to filter out */
export const INTERNAL_META_PREFIXES = ["_edit_", "_wp_"];
/** Whole-string match for an optionally negative integer or decimal (e.g. "42", "-3.5"). */
const NUMERIC_PATTERN = /^-?\d+(\.\d+)?$/;
/** One or more slashes at the very end of a string. */
const TRAILING_SLASHES = /\/+$/;
/** The first "/wp-json" in a string together with everything that follows it. */
const WP_JSON_SUFFIX = /\/wp-json\/?.*$/;
/**
 * Specific internal meta keys.
 * Note: "_edit_last" and "_edit_lock" also match the "_edit_" prefix in
 * INTERNAL_META_PREFIXES.
 */
export const INTERNAL_META_KEYS = ["_edit_last", "_edit_lock", "_pingme", "_encloseme"];
/** Base fields required for any WordPress import */
export const BASE_REQUIRED_FIELDS: ImportFieldDef[] = [
{ slug: "title", label: "Title", type: "string", required: true, searchable: true },
{ slug: "content", label: "Content", type: "portableText", required: false, searchable: true },
{ slug: "excerpt", label: "Excerpt", type: "text", required: false },
];
/** Featured image field - only added to post types that have _thumbnail_id */
export const FEATURED_IMAGE_FIELD: ImportFieldDef = {
slug: "featured_image",
label: "Featured Image",
type: "image",
required: false,
};
// =============================================================================
// Type Guards
// =============================================================================
/**
 * Tell whether a WordPress post type appears in INTERNAL_POST_TYPES and
 * should therefore be left out of the import.
 */
export function isInternalPostType(type: string): boolean {
	return INTERNAL_POST_TYPES.some((internal) => internal === type);
}
/**
 * Decide whether a meta key is WordPress-internal and should be filtered out.
 *
 * Precedence:
 * 1. keys listed in INTERNAL_META_KEYS or starting with one of
 *    INTERNAL_META_PREFIXES are internal;
 * 2. `_thumbnail_id` and the `_yoast_` / `_rank_math_` families are kept;
 * 3. any other key starting with an underscore is internal;
 * 4. everything else is kept.
 */
export function isInternalMetaKey(key: string): boolean {
	const explicitlyInternal =
		INTERNAL_META_KEYS.includes(key) ||
		INTERNAL_META_PREFIXES.some((prefix) => key.startsWith(prefix));
	if (explicitlyInternal) {
		return true;
	}
	// Underscore-prefixed keys that carry useful data and must survive.
	const worthKeeping =
		key === "_thumbnail_id" || key.startsWith("_yoast_") || key.startsWith("_rank_math_");
	// Remaining underscore prefixes are usually internal.
	return !worthKeeping && key.startsWith("_");
}
// =============================================================================
// Status Mapping
// =============================================================================
/** Valid WordPress statuses */
export type WpStatus = "publish" | "draft" | "pending" | "private" | "future";

/** Every status that mapWpStatus passes through unchanged. */
const KNOWN_WP_STATUSES: readonly WpStatus[] = ["publish", "draft", "pending", "private", "future"];

/**
 * Normalize a raw WordPress status string.
 *
 * A recognised status is returned as-is; anything else — including
 * `undefined` — becomes "draft".
 */
export function mapWpStatus(status: string | undefined): WpStatus {
	const recognised = KNOWN_WP_STATUSES.find((candidate) => candidate === status);
	return recognised ?? "draft";
}
// =============================================================================
// Collection Mapping
// =============================================================================
/** Default mappings from WordPress post types to EmDash collections */
const POST_TYPE_TO_COLLECTION: Record<string, string> = {
	post: "posts",
	page: "pages",
	attachment: "media",
	product: "products",
	portfolio: "portfolio",
	testimonial: "testimonials",
	team: "team",
	event: "events",
	faq: "faqs",
};

/**
 * Map WordPress post type to EmDash collection name.
 *
 * Post types without a default mapping are returned unchanged.
 *
 * @param postType - WordPress post type name.
 * @returns The mapped collection name, or `postType` itself when unmapped.
 */
export function mapPostTypeToCollection(postType: string): string {
	// Consult own properties only: a post type named like an Object.prototype
	// member ("constructor", "toString", "__proto__", ...) would otherwise
	// resolve to that inherited member instead of falling back to its own name.
	if (!Object.prototype.hasOwnProperty.call(POST_TYPE_TO_COLLECTION, postType)) {
		return postType;
	}
	return POST_TYPE_TO_COLLECTION[postType] || postType;
}
// =============================================================================
// Meta Key Mapping
// =============================================================================
/**
 * Translate a WordPress meta key into an EmDash field slug.
 *
 * Known SEO-plugin keys (Yoast, Rank Math) collapse onto `seo_title` /
 * `seo_description`, `_thumbnail_id` becomes `featured_image`, and any other
 * key merely loses a single leading underscore.
 */
export function mapMetaKeyToField(key: string): string {
	switch (key) {
		case "_yoast_wpseo_title":
		case "_rank_math_title":
			return "seo_title";
		case "_yoast_wpseo_metadesc":
		case "_rank_math_description":
			return "seo_description";
		case "_thumbnail_id":
			return "featured_image";
		default:
			return key.startsWith("_") ? key.slice(1) : key;
	}
}
/**
 * Infer field type from meta key name and sample value.
 *
 * Rules are applied in order and the first match wins:
 * 1. key suffix: `_id` → "string", `_date`/`_time` → "date",
 *    `_count`/`_number` → "number";
 * 2. missing or empty sample → "string";
 * 3. sample starting with `a:` (serialized PHP array), `{` or `[` → "json";
 * 4. numeric sample → "number" — this includes "0" and "1", which are
 *    therefore never reported as booleans;
 * 5. literal "true"/"false" → "boolean";
 * 6. otherwise "string".
 *
 * @param key - Meta key name.
 * @param value - Sample value for the key, if one is available.
 */
export function inferMetaType(
	key: string,
	value: string | undefined,
): "string" | "number" | "boolean" | "date" | "json" {
	if (key.endsWith("_id")) return "string";
	if (key.endsWith("_date") || key.endsWith("_time")) return "date";
	if (key.endsWith("_count") || key.endsWith("_number")) return "number";
	if (!value) return "string";
	// Serialized PHP or JSON
	if (value.startsWith("a:") || value.startsWith("{") || value.startsWith("[")) return "json";
	// Number
	if (NUMERIC_PATTERN.test(value)) return "number";
	// Boolean ("0"/"1" were already classified as numbers above)
	if (value === "true" || value === "false") return "boolean";
	return "string";
}
// =============================================================================
// String Utilities
// =============================================================================
export { slugify } from "../utils/slugify.js";
/** Matches an explicit http:// or https:// scheme at the start of a string, in any letter case. */
const HTTP_SCHEME_PREFIX = /^https?:\/\//i;

/**
 * Normalize URL for API requests.
 *
 * Trims whitespace, prepends `https://` when the input has no explicit
 * http(s) scheme, and removes trailing slashes as well as a `/wp-json...`
 * suffix so the result is the bare site URL.
 *
 * @param url - Site URL as entered by the user.
 * @returns The normalized site URL without trailing slash.
 */
export function normalizeUrl(url: string): string {
	let normalized = url.trim();
	// Add protocol if missing. A bare prefix check for "http" would wrongly
	// accept hosts such as "httpbin.org" and reject an upper-case "HTTP://".
	if (!HTTP_SCHEME_PREFIX.test(normalized)) {
		normalized = `https://${normalized}`;
	}
	// Remove trailing slash
	normalized = normalized.replace(TRAILING_SLASHES, "");
	// Remove /wp-json if included
	normalized = normalized.replace(WP_JSON_SUFFIX, "");
	// Stripping the suffix can expose another trailing slash (".../blog//wp-json").
	normalized = normalized.replace(TRAILING_SLASHES, "");
	return normalized;
}
// =============================================================================
// File Utilities
// =============================================================================
/**
 * Pull the last non-empty path segment out of a URL.
 *
 * The segment is returned exactly as it appears in the URL (still
 * percent-encoded). Yields `undefined` when the URL cannot be parsed or its
 * path has no segments.
 */
export function getFilenameFromUrl(url: string): string | undefined {
	let pathname: string;
	try {
		pathname = new URL(url).pathname;
	} catch {
		return undefined;
	}
	const parts = pathname.split("/").filter((part) => part.length > 0);
	return parts[parts.length - 1];
}
/**
 * Look up a MIME type for a filename through the `mime` package.
 * Yields `undefined` when the lookup produces no type.
 */
export function guessMimeType(filename: string): string | undefined {
	const detected = mime.getType(filename);
	if (detected === null || detected === undefined) {
		return undefined;
	}
	return detected;
}
// =============================================================================
// Attachment Map Builder
// =============================================================================
/**
 * Index attachments by ID so featured images can be resolved to URLs.
 *
 * IDs are stringified for use as keys. Entries with a falsy `id` or `url`
 * are left out; when an ID occurs more than once the last URL wins.
 */
export function buildAttachmentMap(
	attachments: Array<{ id?: number | string; url?: string }>,
): Map<string, string> {
	const pairs = attachments.flatMap(({ id, url }): Array<[string, string]> =>
		id && url ? [[String(id), url]] : [],
	);
	return new Map<string, string>(pairs);
}
// =============================================================================
// Schema Compatibility
// =============================================================================
/**
 * For each required field type, the existing field types that can hold it.
 * A Map (rather than an object literal) so that lookups never reach
 * Object.prototype members such as "constructor".
 */
const COMPATIBLE_FIELD_TYPES: ReadonlyMap<string, readonly string[]> = new Map([
	["string", ["string", "text", "slug"]],
	["text", ["string", "text"]],
	["portableText", ["portableText", "json"]],
	["number", ["number", "integer"]],
	["integer", ["number", "integer"]],
]);

/**
 * Check if two field types are compatible for import.
 *
 * Identical types are always compatible. Otherwise the required type must
 * have an entry in COMPATIBLE_FIELD_TYPES that lists the existing type.
 *
 * @param requiredType - Type the import needs.
 * @param existingType - Type of the field already present in the collection.
 * @returns True when data of `requiredType` can be imported into `existingType`.
 */
export function isTypeCompatible(requiredType: string, existingType: string): boolean {
	if (requiredType === existingType) return true;
	return COMPATIBLE_FIELD_TYPES.get(requiredType)?.includes(existingType) ?? false;
}
// =============================================================================
// Byline Import Utilities
// =============================================================================
import type { BylineRepository } from "../database/repositories/byline.js";
import { slugify as slugifyFn } from "../utils/slugify.js";
const MAX_SLUG_COLLISION_ATTEMPTS = 1000;

/**
 * Pick a byline slug that is not yet taken.
 *
 * Tries `baseSlug` first, then `baseSlug-2`, `baseSlug-3`, ... up to the
 * suffix MAX_SLUG_COLLISION_ATTEMPTS, returning the first one for which
 * `bylineRepo.findBySlug` finds nothing.
 *
 * @throws Error when every candidate up to the limit is already in use.
 */
export async function ensureUniqueBylineSlug(
	bylineRepo: BylineRepository,
	baseSlug: string,
): Promise<string> {
	if (!(await bylineRepo.findBySlug(baseSlug))) {
		return baseSlug;
	}
	for (let n = 2; n <= MAX_SLUG_COLLISION_ATTEMPTS; n++) {
		const attempt = `${baseSlug}-${n}`;
		if (!(await bylineRepo.findBySlug(attempt))) {
			return attempt;
		}
	}
	throw new Error(
		`Byline slug collision limit exceeded for base slug "${baseSlug}". ` +
			`Tried ${MAX_SLUG_COLLISION_ATTEMPTS} variants.`,
	);
}
/**
 * Find or create the byline for an imported WordPress author.
 *
 * Lookup order: the `cache` (keyed by `authorLogin:mappedUserId`), then an
 * existing byline already linked to `mappedUserId`, and finally a freshly
 * created byline whose slug is derived from the login ("author" when the
 * login slugifies to nothing). A byline created without a mapped user is
 * flagged as a guest. Whatever ID is found or created is stored in `cache`.
 *
 * @returns The byline ID, or `undefined` when `authorLogin` is missing/empty.
 */
export async function resolveImportByline(
	authorLogin: string | undefined,
	displayName: string | undefined,
	mappedUserId: string | undefined,
	bylineRepo: BylineRepository,
	cache: Map<string, string>,
): Promise<string | undefined> {
	if (!authorLogin) {
		return undefined;
	}
	const key = `${authorLogin}:${mappedUserId ?? ""}`;
	const remembered = cache.get(key);
	if (remembered) {
		return remembered;
	}
	const remember = (id: string): string => {
		cache.set(key, id);
		return id;
	};

	// Reuse a byline that is already attached to the mapped user.
	const linked = mappedUserId ? await bylineRepo.findByUserId(mappedUserId) : undefined;
	if (linked) {
		return remember(linked.id);
	}

	const slug = await ensureUniqueBylineSlug(bylineRepo, slugifyFn(authorLogin) || "author");
	const { id } = await bylineRepo.create({
		slug,
		displayName: displayName || authorLogin,
		userId: mappedUserId ?? null,
		isGuest: !mappedUserId,
	});
	return remember(id);
}
// =============================================================================
// Schema Compatibility
// =============================================================================
/**
 * Compare the fields an import requires with an existing collection.
 *
 * - No existing collection: every field is reported as "missing" and the
 *   import is allowed (the collection will need to be created).
 * - Existing collection: each field is "missing", "compatible" or
 *   "type_mismatch". Any mismatch blocks the import, and `reason` lists the
 *   offending field slugs.
 */
export function checkSchemaCompatibility(
	requiredFields: ImportFieldDef[],
	existingCollection: { slug: string; fields: Map<string, { type: string }> } | undefined,
): CollectionSchemaStatus {
	const fieldStatus: CollectionSchemaStatus["fieldStatus"] = {};

	if (!existingCollection) {
		// Nothing to compare against — every field still has to be created.
		requiredFields.forEach(({ slug, type }) => {
			fieldStatus[slug] = { status: "missing", requiredType: type };
		});
		return { exists: false, fieldStatus, canImport: true };
	}

	const mismatched: string[] = [];
	for (const { slug, type: requiredType } of requiredFields) {
		const current = existingCollection.fields.get(slug);
		if (!current) {
			fieldStatus[slug] = { status: "missing", requiredType };
			continue;
		}
		const compatible = isTypeCompatible(requiredType, current.type);
		fieldStatus[slug] = {
			status: compatible ? "compatible" : "type_mismatch",
			existingType: current.type,
			requiredType,
		};
		if (!compatible) {
			mismatched.push(slug);
		}
	}

	const canImport = mismatched.length === 0;
	return {
		exists: true,
		fieldStatus,
		canImport,
		reason: canImport ? undefined : `Incompatible field types: ${mismatched.join(", ")}`,
	};
}