# dealplustech-astroreal/scripts/inject_faq_schema.py

"""
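Extract FAQ Q&A pairs from product pages and inject them as a `faq={[...]}` prop
on the page's first <BaseLayout> tag.

Primary pattern detected:
  <h3 ...>Q: ...question...</h3>
  <p ...>...answer...</p>
Numbered headings ("1. ..."), unprefixed headings, and <details>/<summary>
blocks are also recognised; see `extract_faq` for the exact rules.

Only operates on the pages listed in TARGETS, and only on those that have a
FAQ section (detected by searching for 'คำถามที่พบบ่อย', Thai for
"frequently asked questions").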
"""
import re
from pathlib import Path

# Parent of the directory that contains this script (presumably the project
# root, given the script's scripts/ location; confirm if the file is moved).
ROOT = Path(__file__).resolve().parent.parent
# Directory holding the .astro page files that this script reads and rewrites.
PAGES = ROOT / "src" / "pages"

# Product pages with FAQ UI (from earlier audit: lines >= 240 and grep -c FAQ > 0).
# Each entry is a file name relative to PAGES. The list is hard-coded, so a
# newly added product page must be appended here to be processed.
TARGETS = [
    "pipe-coupling.astro",
    "ท่อ-syler.astro",
    "หัวจ่าย-ball-jet.astro",
    "เม็กกรู๊ฟ-คับปลิ้ง.astro",
    "เครื่องเชื่อม-hdpe.astro",
    "เครื่องเชื่อม-ppr.astro",
    "เทอร์โมเบรค-thermobreak.astro",
    "realflex.astro",
    "water-treatment.astro",
    "วาล์ว-valve.astro",
    "รั้วเทวดา.astro",
    "ระบบรั้วไวน์แมน.astro",
    "durgo-avvs.astro",
    "ตู้ดับเพลิง.astro",
    "water-pump.astro",
    "grilles.astro",
    "ท่อ-upvc.astro",
    "armflex.astro",
    "aeroflex.astro",
    "maxflex.astro",
]


def extract_faq(content: str) -> list[tuple[str, str]]:
    """Collect (question, answer) pairs from the FAQ section of a page.

    The search is limited to the FAQ block: it begins at the first
    occurrence of "คำถามที่พบบ่อย" and stops at the earliest terminator
    marker found after it (a Contact CTA marker, an "End FAQ" comment, or a
    closing section followed by a comment), or at end-of-file when none is
    present. Restricting the search keeps the unprefixed-heading pattern
    from picking up feature lists elsewhere on the page.

    Strategies are tried in this order, and the first one that yields any
    pair wins:
      1. <h2-h4>Q: ...</h2-h4><p>...</p>  or  <h2-h4>1. ...</h2-h4><p>...</p>
      2. <h2-h4>question</h2-h4><p>answer</p>   (no prefix, grilles style)
      3. <details><summary>...</summary> ... <div>...</div></details>

    Args:
        content: Full text of the page.

    Returns:
        Stripped (question, answer) tuples in document order; an empty list
        when the page has no FAQ heading or nothing matches. Answers keep
        whatever inline markup the source element contained.
    """
    faq_start = content.find('คำถามที่พบบ่อย')
    if faq_start == -1:
        return []

    # The earliest terminator after the heading bounds the block.
    terminators = ('<!-- Contact CTA', 'Contact CTA', '<!-- End FAQ', '</section>\n    <!--')
    hits = [
        pos
        for pos in (content.find(marker, faq_start) for marker in terminators)
        if pos != -1
    ]
    faq_end = min(hits, default=len(content))
    block = content[faq_start:faq_end]

    def heading_pairs(pattern: str) -> list[tuple[str, str]]:
        # Shared extraction for the heading + paragraph shaped patterns.
        return [
            (hit.group('q').strip(), hit.group('a').strip())
            for hit in re.finditer(pattern, block, re.DOTALL)
        ]

    # Strategy 1: headings carrying a "Q:" or "<number>." prefix.
    prefixed = heading_pairs(
        r'<h[234]\b[^>]*>\s*(?:Q:|\d+\.\s*)(?P<q>[^<]+?)\s*</h[234]>\s*<p\b[^>]*>(?P<a>.*?)</p>'
    )
    if prefixed:
        return prefixed

    # Strategy 2: plain headings with no prefix.
    unprefixed = heading_pairs(
        r'<h[234]\b[^>]*>\s*(?P<q>[^<]+?)\s*</h[234]>\s*<p\b[^>]*>(?P<a>.*?)</p>'
    )
    if unprefixed:
        return unprefixed

    # Strategy 3: <details> accordions.
    accordion: list[tuple[str, str]] = []
    details_re = (
        r'<details\b[^>]*>\s*<summary\b[^>]*>(?P<sum>.*?)</summary>'
        r'.*?<div\b[^>]*>(?P<a>.*?)</div>\s*</details>'
    )
    for hit in re.finditer(details_re, block, re.DOTALL):
        summary_html = hit.group('sum')
        span = re.search(r'<span\b[^>]*>(?P<q>.*?)</span>', summary_html, re.DOTALL)
        if span is None:
            # No <span>: use the summary text with all tags removed.
            question = re.sub(r'<[^>]+>', '', summary_html).strip()
        else:
            question = span.group('q').strip()
        accordion.append((question, hit.group('a').strip()))
    return accordion


def js_string(value: str) -> str:
    """Return *value* as a single-quoted JS string literal for Astro JSX.

    Backslashes and single quotes are escaped, and so are line terminators
    (LF, CR, U+2028, U+2029). A raw line break inside a single-quoted
    JavaScript literal is a syntax error, and extracted answers can span
    several source lines.

    Args:
        value: Arbitrary text to embed.

    Returns:
        The escaped text wrapped in single quotes.
    """
    # A single translate pass avoids ordering problems between replacements
    # (e.g. re-escaping the backslashes introduced for other characters).
    escapes = {
        ord("\\"): "\\\\",
        ord("'"): "\\'",
        ord("\n"): "\\n",
        ord("\r"): "\\r",
        0x2028: "\\u2028",
        0x2029: "\\u2029",
    }
    return "'" + value.translate(escapes) + "'"


def build_faq_prop(pairs: list[tuple[str, str]]) -> str:
    """Render FAQ pairs as a multi-line `faq={[ ... ]}` JSX prop.

    Args:
        pairs: (question, answer) tuples, emitted in the given order.

    Returns:
        Text that starts with a newline and two-space indent so it can be
        spliced directly into an opening tag: one object literal per pair,
        each on its own line with a trailing comma, then the closing `]}`.
    """
    entries = [
        f'    {{ question: {js_string(question)}, answer: {js_string(answer)} }},'
        for question, answer in pairs
    ]
    return '\n'.join(['\n  faq={[', *entries, '  ]}'])


def _find_tag_close(content: str, pos: int) -> int:
    """Return the index of the '>' that ends the tag scanned from *pos*, or -1.

    A '>' inside a quoted attribute value or inside a `{...}` expression
    (for example an arrow function) does not end the tag and is skipped.
    """
    depth = 0   # nesting level of {...} expressions
    quote = ''  # active quote character, or '' when outside a string
    i = pos
    length = len(content)
    while i < length:
        ch = content[i]
        if quote:
            if ch == '\\' and depth:
                # Backslash escapes only apply to JS strings inside {...};
                # in a plain HTML-style attribute value it is a literal.
                i += 1
            elif ch == quote:
                quote = ''
        elif ch in '"\'`':
            quote = ch
        elif ch == '{':
            depth += 1
        elif ch == '}':
            if depth:
                depth -= 1
        elif ch == '>' and depth == 0:
            return i
        i += 1
    return -1


def inject_prop(content: str, prop_block: str) -> str:
    """Insert *prop_block* as the last prop of the first <BaseLayout> tag.

    The end of the opening tag is found by scanning past quoted values and
    brace expressions, so a '>' inside a prop value is not mistaken for the
    tag end. For a self-closing tag (`<BaseLayout ... />`) the block is
    placed before the '/', keeping the tag well-formed.

    Args:
        content: Full page text.
        prop_block: Text to splice in, e.g. the output of `build_faq_prop`.

    Returns:
        The updated text, or *content* unchanged when there is no
        <BaseLayout> tag or its end cannot be located (unbalanced quotes).
    """
    opening = re.search(r'<BaseLayout\b', content)
    if not opening:
        return content
    close = _find_tag_close(content, opening.end())
    if close == -1:
        # Malformed tag: leave the file alone rather than corrupt it.
        return content
    # Self-closing tag: insert ahead of the "/" of "/>".
    insert_at = close - 1 if content[close - 1] == '/' else close
    return content[:insert_at] + prop_block + content[insert_at:]


def process_file(path: Path) -> str:
    """Inject the FAQ prop into one page file and describe the outcome.

    The file is rewritten in place (UTF-8) only when FAQ pairs were found
    and the injection actually changed the text.

    Args:
        path: Page file to process.

    Returns:
        A one-line status message: "SKIP (...)" when the page already has a
        faq prop or has no extractable FAQ, "NO-CHANGE" when injection left
        the text untouched, or "OK (N pairs)" after a successful write.
    """
    original = path.read_text(encoding='utf-8')

    # Idempotence guard: a previous run already added the prop.
    if 'faq={[' in original:
        return f"SKIP (already has faq prop): {path.name}"

    qa_pairs = extract_faq(original)
    if not qa_pairs:
        return f"SKIP (no FAQ): {path.name}"

    updated = inject_prop(original, build_faq_prop(qa_pairs))
    if updated == original:
        # inject_prop found nowhere to put the prop.
        return f"NO-CHANGE: {path.name}"

    path.write_text(updated, encoding='utf-8')
    return f"OK ({len(qa_pairs)} pairs): {path.name}"


def main() -> None:
    """Process every page named in TARGETS and print one status line each.

    Pages that do not exist under PAGES are reported as "MISSING" and
    skipped; all others print the message returned by `process_file`.
    """
    for filename in TARGETS:
        candidate = PAGES / filename
        if candidate.exists():
            report = process_file(candidate)
        else:
            report = f"MISSING: {filename}"
        print(report)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()