Files
ALwrity/backend/services/onboarding/website_intake_service.py
ajaysi b3cc83ed6e fix: resolve onboarding session not found warnings and frontend build OOM
- Use canonical Clerk user id (clerk_user_id) across all onboarding entrypoints to ensure consistent OnboardingSession.user_id lookup
- Fix API key persistence in api_key_manager.py to use correct APIKey model columns (session_id, provider, key)
- Increase Node heap for frontend build to 8GB and add build:nomap script to disable sourcemaps and reduce memory usage
- Update onboarding endpoints (endpoints_core.py, onboarding_control_service.py, step_management_service.py) to prefer clerk_user_id over id
- Fix frontend workflowStore.ts TypeScript error by returning WorkflowError instance
- Add website_automation_service.py for onboarding automation
2026-03-09 13:36:34 +05:30

286 lines
13 KiB
Python

"""Website Intake Service for generating site briefs from business information."""
from typing import Dict, Any, Optional
from loguru import logger
from services.llm_providers.main_text_generation import llm_text_gen
# JSON Schema describing the structured response requested from the LLM
# (passed as ``json_struct`` by WebsiteIntakeService.generate_site_brief).
# Four top-level sections are required: site_brief, content_plan,
# exa_query_map and quality_flags.
SITE_BRIEF_SCHEMA: Dict[str, Any] = {
    "type": "object",
    "properties": {
        # --- site_brief: description of the business and its brand ---------
        "site_brief": {
            "type": "object",
            "properties": {
                "business_name": {"type": "string"},
                "tagline": {"type": "string"},
                "template_type": {
                    "type": "string",
                    "enum": ["blog", "profile", "shop", "dont_know"],
                },
                "geo_scope": {
                    "type": "string",
                    "enum": ["global", "local", "hyper_local", "dont_know"],
                },
                "primary_offerings": {
                    "type": "array",
                    "items": {"type": "string"},
                },
                # Not listed in this object's "required" array below.
                "product_assets": {
                    "type": "object",
                    "properties": {
                        "urls": {"type": "array", "items": {"type": "string"}},
                        "asset_ids": {"type": "array", "items": {"type": "string"}},
                    },
                    "required": ["urls", "asset_ids"],
                },
                "audience": {
                    "type": "object",
                    "properties": {
                        "segment": {"type": "string"},
                        "b2b_b2c": {
                            "type": "string",
                            "enum": ["B2B", "B2C", "Both", "dont_know"],
                        },
                        "persona_notes": {"type": "string"},
                    },
                    "required": ["segment", "b2b_b2c", "persona_notes"],
                },
                "brand_voice": {
                    "type": "object",
                    "properties": {
                        "tone": {"type": "string"},
                        "adjectives": {"type": "array", "items": {"type": "string"}},
                        "avoid": {"type": "array", "items": {"type": "string"}},
                    },
                    "required": ["tone", "adjectives", "avoid"],
                },
                "contact": {
                    "type": "object",
                    "properties": {
                        "email": {"type": "string"},
                        # phone and location are nullable but must be present.
                        "phone": {"type": ["string", "null"]},
                        "location": {"type": ["string", "null"]},
                    },
                    "required": ["email", "phone", "location"],
                },
                "competitor_urls": {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
            "required": [
                "business_name",
                "tagline",
                "template_type",
                "geo_scope",
                "primary_offerings",
                "audience",
                "brand_voice",
                "contact",
                "competitor_urls",
            ],
        },
        # --- content_plan: pages to produce and what each must cover -------
        "content_plan": {
            "type": "object",
            "properties": {
                "required_pages": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "page": {
                                "type": "string",
                                "enum": [
                                    "home",
                                    "about",
                                    "services",
                                    "products",
                                    "contact",
                                    "blog",
                                    "faq",
                                ],
                            },
                            "goal": {"type": "string"},
                            "key_points": {
                                "type": "array",
                                "items": {"type": "string"},
                            },
                            "cta": {"type": "string"},
                        },
                        "required": ["page", "goal", "key_points", "cta"],
                    },
                },
                "optional_sections": {
                    "type": "array",
                    "items": {"type": "string"},
                },
                "min_content_items": {"type": "integer"},
            },
            "required": ["required_pages", "optional_sections", "min_content_items"],
        },
        # --- exa_query_map: one exaSection (see $defs) per site area -------
        "exa_query_map": {
            "type": "object",
            "properties": {
                "home": {"$ref": "#/$defs/exaSection"},
                "about": {"$ref": "#/$defs/exaSection"},
                "services_or_products": {"$ref": "#/$defs/exaSection"},
                "contact": {"$ref": "#/$defs/exaSection"},
                "competitor_optional": {"$ref": "#/$defs/exaSection"},
            },
            "required": [
                "home",
                "about",
                "services_or_products",
                "contact",
                "competitor_optional",
            ],
        },
        # --- quality_flags: self-assessment of the generated brief ---------
        "quality_flags": {
            "type": "object",
            "properties": {
                "confidence": {"type": "number"},
                "missing_fields": {"type": "array", "items": {"type": "string"}},
                "followup_questions": {"type": "array", "items": {"type": "string"}},
            },
            "required": ["confidence", "missing_fields", "followup_questions"],
        },
    },
    "required": ["site_brief", "content_plan", "exa_query_map", "quality_flags"],
    # Reusable definition referenced by every entry of exa_query_map.
    "$defs": {
        "exaSection": {
            "type": "object",
            "properties": {
                "queries": {"type": "array", "items": {"type": "string"}},
                "summary_query": {"type": "string"},
                "include_text": {"type": "array", "items": {"type": "string"}},
                "search_type": {
                    "type": "string",
                    "enum": ["auto", "neural", "fast", "deep"],
                },
                "category": {"type": "string"},
            },
            "required": [
                "queries",
                "summary_query",
                "include_text",
                "search_type",
                "category",
            ],
        },
    },
}
class WebsiteIntakeService:
    """Generate site briefs and Exa query maps from minimal intake inputs.

    The main entry point is :meth:`generate_site_brief`, which asks the LLM for
    a response shaped like ``SITE_BRIEF_SCHEMA`` and falls back to a brief
    built from static defaults when that call raises.
    """

    def _normalize_list(self, value: Any) -> list:
        """Coerce a loosely-typed intake value into a list of clean strings.

        Args:
            value: ``None``/falsy, a comma-separated string, a list or tuple of
                items, or any other scalar.

        Returns:
            A list of stripped, non-empty strings. Falsy input yields ``[]``;
            strings are split on commas; list/tuple items are stringified
            (``None`` items are skipped); any other value becomes a
            single-element list of its ``str()`` form.
        """
        if not value:
            return []
        if isinstance(value, str):
            candidates = value.split(",")
        elif isinstance(value, (list, tuple)):
            # Skip None entries so they do not surface as the literal "None".
            candidates = [str(item) for item in value if item is not None]
        else:
            candidates = [str(value)]
        stripped = (candidate.strip() for candidate in candidates)
        return [text for text in stripped if text]

    def _extract_product_assets(self, intake: Dict[str, Any]) -> Dict[str, list]:
        """Return the product asset URLs and IDs supplied in ``intake``.

        Reads ``product_asset_urls`` and ``product_asset_ids`` and normalizes
        each with :meth:`_normalize_list`.

        Returns:
            ``{"urls": [...], "asset_ids": [...]}`` (both lists may be empty).
        """
        return {
            "urls": self._normalize_list(intake.get("product_asset_urls")),
            "asset_ids": self._normalize_list(intake.get("product_asset_ids")),
        }

    def build_prompt(self, intake: Dict[str, Any]) -> str:
        """Build the LLM prompt, embedding ``intake`` via its ``str()`` form."""
        return (
            "You are creating a website brief and research plan for a non-technical user. "
            "Use the inputs below, keep assumptions minimal, and prefer 'dont_know' when unsure. "
            "Ensure at least 5 content items across required pages.\n\n"
            f"INTAKE INPUTS:\n{intake}\n\n"
            "Output structured JSON that matches the schema exactly."
        )

    def generate_site_brief(self, intake: Dict[str, Any], user_id: Optional[str] = None) -> Dict[str, Any]:
        """Generate a site brief and Exa query map for the given intake.

        Args:
            intake: Raw intake answers. ``None`` is treated as an empty dict.
            user_id: Forwarded to ``llm_text_gen`` and included in the success
                log message.

        Returns:
            One of:
            * the dict returned by ``llm_text_gen`` (with
              ``site_brief.product_assets`` overwritten by the intake's assets
              when the intake supplies any);
            * ``{"error": "invalid_response", "raw": <result>}`` when the LLM
              result is not a dict;
            * the fallback brief from :meth:`_generate_fallback_site_brief`
              when any exception is raised while generating.
        """
        logger.info("Generating site brief and Exa query map from intake")
        # Normalize up front so neither the happy path nor the fallback in the
        # except handler can fail on a missing intake.
        intake = intake or {}
        try:
            prompt = self.build_prompt(intake)
            result = llm_text_gen(prompt=prompt, json_struct=SITE_BRIEF_SCHEMA, user_id=user_id)

            # Anything other than a dict (plain text, None, a list, ...) cannot
            # be treated as a structured brief; report it instead of returning
            # it or crashing on dict-only operations below.
            if not isinstance(result, dict):
                kind = "string" if isinstance(result, str) else type(result).__name__
                logger.warning(f"LLM returned {kind} response; expected structured JSON")
                return {"error": "invalid_response", "raw": result}

            # User-supplied assets take precedence over whatever the LLM produced.
            product_assets = self._extract_product_assets(intake)
            if product_assets["urls"] or product_assets["asset_ids"]:
                result.setdefault("site_brief", {})
                result["site_brief"]["product_assets"] = product_assets

            logger.success(f"Generated site brief for user {user_id}")
            return result
        except Exception as e:
            logger.error(f"Failed to generate site brief: {e}")
            # Return a fallback site brief for development
            return self._generate_fallback_site_brief(intake)

    @staticmethod
    def _exa_section(queries: list, summary_query: str, include_text: list) -> Dict[str, Any]:
        """Build one fallback ``exa_query_map`` entry with the shared defaults."""
        return {
            "queries": queries,
            "summary_query": summary_query,
            "include_text": include_text,
            "search_type": "auto",
            "category": "business",
        }

    def _generate_fallback_site_brief(self, intake: Dict[str, Any]) -> Dict[str, Any]:
        """Generate a fallback site brief when LLM is not available.

        Values are taken from ``intake`` where present; a key that is missing,
        ``None`` or empty falls back to a static default so no ``None`` leaks
        into the generated strings (``contact.phone`` and ``contact.location``
        are passed through as-is and may be ``None``).

        Args:
            intake: Raw intake answers. ``None`` is treated as an empty dict.

        Returns:
            A dict with ``site_brief``, ``content_plan``, ``exa_query_map`` and
            ``quality_flags`` sections.
        """
        logger.info("Generating fallback site brief")
        intake = intake or {}

        # `or` (rather than a .get default) also covers keys that are present
        # but set to None / "".
        business_name = intake.get("business_name") or "Your Business"
        business_summary = intake.get("business_summary") or "Business description"
        template_type = intake.get("template_type") or "blog"
        target_audience = intake.get("target_audience")

        site_brief = {
            "business_name": business_name,
            "tagline": f"Professional {template_type} website",
            "template_type": template_type,
            "geo_scope": "global",
            "primary_offerings": self._normalize_list(intake.get("primary_offerings")) or ["Services"],
            "product_assets": self._extract_product_assets(intake),
            "audience": {
                "segment": target_audience or "General audience",
                "b2b_b2c": intake.get("audience_type") or "Both",
                "persona_notes": target_audience or "General audience description",
            },
            "brand_voice": {
                "tone": intake.get("brand_tone") or "professional",
                "adjectives": (
                    self._normalize_list(intake.get("brand_adjectives"))
                    or ["professional", "reliable"]
                ),
                "avoid": self._normalize_list(intake.get("avoid_terms")),
            },
            "contact": {
                "email": intake.get("contact_email") or "contact@example.com",
                "phone": intake.get("contact_phone"),
                "location": intake.get("contact_location"),
            },
            "competitor_urls": self._normalize_list(intake.get("competitor_urls")),
        }

        content_plan = {
            "required_pages": [
                {
                    "page": "home",
                    "goal": "Welcome visitors and introduce the business",
                    "key_points": [business_name, business_summary],
                    "cta": "Get Started",
                },
                {
                    "page": "about",
                    "goal": "Share business story and values",
                    "key_points": ["Our story", "Our mission", "Our values"],
                    "cta": "Learn More",
                },
                {
                    "page": "contact",
                    "goal": "Enable visitors to get in touch",
                    "key_points": ["Contact information", "Business hours", "Location"],
                    "cta": "Contact Us",
                },
            ],
            "optional_sections": ["blog", "faq", "testimonials"],
            "min_content_items": 5,
        }

        exa_query_map = {
            "home": self._exa_section(
                [f"{business_name} website", f"{business_name} services"],
                f"What is {business_name} and what do they offer?",
                ["services", "about", "contact"],
            ),
            "about": self._exa_section(
                [f"{business_name} about us", f"{business_name} story"],
                f"Tell me about {business_name}'s history and mission",
                ["about", "story", "mission", "values"],
            ),
            "services_or_products": self._exa_section(
                [f"{business_name} services", f"{business_name} products"],
                f"What services and products does {business_name} offer?",
                ["services", "products", "offerings"],
            ),
            "contact": self._exa_section(
                [f"{business_name} contact", f"{business_name} location"],
                f"How can I contact {business_name}?",
                ["contact", "phone", "email", "address"],
            ),
            "competitor_optional": self._exa_section(
                [f"{business_name} competitors", f"alternatives to {business_name}"],
                f"Who are the main competitors of {business_name}?",
                ["competitors", "alternatives"],
            ),
        }

        return {
            "site_brief": site_brief,
            "content_plan": content_plan,
            "exa_query_map": exa_query_map,
            # NOTE(review): these flags are static and do not reflect which
            # intake fields were actually missing — confirm callers do not
            # treat a fallback brief as high-confidence output.
            "quality_flags": {
                "confidence": 0.8,
                "missing_fields": [],
                "followup_questions": [],
            },
        }
# Module-level singleton instance, created once when this module is imported.
website_intake_service = WebsiteIntakeService()