Issue #518 - Subscription not updating after checkout: - Fix stale closure in SubscriptionContext checkout polling (use subscriptionRef) - Move checkout success polling from InitialRouteHandler into SubscriptionContext - Remove redundant polling code from InitialRouteHandler - Fix plan label: 'Free' instead of 'No Plan', proper capitalization - Add plan refresh button in UserBadge - Add 'View Costing Details' to UserBadge dropdown - Rename 'ALwrity Podcast Maker' to 'Podcast Creator' across UI - Clean subscription=success URL param after verification Blog Writer WYSIWYG Editor enhancements: - Per-section preview toggle (view/edit icons) - Enhanced hover-based toolbar - Circular SVG progress stats bar with detailed tooltip - Research tool chips in stats bar footer - Per-section TTS with useTextToSpeech hook (browser native) - Full blog preview modal with print/PDF support - PlayAllTTSButton: sequential playback with progress bar - OnThisPageNav: floating sidebar with scroll tracking - Section data attributes for scroll anchoring GSC Brainstorm Topics feature: - Backend: gsc_brainstorm_service.py (rule-based + LLM recommendations) - Backend: POST /gsc/brainstorm endpoint with 3-word minimum validation - Frontend: gscBrainstorm.ts API client - Frontend: useGSCBrainstormConnection hook (popup OAuth, no /onboarding redirect) - Frontend: useGSCBrainstorm hook (connect check + brainstorm call) - Frontend: GSCBrainstormModal (3-tab results: Opportunities, Gaps, AI Recs) - Frontend: BrainstormButton (visible at 3+ words, GSC connect overlay) - Wire BrainstormButton into ManualResearchForm and ResearchAction - Add blog_writer to gsc_auth router features for ALWRITY_ENABLED_FEATURES
323 lines
12 KiB
Python
323 lines
12 KiB
Python
"""
|
|
Link Search Service — Internal & external link discovery and rewording.
|
|
|
|
Provides:
|
|
- Internal link search (Exa include_domains scoped to user's website)
|
|
- External link search (Exa general search, optionally excluding user's domain)
|
|
- Reword-with-links (LLM embeds selected links naturally into section/selected text)
|
|
"""
|
|
|
|
import re
|
|
from typing import Dict, Any, List, Optional
|
|
from loguru import logger
|
|
|
|
from services.llm_providers.main_text_generation import llm_text_gen
|
|
|
|
|
|
# System prompt for the reword-with-links LLM call. It fixes the linking
# rules (descriptive anchors, 1-2 links per paragraph, preserve the original
# text) and requires the model to answer with the reworded text only.
LINK_SEARCH_SYSTEM_PROMPT = """You are an SEO and content linking expert. Your task is to naturally incorporate provided links into text using markdown link syntax, following the best practices below.

## SEO Linking Best Practices

1. **Anchor text must be descriptive and keyword-rich.** Use the surrounding context to create natural, specific anchor text. Never use "click here", "read more", "learn more", or bare URLs as anchors.
- GOOD: [HubSpot's content marketing statistics](url) — descriptive, includes keywords
- BAD: [click here](url) — vague, no SEO value
- BAD: [https://example.com](url) — raw URL, harmful to readability

2. **Match link type to content context:**
- Internal links: Point anchor text at relevant topic keywords that describe the destination page
- External links: Cite authoritative sources (research, official docs, industry leaders) using the source name or key finding as anchor text

3. **Link equity (PageRank) distribution:** Spread links naturally. Aim for 1-2 links per paragraph at most. Don't cluster all links together.

4. **Preserve the original text's meaning, tone, structure, and approximate length.** You are inserting links, NOT rewriting the content.

5. **If selected_text is provided, ONLY modify that specific portion.** The rest of section_text must remain IDENTICAL — character-for-character unchanged.

6. **If selected_text is NOT provided, you may insert links throughout the entire section_text.**

7. **Link placement should feel earned, not forced.** Only insert a link where a reader would genuinely want to learn more. If a link doesn't naturally fit, skip it.

8. **Prioritize high-authority external sources** (research papers, official documentation, industry leaders) when linking externally.

9. **Return ONLY the reworded text.** No explanations, no preamble, no markdown code fences. Just the text with [anchor text](url) links embedded."""
|
|
|
|
|
|
# User prompt template for the reword-with-links LLM call. Filled with
# str.format(); the placeholders are {section_heading}, {section_text},
# {selected_text_block} (may be an empty string) and {links}.
LINK_SEARCH_USER_PROMPT = """## Section Heading
{section_heading}

## Full Section Text
{section_text}

{selected_text_block}

## Available Links to Incorporate
{links}

## Instructions
Carefully read the section text above and insert the most relevant links from the "Available Links" list using markdown format: [descriptive anchor text](url).

Remember:
- Use keyword-rich, descriptive anchor text (NOT "click here" or bare URLs)
- Only insert links where they naturally enhance the reader's experience
- Preserve the original text's meaning, tone, and structure
- Aim for 1-2 links per paragraph maximum
- If no links fit naturally, return the text unchanged

Return ONLY the text with links embedded. No explanations."""
|
|
|
|
|
|
def _extract_domain(url: str) -> str:
|
|
"""Extract the registered domain from a URL.
|
|
|
|
Handles common multi-part TLDs like .co.uk, .com.au, .co.jp, etc.
|
|
Falls back to last two parts for unknown TLDs.
|
|
"""
|
|
url = url.strip()
|
|
if not url:
|
|
return ""
|
|
# Add protocol if missing
|
|
if not url.startswith(("http://", "https://")):
|
|
url = "https://" + url
|
|
# Remove protocol
|
|
domain = re.sub(r"^https?://", "", url)
|
|
# Remove path and query
|
|
domain = domain.split("/")[0].split("?")[0].split("#")[0]
|
|
# Remove port
|
|
domain = domain.split(":")[0]
|
|
# Remove userinfo (user:pass@)
|
|
if "@" in domain:
|
|
domain = domain.split("@")[-1]
|
|
domain = domain.lower().strip()
|
|
if not domain:
|
|
return ""
|
|
|
|
# Known multi-part TLDs (common ccTLDs with second-level domains)
|
|
multi_part_tlds = {
|
|
"co.uk", "org.uk", "ac.uk", "gov.uk", "co.jp", "or.jp", "ne.jp", "ac.jp",
|
|
"co.au", "com.au", "org.au", "net.au", "co.nz", "net.nz", "org.nz",
|
|
"co.in", "net.in", "org.in", "ac.in", "co.kr", "co.za", "org.za", "web.za",
|
|
"com.br", "com.mx", "com.ar", "com.sg", "com.hk", "com.tw", "com.my",
|
|
"com.cn", "org.cn", "net.cn", "ac.ke", "co.ke",
|
|
}
|
|
parts = domain.split(".")
|
|
if len(parts) < 2:
|
|
return domain
|
|
|
|
# Check if last two parts form a known multi-part TLD
|
|
last_two = ".".join(parts[-2:])
|
|
if last_two in multi_part_tlds and len(parts) > 2:
|
|
# e.g. blog.example.co.uk → example.co.uk
|
|
return ".".join(parts[-3:])
|
|
# Default: last two parts (example.com)
|
|
return ".".join(parts[-2:])
|
|
|
|
|
|
def _filter_search_results(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""Filter out results with empty URLs or missing essential fields."""
|
|
filtered = []
|
|
for r in results:
|
|
url = r.get("url", "").strip()
|
|
title = r.get("title", "").strip() or "Untitled"
|
|
if url:
|
|
filtered.append({
|
|
"title": title,
|
|
"url": url,
|
|
"text": r.get("text", ""),
|
|
"publishedDate": r.get("publishedDate", ""),
|
|
"author": r.get("author", ""),
|
|
"score": r.get("score", 0.5),
|
|
})
|
|
return filtered
|
|
|
|
|
|
class LinkSearchService:
    """Service for finding internal/external links and rewording text to include them."""

    # Maximum number of section characters forwarded to the LLM per reword call.
    MAX_SECTION_CHARS = 3000

    async def _run_search(
        self,
        label: str,
        query: str,
        num_results: int,
        user_id: Optional[str],
        **domain_filter: Any,
    ) -> Dict[str, Any]:
        """Run one Exa search and convert failures into warnings.

        Shared by search_internal and search_external, which differ only in
        the domain filter they pass.

        Args:
            label: "Internal" or "External"; used in the error log message.
            query: Search query.
            num_results: Number of results to request.
            user_id: Optional user ID forwarded to the provider.
            **domain_filter: Extra keyword arguments forwarded unchanged to
                the provider's simple_search (include_domains / exclude_domains).

        Returns:
            {"results": [...], "warnings": [...]}; results is empty on failure.
        """
        warnings: List[str] = []
        try:
            # Imported here so an unavailable Exa provider surfaces as a
            # warning on this call rather than an import-time failure.
            from services.blog_writer.research.exa_provider import ExaResearchProvider

            provider = ExaResearchProvider()
            results = await provider.simple_search(
                query=query,
                num_results=num_results,
                user_id=user_id,
                **domain_filter,
            )
            return {"results": _filter_search_results(results), "warnings": warnings}

        except ImportError:
            msg = "Exa provider not available — link search requires Exa API."
            logger.warning(f"[LinkSearchService] {msg}")
            warnings.append(msg)
            return {"results": [], "warnings": warnings}
        except Exception as e:
            logger.error(f"[LinkSearchService] {label} link search failed: {e}")
            warnings.append(f"Search failed: {str(e)}")
            return {"results": [], "warnings": warnings}

    async def search_internal(
        self,
        query: str,
        site_url: str,
        user_id: Optional[str] = None,
        num_results: int = 5,
    ) -> Dict[str, Any]:
        """
        Search for internal links (from the user's own website).

        Args:
            query: Search query (section topic/heading)
            site_url: User's website URL to scope search via include_domains
            user_id: Optional user ID for subscription tracking
            num_results: Number of results to return

        Returns:
            {"results": [...], "warnings": [...]}
        """
        domain = _extract_domain(site_url)

        # Without a domain the search cannot be scoped to the user's site.
        if not domain:
            return {
                "results": [],
                "warnings": [f"Could not extract domain from '{site_url}'"],
            }

        return await self._run_search(
            "Internal",
            query,
            num_results,
            user_id,
            include_domains=[domain],
        )

    async def search_external(
        self,
        query: str,
        site_url: Optional[str] = None,
        user_id: Optional[str] = None,
        num_results: int = 5,
    ) -> Dict[str, Any]:
        """
        Search for external links (optionally excluding the user's own domain).

        Args:
            query: Search query
            site_url: User's website URL — results from this domain will be excluded
            user_id: Optional user ID for subscription tracking
            num_results: Number of results to return

        Returns:
            {"results": [...], "warnings": [...]}
        """
        exclude_domains = None

        if site_url:
            domain = _extract_domain(site_url)
            if domain:
                exclude_domains = [domain]

        return await self._run_search(
            "External",
            query,
            num_results,
            user_id,
            exclude_domains=exclude_domains,
        )

    def reword_with_links(
        self,
        section_text: str,
        links: List[Dict[str, str]],
        section_heading: Optional[str] = None,
        selected_text: Optional[str] = None,
        user_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Use LLM to reword text, naturally incorporating the selected links.

        The original text is returned unchanged, with a warning, when no
        usable link is given, when the LLM returns nothing, or when the LLM
        call raises. Only the first MAX_SECTION_CHARS characters of
        section_text are sent to the LLM; a warning is added when the text
        is longer than that.

        Args:
            section_text: Full section text
            links: List of {"url": str, "title": str} dicts; entries without
                a URL are skipped
            section_heading: Optional section heading for context
            selected_text: If provided, only reword this portion of the text
            user_id: Optional user ID for LLM routing

        Returns:
            {"reworded_text": str, "warnings": [...]}
        """
        warnings: List[str] = []

        if not links:
            return {
                "reworded_text": section_text,
                "warnings": ["No links provided — returning original text unchanged."],
            }

        # A link without a URL would reach the prompt as "[title]()", which
        # the model cannot embed meaningfully.
        usable_links = [link for link in links if str(link.get("url") or "").strip()]
        if not usable_links:
            return {
                "reworded_text": section_text,
                "warnings": ["None of the provided links has a URL — returning original text unchanged."],
            }
        skipped = len(links) - len(usable_links)
        if skipped:
            warnings.append(f"Skipped {skipped} link(s) without a URL.")

        links_text = "\n".join(
            f"- [{link.get('title', 'Untitled')}]({link.get('url', '')}) — {link.get('title', '')}"
            for link in usable_links
        )

        selected_text_block = ""
        if selected_text:
            selected_text_block = f"Selected text to reword (keep surrounding text unchanged):\n{selected_text}"

        # The prompt carries a capped slice of the section; tell the caller
        # instead of silently handing back a shortened section.
        limit = self.MAX_SECTION_CHARS
        if len(section_text) > limit:
            warnings.append(
                f"Section text is longer than {limit} characters — only the first {limit} "
                "were sent for rewording, so the returned text may be incomplete."
            )

        prompt = LINK_SEARCH_USER_PROMPT.format(
            section_heading=section_heading or "Blog Section",
            section_text=section_text[:limit],
            selected_text_block=selected_text_block,
            links=links_text,
        )

        try:
            result = llm_text_gen(
                prompt=prompt,
                system_prompt=LINK_SEARCH_SYSTEM_PROMPT,
                json_struct=None,
                max_tokens=3000,
                user_id=user_id,
            )

            # The result is handled both as a dict with a "text" key and as a
            # plain value; a None text counts as an empty response.
            if isinstance(result, dict):
                raw = str(result.get("text") or "")
            elif result:
                raw = str(result)
            else:
                raw = ""
            raw = raw.strip()

            # Strip markdown code fences if the LLM wrapped the output
            if raw.startswith("```"):
                match = re.search(r"```(?:markdown|md)?\s*(.*?)\s*```", raw, re.DOTALL)
                if match:
                    raw = match.group(1).strip()

            if not raw:
                warnings.append("LLM returned empty reworded text — returning original.")
                return {"reworded_text": section_text, "warnings": warnings}

            logger.info(f"[LinkSearchService] Reworded text: {len(raw)} chars, {len(links)} links provided")
            return {"reworded_text": raw, "warnings": warnings}

        except Exception as e:
            # Best effort: any LLM failure falls back to the original text.
            logger.error(f"[LinkSearchService] Reword failed: {e}")
            warnings.append(f"Reword failed: {str(e)}")
            return {"reworded_text": section_text, "warnings": warnings}
|
|
|
|
|
|
# Registry of service instances, one per user key. The service holds no
# state, so a single shared instance would work as well; the per-user
# registry is kept for consistency with the chart_service pattern.
_link_search_instances: Dict[str, LinkSearchService] = {}


def get_link_search_service(user_id: Optional[str] = None) -> LinkSearchService:
    """Return the cached LinkSearchService for ``user_id``, creating it on first use.

    A missing or empty ``user_id`` maps to the shared "default" entry.
    """
    registry_key = user_id if user_id else "default"
    try:
        return _link_search_instances[registry_key]
    except KeyError:
        service = LinkSearchService()
        _link_search_instances[registry_key] = service
        return service