Files
ALwrity/backend/services/link_search_service.py
ajaysi 644e72d289 feat: Brainstorm Topics with GSC + Issue #518 fixes + Blog Editor enhancements
Issue #518 - Subscription not updating after checkout:
- Fix stale closure in SubscriptionContext checkout polling (use subscriptionRef)
- Move checkout success polling from InitialRouteHandler into SubscriptionContext
- Remove redundant polling code from InitialRouteHandler
- Fix plan label: 'Free' instead of 'No Plan', proper capitalization
- Add plan refresh button in UserBadge
- Add 'View Costing Details' to UserBadge dropdown
- Rename 'ALwrity Podcast Maker' to 'Podcast Creator' across UI
- Clean subscription=success URL param after verification

Blog Writer WYSIWYG Editor enhancements:
- Per-section preview toggle (view/edit icons)
- Enhanced hover-based toolbar
- Circular SVG progress stats bar with detailed tooltip
- Research tool chips in stats bar footer
- Per-section TTS with useTextToSpeech hook (browser native)
- Full blog preview modal with print/PDF support
- PlayAllTTSButton: sequential playback with progress bar
- OnThisPageNav: floating sidebar with scroll tracking
- Section data attributes for scroll anchoring

GSC Brainstorm Topics feature:
- Backend: gsc_brainstorm_service.py (rule-based + LLM recommendations)
- Backend: POST /gsc/brainstorm endpoint with 3-word minimum validation
- Frontend: gscBrainstorm.ts API client
- Frontend: useGSCBrainstormConnection hook (popup OAuth, no /onboarding redirect)
- Frontend: useGSCBrainstorm hook (connect check + brainstorm call)
- Frontend: GSCBrainstormModal (3-tab results: Opportunities, Gaps, AI Recs)
- Frontend: BrainstormButton (visible at 3+ words, GSC connect overlay)
- Wire BrainstormButton into ManualResearchForm and ResearchAction
- Add blog_writer to gsc_auth router features for ALWRITY_ENABLED_FEATURES
2026-05-20 22:44:15 +05:30

323 lines
12 KiB
Python

"""
Link Search Service — Internal & external link discovery and rewording.
Provides:
- Internal link search (Exa include_domains scoped to user's website)
- External link search (Exa general search, optionally excluding user's domain)
- Reword-with-links (LLM embeds selected links naturally into section/selected text)
"""
import re
from typing import Dict, Any, List, Optional
from loguru import logger
from services.llm_providers.main_text_generation import llm_text_gen
# System prompt for the link-rewording LLM call: it is passed as ``system_prompt``
# to ``llm_text_gen`` by ``LinkSearchService.reword_with_links``. It lays out the
# linking rules the model must follow and demands that the reply contain only the
# reworded text. This is a runtime string — edit the wording deliberately, since
# any change alters what the model is told.
LINK_SEARCH_SYSTEM_PROMPT = """You are an SEO and content linking expert. Your task is to naturally incorporate provided links into text using markdown link syntax, following the best practices below.
## SEO Linking Best Practices
1. **Anchor text must be descriptive and keyword-rich.** Use the surrounding context to create natural, specific anchor text. Never use "click here", "read more", "learn more", or bare URLs as anchors.
- GOOD: [HubSpot's content marketing statistics](url) — descriptive, includes keywords
- BAD: [click here](url) — vague, no SEO value
- BAD: [https://example.com](url) — raw URL, harmful to readability
2. **Match link type to content context:**
- Internal links: Point anchor text at relevant topic keywords that describe the destination page
- External links: Cite authoritative sources (research, official docs, industry leaders) using the source name or key finding as anchor text
3. **Link equity (PageRank) distribution:** Spread links naturally. Aim for 1-2 links per paragraph at most. Don't cluster all links together.
4. **Preserve the original text's meaning, tone, structure, and approximate length.** You are inserting links, NOT rewriting the content.
5. **If selected_text is provided, ONLY modify that specific portion.** The rest of section_text must remain IDENTICAL — character-for-character unchanged.
6. **If selected_text is NOT provided, you may insert links throughout the entire section_text.**
7. **Link placement should feel earned, not forced.** Only insert a link where a reader would genuinely want to learn more. If a link doesn't naturally fit, skip it.
8. **Prioritize high-authority external sources** (research papers, official documentation, industry leaders) when linking externally.
9. **Return ONLY the reworded text.** No explanations, no preamble, no markdown code fences. Just the text with [anchor text](url) links embedded."""
# User-prompt template for the link-rewording LLM call. It is filled in with
# ``str.format`` by ``LinkSearchService.reword_with_links`` using exactly four
# placeholders: ``section_heading``, ``section_text``, ``selected_text_block``
# (an empty string when no selection was made) and ``links``.
# Because the template goes through ``str.format``, any literal ``{`` or ``}``
# added to it must be doubled (``{{`` / ``}}``).
LINK_SEARCH_USER_PROMPT = """## Section Heading
{section_heading}
## Full Section Text
{section_text}
{selected_text_block}
## Available Links to Incorporate
{links}
## Instructions
Carefully read the section text above and insert the most relevant links from the "Available Links" list using markdown format: [descriptive anchor text](url).
Remember:
- Use keyword-rich, descriptive anchor text (NOT "click here" or bare URLs)
- Only insert links where they naturally enhance the reader's experience
- Preserve the original text's meaning, tone, and structure
- Aim for 1-2 links per paragraph maximum
- If no links fit naturally, return the text unchanged
Return ONLY the text with links embedded. No explanations."""
def _extract_domain(url: str) -> str:
"""Extract the registered domain from a URL.
Handles common multi-part TLDs like .co.uk, .com.au, .co.jp, etc.
Falls back to last two parts for unknown TLDs.
"""
url = url.strip()
if not url:
return ""
# Add protocol if missing
if not url.startswith(("http://", "https://")):
url = "https://" + url
# Remove protocol
domain = re.sub(r"^https?://", "", url)
# Remove path and query
domain = domain.split("/")[0].split("?")[0].split("#")[0]
# Remove port
domain = domain.split(":")[0]
# Remove userinfo (user:pass@)
if "@" in domain:
domain = domain.split("@")[-1]
domain = domain.lower().strip()
if not domain:
return ""
# Known multi-part TLDs (common ccTLDs with second-level domains)
multi_part_tlds = {
"co.uk", "org.uk", "ac.uk", "gov.uk", "co.jp", "or.jp", "ne.jp", "ac.jp",
"co.au", "com.au", "org.au", "net.au", "co.nz", "net.nz", "org.nz",
"co.in", "net.in", "org.in", "ac.in", "co.kr", "co.za", "org.za", "web.za",
"com.br", "com.mx", "com.ar", "com.sg", "com.hk", "com.tw", "com.my",
"com.cn", "org.cn", "net.cn", "ac.ke", "co.ke",
}
parts = domain.split(".")
if len(parts) < 2:
return domain
# Check if last two parts form a known multi-part TLD
last_two = ".".join(parts[-2:])
if last_two in multi_part_tlds and len(parts) > 2:
# e.g. blog.example.co.uk → example.co.uk
return ".".join(parts[-3:])
# Default: last two parts (example.com)
return ".".join(parts[-2:])
def _filter_search_results(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Filter out results with empty URLs or missing essential fields."""
filtered = []
for r in results:
url = r.get("url", "").strip()
title = r.get("title", "").strip() or "Untitled"
if url:
filtered.append({
"title": title,
"url": url,
"text": r.get("text", ""),
"publishedDate": r.get("publishedDate", ""),
"author": r.get("author", ""),
"score": r.get("score", 0.5),
})
return filtered
class LinkSearchService:
    """Service for finding internal/external links and rewording text to include them.

    The class defines no ``__init__`` and stores nothing on ``self``; every
    method works only from its arguments.
    """

    # Upper bound on the number of section characters sent to the LLM in one
    # reword request. Longer sections are reworded up to this point and the
    # remainder is re-attached unchanged.
    MAX_SECTION_CHARS = 3000

    async def _run_search(
        self,
        kind: str,
        query: str,
        num_results: int,
        user_id: Optional[str],
        **domain_filter: Any,
    ) -> Dict[str, Any]:
        """Run one Exa search and convert any failure into a warning.

        Args:
            kind: ``"Internal"`` or ``"External"``; used only in the error log line.
            query: Search query.
            num_results: Number of results to request.
            user_id: Optional user ID forwarded to the provider.
            **domain_filter: Extra keyword arguments forwarded verbatim to
                ``simple_search`` (``include_domains`` / ``exclude_domains``).

        Returns:
            {"results": [...], "warnings": [...]} — never raises.
        """
        warnings: List[str] = []
        try:
            # Imported lazily so a missing Exa integration degrades to a
            # warning instead of breaking import of this module.
            from services.blog_writer.research.exa_provider import ExaResearchProvider

            provider = ExaResearchProvider()
            results = await provider.simple_search(
                query=query,
                num_results=num_results,
                user_id=user_id,
                **domain_filter,
            )
            return {"results": _filter_search_results(results), "warnings": warnings}
        except ImportError:
            msg = "Exa provider not available — link search requires Exa API."
            logger.warning(f"[LinkSearchService] {msg}")
            warnings.append(msg)
            return {"results": [], "warnings": warnings}
        except Exception as e:
            # Boundary handler: callers receive the failure as a warning.
            logger.error(f"[LinkSearchService] {kind} link search failed: {e}")
            warnings.append(f"Search failed: {str(e)}")
            return {"results": [], "warnings": warnings}

    async def search_internal(
        self,
        query: str,
        site_url: str,
        user_id: Optional[str] = None,
        num_results: int = 5,
    ) -> Dict[str, Any]:
        """
        Search for internal links (from the user's own website).

        Args:
            query: Search query (section topic/heading)
            site_url: User's website URL to scope search via include_domains
            user_id: Optional user ID for subscription tracking
            num_results: Number of results to return

        Returns:
            {"results": [...], "warnings": [...]}. When no domain can be
            extracted from ``site_url`` the search is not attempted and a
            warning explains why.
        """
        domain = _extract_domain(site_url)
        if not domain:
            return {
                "results": [],
                "warnings": [f"Could not extract domain from '{site_url}'"],
            }
        return await self._run_search(
            "Internal",
            query,
            num_results,
            user_id,
            include_domains=[domain],
        )

    async def search_external(
        self,
        query: str,
        site_url: Optional[str] = None,
        user_id: Optional[str] = None,
        num_results: int = 5,
    ) -> Dict[str, Any]:
        """
        Search for external links (optionally excluding the user's own domain).

        Args:
            query: Search query
            site_url: User's website URL — results from this domain will be excluded
            user_id: Optional user ID for subscription tracking
            num_results: Number of results to return

        Returns:
            {"results": [...], "warnings": [...]}
        """
        exclude_domains = None
        if site_url:
            domain = _extract_domain(site_url)
            if domain:
                exclude_domains = [domain]
        return await self._run_search(
            "External",
            query,
            num_results,
            user_id,
            exclude_domains=exclude_domains,
        )

    def reword_with_links(
        self,
        section_text: str,
        links: List[Dict[str, str]],
        section_heading: Optional[str] = None,
        selected_text: Optional[str] = None,
        user_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Use LLM to reword text, naturally incorporating the selected links.

        Links without a URL are skipped. Only the first ``MAX_SECTION_CHARS``
        characters of ``section_text`` are sent to the LLM; any remainder is
        appended unchanged to the reworded text so the section is never
        shortened. On any failure the original ``section_text`` is returned
        together with a warning.

        Args:
            section_text: Full section text
            links: List of {"url": str, "title": str} dicts
            section_heading: Optional section heading for context
            selected_text: If provided, only reword this portion of the text
            user_id: Optional user ID for LLM routing

        Returns:
            {"reworded_text": str, "warnings": [...]}
        """
        warnings: List[str] = []
        if not links:
            return {
                "reworded_text": section_text,
                "warnings": ["No links provided — returning original text unchanged."],
            }

        # A link with no URL would reach the model as "[title]()", which it can
        # only turn into a broken markdown link — drop such entries up front.
        usable_links = [link for link in links if (link.get("url") or "").strip()]
        if not usable_links:
            return {
                "reworded_text": section_text,
                "warnings": ["None of the provided links has a URL — returning original text unchanged."],
            }
        skipped = len(links) - len(usable_links)
        if skipped:
            warnings.append(f"Skipped {skipped} link(s) without a URL.")

        links_text = "\n".join(
            f"- [{link.get('title', 'Untitled')}]({link.get('url', '')}) — {link.get('title', '')}"
            for link in usable_links
        )
        selected_text_block = ""
        if selected_text:
            selected_text_block = f"Selected text to reword (keep surrounding text unchanged):\n{selected_text}"

        # Split the section so that the part the model never sees can be put
        # back after rewording.
        limit = self.MAX_SECTION_CHARS
        head = section_text[:limit]
        tail = section_text[limit:]

        prompt = LINK_SEARCH_USER_PROMPT.format(
            section_heading=section_heading or "Blog Section",
            section_text=head,
            selected_text_block=selected_text_block,
            links=links_text,
        )
        try:
            result = llm_text_gen(
                prompt=prompt,
                system_prompt=LINK_SEARCH_SYSTEM_PROMPT,
                json_struct=None,
                max_tokens=3000,
                user_id=user_id,
            )
            # The generator may hand back either a dict with a "text" key or a
            # plain value; normalise both to a string.
            raw = result.get("text", "") if isinstance(result, dict) else str(result) if result else ""
            raw = raw.strip()
            # Strip markdown code fences if the LLM wrapped the output
            if raw.startswith("```"):
                match = re.search(r"```(?:markdown|md)?\s*(.*?)\s*```", raw, re.DOTALL)
                if match:
                    raw = match.group(1).strip()
            if not raw:
                warnings.append("LLM returned empty reworded text — returning original.")
                return {"reworded_text": section_text, "warnings": warnings}
            if tail:
                # ``raw`` was stripped above, so restore whatever whitespace sat
                # at the cut point before re-attaching the untouched remainder.
                cut_whitespace = head[len(head.rstrip()):]
                raw = raw + cut_whitespace + tail
                warnings.append(
                    f"Section text exceeds {limit} characters — only the first {limit} "
                    "were reworded; the remainder was appended unchanged."
                )
            logger.info(f"[LinkSearchService] Reworded text: {len(raw)} chars, {len(links)} links provided")
            return {"reworded_text": raw, "warnings": warnings}
        except Exception as e:
            # Boundary handler: a failed reword must never lose the user's text.
            logger.error(f"[LinkSearchService] Reword failed: {e}")
            warnings.append(f"Reword failed: {str(e)}")
            return {"reworded_text": section_text, "warnings": warnings}
# Registry of LinkSearchService objects, keyed by user ID (or "default" when no
# user ID is given). The service holds no state, so one shared object would
# behave the same; the per-user layout is kept for consistency with the
# chart_service pattern. Entries are added on first use and never removed.
_link_search_instances: Dict[str, LinkSearchService] = {}


def get_link_search_service(user_id: Optional[str] = None) -> LinkSearchService:
    """Return the cached LinkSearchService for ``user_id``, creating it on first use.

    A missing or empty ``user_id`` maps to the shared ``"default"`` entry.
    """
    key = user_id if user_id else "default"
    try:
        return _link_search_instances[key]
    except KeyError:
        service = LinkSearchService()
        _link_search_instances[key] = service
        return service