""" Link Search Service — Internal & external link discovery and rewording. Provides: - Internal link search (Exa include_domains scoped to user's website) - External link search (Exa general search, optionally excluding user's domain) - Reword-with-links (LLM embeds selected links naturally into section/selected text) """ import re from typing import Dict, Any, List, Optional from loguru import logger from services.llm_providers.main_text_generation import llm_text_gen LINK_SEARCH_SYSTEM_PROMPT = """You are an SEO and content linking expert. Your task is to naturally incorporate provided links into text using markdown link syntax, following the best practices below. ## SEO Linking Best Practices 1. **Anchor text must be descriptive and keyword-rich.** Use the surrounding context to create natural, specific anchor text. Never use "click here", "read more", "learn more", or bare URLs as anchors. - GOOD: [HubSpot's content marketing statistics](url) — descriptive, includes keywords - BAD: [click here](url) — vague, no SEO value - BAD: [https://example.com](url) — raw URL, harmful to readability 2. **Match link type to content context:** - Internal links: Point anchor text at relevant topic keywords that describe the destination page - External links: Cite authoritative sources (research, official docs, industry leaders) using the source name or key finding as anchor text 3. **Link equity (PageRank) distribution:** Spread links naturally. Aim for 1-2 links per paragraph at most. Don't cluster all links together. 4. **Preserve the original text's meaning, tone, structure, and approximate length.** You are inserting links, NOT rewriting the content. 5. **If selected_text is provided, ONLY modify that specific portion.** The rest of section_text must remain IDENTICAL — character-for-character unchanged. 6. **If selected_text is NOT provided, you may insert links throughout the entire section_text.** 7. **Link placement should feel earned, not forced.** Only insert a link where a reader would genuinely want to learn more. If a link doesn't naturally fit, skip it. 8. **Prioritize high-authority external sources** (research papers, official documentation, industry leaders) when linking externally. 9. **Return ONLY the reworded text.** No explanations, no preamble, no markdown code fences. Just the text with [anchor text](url) links embedded.""" LINK_SEARCH_USER_PROMPT = """## Section Heading {section_heading} ## Full Section Text {section_text} {selected_text_block} ## Available Links to Incorporate {links} ## Instructions Carefully read the section text above and insert the most relevant links from the "Available Links" list using markdown format: [descriptive anchor text](url). Remember: - Use keyword-rich, descriptive anchor text (NOT "click here" or bare URLs) - Only insert links where they naturally enhance the reader's experience - Preserve the original text's meaning, tone, and structure - Aim for 1-2 links per paragraph maximum - If no links fit naturally, return the text unchanged Return ONLY the text with links embedded. No explanations.""" def _extract_domain(url: str) -> str: """Extract the registered domain from a URL. Handles common multi-part TLDs like .co.uk, .com.au, .co.jp, etc. Falls back to last two parts for unknown TLDs. """ url = url.strip() if not url: return "" # Add protocol if missing if not url.startswith(("http://", "https://")): url = "https://" + url # Remove protocol domain = re.sub(r"^https?://", "", url) # Remove path and query domain = domain.split("/")[0].split("?")[0].split("#")[0] # Remove port domain = domain.split(":")[0] # Remove userinfo (user:pass@) if "@" in domain: domain = domain.split("@")[-1] domain = domain.lower().strip() if not domain: return "" # Known multi-part TLDs (common ccTLDs with second-level domains) multi_part_tlds = { "co.uk", "org.uk", "ac.uk", "gov.uk", "co.jp", "or.jp", "ne.jp", "ac.jp", "co.au", "com.au", "org.au", "net.au", "co.nz", "net.nz", "org.nz", "co.in", "net.in", "org.in", "ac.in", "co.kr", "co.za", "org.za", "web.za", "com.br", "com.mx", "com.ar", "com.sg", "com.hk", "com.tw", "com.my", "com.cn", "org.cn", "net.cn", "ac.ke", "co.ke", } parts = domain.split(".") if len(parts) < 2: return domain # Check if last two parts form a known multi-part TLD last_two = ".".join(parts[-2:]) if last_two in multi_part_tlds and len(parts) > 2: # e.g. blog.example.co.uk → example.co.uk return ".".join(parts[-3:]) # Default: last two parts (example.com) return ".".join(parts[-2:]) def _filter_search_results(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Filter out results with empty URLs or missing essential fields.""" filtered = [] for r in results: url = r.get("url", "").strip() title = r.get("title", "").strip() or "Untitled" if url: filtered.append({ "title": title, "url": url, "text": r.get("text", ""), "publishedDate": r.get("publishedDate", ""), "author": r.get("author", ""), "score": r.get("score", 0.5), }) return filtered class LinkSearchService: """Service for finding internal/external links and rewording text to include them.""" async def search_internal( self, query: str, site_url: str, user_id: Optional[str] = None, num_results: int = 5, ) -> Dict[str, Any]: """ Search for internal links (from the user's own website). Args: query: Search query (section topic/heading) site_url: User's website URL to scope search via include_domains user_id: Optional user ID for subscription tracking num_results: Number of results to return Returns: {"results": [...], "warnings": [...]} """ warnings = [] domain = _extract_domain(site_url) if not domain: return { "results": [], "warnings": [f"Could not extract domain from '{site_url}'"], } try: from services.blog_writer.research.exa_provider import ExaResearchProvider provider = ExaResearchProvider() results = await provider.simple_search( query=query, num_results=num_results, user_id=user_id, include_domains=[domain], ) filtered = _filter_search_results(results) return {"results": filtered, "warnings": warnings} except ImportError: msg = "Exa provider not available — link search requires Exa API." logger.warning(f"[LinkSearchService] {msg}") warnings.append(msg) return {"results": [], "warnings": warnings} except Exception as e: logger.error(f"[LinkSearchService] Internal link search failed: {e}") warnings.append(f"Search failed: {str(e)}") return {"results": [], "warnings": warnings} async def search_external( self, query: str, site_url: Optional[str] = None, user_id: Optional[str] = None, num_results: int = 5, ) -> Dict[str, Any]: """ Search for external links (optionally excluding the user's own domain). Args: query: Search query site_url: User's website URL — results from this domain will be excluded user_id: Optional user ID for subscription tracking num_results: Number of results to return Returns: {"results": [...], "warnings": [...]} """ warnings = [] exclude_domains = None if site_url: domain = _extract_domain(site_url) if domain: exclude_domains = [domain] try: from services.blog_writer.research.exa_provider import ExaResearchProvider provider = ExaResearchProvider() results = await provider.simple_search( query=query, num_results=num_results, user_id=user_id, exclude_domains=exclude_domains, ) filtered = _filter_search_results(results) return {"results": filtered, "warnings": warnings} except ImportError: msg = "Exa provider not available — link search requires Exa API." logger.warning(f"[LinkSearchService] {msg}") warnings.append(msg) return {"results": [], "warnings": warnings} except Exception as e: logger.error(f"[LinkSearchService] External link search failed: {e}") warnings.append(f"Search failed: {str(e)}") return {"results": [], "warnings": warnings} def reword_with_links( self, section_text: str, links: List[Dict[str, str]], section_heading: Optional[str] = None, selected_text: Optional[str] = None, user_id: Optional[str] = None, ) -> Dict[str, Any]: """ Use LLM to reword text, naturally incorporating the selected links. Args: section_text: Full section text links: List of {"url": str, "title": str} dicts section_heading: Optional section heading for context selected_text: If provided, only reword this portion of the text user_id: Optional user ID for LLM routing Returns: {"reworded_text": str, "warnings": [...]} """ warnings = [] if not links: return { "reworded_text": section_text, "warnings": ["No links provided — returning original text unchanged."], } links_text = "\n".join( f"- [{link.get('title', 'Untitled')}]({link.get('url', '')}) — {link.get('title', '')}" for link in links ) selected_text_block = "" if selected_text: selected_text_block = f"Selected text to reword (keep surrounding text unchanged):\n{selected_text}" prompt = LINK_SEARCH_USER_PROMPT.format( section_heading=section_heading or "Blog Section", section_text=section_text[:3000], selected_text_block=selected_text_block, links=links_text, ) try: result = llm_text_gen( prompt=prompt, system_prompt=LINK_SEARCH_SYSTEM_PROMPT, json_struct=None, max_tokens=3000, user_id=user_id, ) raw = result.get("text", "") if isinstance(result, dict) else str(result) if result else "" raw = raw.strip() # Strip markdown code fences if the LLM wrapped the output if raw.startswith("```"): match = re.search(r"```(?:markdown|md)?\s*(.*?)\s*```", raw, re.DOTALL) if match: raw = match.group(1).strip() if not raw: warnings.append("LLM returned empty reworded text — returning original.") return {"reworded_text": section_text, "warnings": warnings} logger.info(f"[LinkSearchService] Reworded text: {len(raw)} chars, {len(links)} links provided") return {"reworded_text": raw, "warnings": warnings} except Exception as e: logger.error(f"[LinkSearchService] Reword failed: {e}") warnings.append(f"Reword failed: {str(e)}") return {"reworded_text": section_text, "warnings": warnings} # Per-user service instances (not strictly needed since service is stateless, # but kept for consistency with chart_service pattern) _link_search_instances: Dict[str, LinkSearchService] = {} def get_link_search_service(user_id: Optional[str] = None) -> LinkSearchService: """Get or create LinkSearchService for the given user.""" cache_key = user_id or "default" if cache_key not in _link_search_instances: _link_search_instances[cache_key] = LinkSearchService() return _link_search_instances[cache_key]