"""Deep website scraper for backlink outreach discovery. Orchestrates Exa neural search + DuckDuckGo fallback to find guest-post opportunities with full-page content extraction and quality scoring. """ from __future__ import annotations import asyncio import re from typing import Any, Dict, List, Optional from urllib.parse import quote, urlparse import httpx from bs4 import BeautifulSoup from loguru import logger class BacklinkOutreachScraper: """Scrapes websites for backlink outreach opportunities using Exa + DuckDuckGo.""" GUEST_POST_KEYWORDS = [ "write for us", "guest post", "submit guest post", "guest contributor", "become a guest blogger", "guest bloggers wanted", "add guest post", "submit article", "guest post opportunities", "contribute to our blog", "write for our blog", ] def __init__(self, user_id: Optional[str] = None): self.user_id = user_id self._exa_svc = None # -- Public API -- async def deep_discover( self, keyword: str, max_results: int = 15, scrape_timeout_seconds: float = 15.0, scrape_max_concurrency: int = 5, ) -> Dict[str, Any]: """Discover guest-post opportunities using Exa, falling back to DuckDuckGo.""" if self._is_exa_available(): logger.info(f"[BacklinkScraper] Using Exa for keyword: {keyword}") return await self._discover_with_exa(keyword, max_results) logger.info(f"[BacklinkScraper] Exa unavailable, falling back to DuckDuckGo for: {keyword}") return await self._discover_with_duckduckgo( keyword, max_results, scrape_timeout_seconds=scrape_timeout_seconds, scrape_max_concurrency=scrape_max_concurrency, ) async def scrape_urls( self, urls: List[str], timeout_seconds: float = 15.0, max_concurrency: int = 5, ) -> List[Dict[str, Any]]: """Fetch full page content with non-blocking fallbacks and bounded concurrency.""" exa = self._get_exa_sdk() if not exa: return await self._scrape_urls_fallback( urls, timeout_seconds=timeout_seconds, max_concurrency=max_concurrency ) loop = asyncio.get_running_loop() try: result = await loop.run_in_executor( None, lambda: exa.get_contents(urls, text={"max_characters": 5000}) ) return self._parse_get_contents_result(result) except Exception as e: logger.warning(f"[BacklinkScraper] Exa get_contents failed: {e}") return await self._scrape_urls_fallback( urls, timeout_seconds=timeout_seconds, max_concurrency=max_concurrency ) # -- Availability -- def _is_exa_available(self) -> bool: try: exa = self._get_exa_sdk() return exa is not None except Exception: return False def _get_exa_sdk(self): """Get Exa SDK instance via ExaService, respecting per-user API key.""" if self._exa_svc is None: from services.research.exa_service import ExaService self._exa_svc = ExaService() self._exa_svc._try_initialize() return self._exa_svc.exa if self._exa_svc.enabled else None # -- Preflight & Usage Tracking -- def _preflight_subscription_check(self, user_id: str) -> bool: """Check Exa usage limits. Returns True if allowed.""" if not user_id: return True try: from services.database import get_session_for_user from services.subscription import PricingService from models.subscription_models import APIProvider db = get_session_for_user(user_id) if not db: return True try: pricing = PricingService(db) allowed, _, _ = pricing.check_usage_limits( user_id=user_id, provider=APIProvider.EXA, tokens_requested=0, ) return allowed finally: db.close() except Exception as e: logger.warning(f"[BacklinkScraper] Preflight check failed: {e}") return True def _track_exa_usage(self, user_id: str, cost: float = 0.005): """Record Exa usage after successful search.""" if not user_id: return try: from services.database import get_session_for_user from services.subscription import PricingService from sqlalchemy import text as sql_text db = get_session_for_user(user_id) if not db: return try: pricing = PricingService(db) period = pricing.get_current_billing_period(user_id) db.execute(sql_text(""" UPDATE usage_summaries SET exa_calls = COALESCE(exa_calls, 0) + 1, exa_cost = COALESCE(exa_cost, 0) + :cost, total_calls = total_calls + 1, total_cost = total_cost + :cost WHERE user_id = :user_id AND billing_period = :period """), {"cost": cost, "user_id": user_id, "period": period}) db.commit() finally: db.close() except Exception as e: logger.warning(f"[BacklinkScraper] Usage tracking failed: {e}") # -- Exa Discovery -- async def _discover_with_exa(self, keyword: str, max_results: int) -> Dict[str, Any]: exa = self._get_exa_sdk() if not exa: return await self._discover_with_duckduckgo(keyword, max_results) queries = self._generate_search_queries(keyword) dedup: Dict[str, Dict[str, Any]] = {} results_per_query = max(1, max_results // len(queries)) for query in queries[:4]: rows = await self._exa_search_and_contents(exa, query, results_per_query) for row in rows: norm_url = self._normalize_url(row.get("url", "")) if not norm_url or norm_url in dedup: continue dedup[norm_url] = row if len(dedup) >= max_results: break opportunities = self._build_enriched_opportunities(dedup, keyword, "exa") self._track_exa_usage(self.user_id) return { "keyword": keyword, "source": "exa", "total_found": len(opportunities), "opportunities": opportunities, } async def _exa_search_and_contents( self, exa, query: str, num_results: int ) -> List[Dict[str, Any]]: """Run Exa search_and_contents in executor to avoid blocking.""" loop = asyncio.get_running_loop() try: result = await loop.run_in_executor( None, lambda: exa.search_and_contents( query, type="auto", num_results=num_results, text={"max_characters": 3000}, highlights={"num_sentences": 3, "highlights_per_url": 3}, ), ) return self._parse_search_and_contents_result(result) except Exception as e: logger.warning(f"[BacklinkScraper] Exa search_and_contents failed: {e}") return [] def _parse_search_and_contents_result(self, result) -> List[Dict[str, Any]]: rows = [] results = getattr(result, "results", []) for r in results: rows.append({ "url": getattr(r, "url", ""), "title": getattr(r, "title", ""), "text": getattr(r, "text", ""), "highlights": getattr(r, "highlights", []), "summary": getattr(r, "summary", ""), "score": getattr(r, "score", 0.5), "published_date": getattr(r, "publishedDate", None), }) return rows def _parse_get_contents_result(self, result) -> List[Dict[str, Any]]: rows = [] results = getattr(result, "results", []) for r in results: rows.append({ "url": getattr(r, "url", ""), "title": getattr(r, "title", ""), "text": getattr(r, "text", ""), "highlights": getattr(r, "highlights", []), "summary": getattr(r, "summary", ""), }) return rows # -- DuckDuckGo Fallback Discovery -- async def _discover_with_duckduckgo( self, keyword: str, max_results: int, scrape_timeout_seconds: float = 15.0, scrape_max_concurrency: int = 5, ) -> Dict[str, Any]: queries = self._generate_search_queries(keyword) dedup: Dict[str, Dict[str, Any]] = {} async with httpx.AsyncClient(timeout=httpx.Timeout(12.0), follow_redirects=True) as client: for query in queries[:4]: rows = await self._duckduckgo_search(query, client=client) for row in rows: norm_url = self._normalize_url(row.get("url", "")) if not norm_url or norm_url in dedup: continue dedup[norm_url] = row if len(dedup) >= max_results: break await asyncio.sleep(0.4) # Scrape discovered URLs with Exa get_contents (or fallback) urls_to_scrape = list(dedup.keys())[:max_results] scraped = await self.scrape_urls( urls_to_scrape, timeout_seconds=scrape_timeout_seconds, max_concurrency=scrape_max_concurrency, ) scraped_map = {self._normalize_url(s.get("url", "")): s for s in scraped} # Merge DDG results with scraped content merged = {} for norm_url, ddg_row in dedup.items(): full = scraped_map.get(norm_url, {}) merged[norm_url] = { "url": norm_url, "title": full.get("title") or ddg_row.get("title", ""), "text": full.get("text", ""), "highlights": full.get("highlights", ddg_row.get("highlights", [])), "summary": full.get("summary", ddg_row.get("snippet", "")), "snippet": ddg_row.get("snippet", ""), "score": 0.5, } opportunities = self._build_enriched_opportunities(merged, keyword, "duckduckgo") return { "keyword": keyword, "source": "duckduckgo", "total_found": len(opportunities), "opportunities": opportunities, } async def _duckduckgo_search( self, query: str, retries: int = 2, client: Optional[httpx.AsyncClient] = None, ) -> List[Dict[str, Any]]: encoded = quote(query) url = f"https://duckduckgo.com/html/?q={encoded}" headers = {"User-Agent": "Mozilla/5.0 ALwrityBacklinkBot/1.0"} async def _request(active_client: httpx.AsyncClient) -> List[Dict[str, Any]]: for attempt in range(retries + 1): try: resp = await active_client.get(url, headers=headers) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") results = [] for result in soup.select("div.result")[:10]: anchor = result.select_one("a.result__a") snippet_el = result.select_one("a.result__snippet") or result.select_one("div.result__snippet") if not anchor or not anchor.get("href"): continue results.append({ "url": anchor.get("href"), "title": anchor.get_text(strip=True), "snippet": snippet_el.get_text(" ", strip=True) if snippet_el else "", "highlights": [], }) return results except (httpx.HTTPError, httpx.TimeoutException): if attempt == retries: return [] await asyncio.sleep(0.6 * (attempt + 1)) return [] if client is not None: return await _request(client) async with httpx.AsyncClient(timeout=httpx.Timeout(12.0), follow_redirects=True) as owned_client: return await _request(owned_client) async def _scrape_urls_fallback( self, urls: List[str], timeout_seconds: float = 15.0, max_concurrency: int = 5, ) -> List[Dict[str, Any]]: """Basic async HTTP scrape when Exa is unavailable.""" headers = {"User-Agent": "Mozilla/5.0 ALwrityBacklinkBot/1.0"} semaphore = asyncio.Semaphore(max(1, max_concurrency)) timeout = httpx.Timeout(timeout_seconds) async def scrape_one(client: httpx.AsyncClient, url: str) -> Optional[Dict[str, Any]]: async with semaphore: try: resp = await client.get(url, headers=headers) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") for tag in soup(["script", "style", "nav", "footer", "header"]): tag.decompose() text = soup.get_text(separator=" ", strip=True) title = soup.title.get_text(strip=True) if soup.title else "" return {"url": url, "title": title, "text": text[:5000], "highlights": [], "summary": ""} except (httpx.HTTPError, httpx.TimeoutException): return None async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client: tasks = [scrape_one(client, url) for url in urls] scraped = await asyncio.gather(*tasks) return [row for row in scraped if row] # -- Enrichment Pipeline -- def _build_enriched_opportunities( self, dedup: Dict[str, Dict[str, Any]], keyword: str, source: str ) -> List[Dict[str, Any]]: opportunities = [] for norm_url, row in dedup.items(): text = row.get("text", "") title = row.get("title", row.get("snippet", "")) quality = self._score_quality(text, title) contacts = self._extract_contacts(text) domain = self._extract_domain(norm_url) has_guidelines = self._check_guest_post_signals(text) opportunities.append({ "url": norm_url, "domain": domain, "page_title": title, "snippet": row.get("snippet") or (text[:300] if text else ""), "full_text": text[:5000], "email": contacts.get("email"), "contact_page": contacts.get("contact_page"), "confidence_score": min(1.0, quality + 0.1), "quality_score": quality, "word_count": len(text.split()), "has_guest_post_guidelines": has_guidelines, "discovery_source": source, }) opportunities.sort(key=lambda x: x["quality_score"], reverse=True) return opportunities def _extract_domain(self, url: str) -> str: try: return urlparse(url).netloc except Exception: return url def _normalize_url(self, url: str) -> str: u = (url or "").strip().strip("`") if not u: return "" if u.startswith("//"): u = f"https:{u}" if not re.match(r"^https?://", u): return "" return u.split("#")[0].rstrip("/") def _extract_contacts(self, text: str) -> Dict[str, Optional[str]]: result: Dict[str, Optional[str]] = {"email": None, "contact_page": None} if not text: return result email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text) if email_match: result["email"] = email_match.group(0) contact_match = re.search( r"(https?://[^\s\"'<>]*(?:contact|about|team|write-for-us|guest-post)[^\s\"'<>]*)", text, re.IGNORECASE, ) if contact_match: result["contact_page"] = contact_match.group(1).rstrip("/") return result def _score_quality(self, text: str, title: str) -> float: score = 0.3 words = text.split() wc = len(words) if wc > 2000: score += 0.3 elif wc > 800: score += 0.2 elif wc > 200: score += 0.1 hay = f"{title} {text[:2000]}".lower() cues_found = sum(1 for cue in self.GUEST_POST_KEYWORDS if cue in hay) score += min(0.3, cues_found * 0.06) spam_signals = [ r"buy\s+links?" in hay, r"cheap\s+backlinks?" in hay, r"pbn" in hay, r"private\s+blog\s+network" in hay, ] if any(spam_signals): score -= 0.3 return max(0.0, min(1.0, score)) def _check_guest_post_signals(self, text: str) -> bool: if not text: return False hay = text.lower() guidelines = [ "guest post guidelines", "submission guidelines", "write for us", "guest post", "submit a guest post", "guest contributor guidelines", "contributor guidelines", ] return any(g in hay for g in guidelines) def _generate_search_queries(self, keyword: str) -> List[str]: kw = (keyword or "").strip() if not kw: return [] return [ f"{kw} write for us", f"{kw} guest post", f"{kw} submit guest post", f"{kw} guest contributor", f"{kw} become a guest blogger", f"{kw} add guest post", f"{kw} guest post opportunities", f"{kw} submit article", ]