feat: Sprint 1 - Deep discovery, lead persistence, and dashboard nav
- Add BacklinkOutreachScraper (Exa + DuckDuckGo deep scraping) - Extend DB and Pydantic models for lead enrichment columns - Add StorageService methods for lead CRUD with auto-migration - Add backend endpoints: deep discover, campaign detail, lead management - Extend frontend API client and store with discovery + lead actions - Create BacklinkOutreachDashboard component with campaigns/discover/leads tabs - Register route at /backlink-outreach under SEO feature flag - Add nav entry under Enterprise & Advanced in tool categories
This commit is contained in:
406
backend/services/backlink_outreach_scraper.py
Normal file
406
backend/services/backlink_outreach_scraper.py
Normal file
@@ -0,0 +1,406 @@
|
||||
"""Deep website scraper for backlink outreach discovery.
|
||||
|
||||
Orchestrates Exa neural search + DuckDuckGo fallback to find guest-post
|
||||
opportunities with full-page content extraction and quality scoring.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class BacklinkOutreachScraper:
    """Scrapes websites for backlink outreach opportunities using Exa + DuckDuckGo.

    ``deep_discover`` is the main entry point: it expands a keyword into
    guest-post search queries, collects de-duplicated result pages and scores
    each page for outreach quality. ``scrape_urls`` fetches page content for
    an explicit list of URLs.
    """

    # Phrases that signal a page is soliciting guest contributions.
    GUEST_POST_KEYWORDS = [
        "write for us", "guest post", "submit guest post",
        "guest contributor", "become a guest blogger", "guest bloggers wanted",
        "add guest post", "submit article", "guest post opportunities",
        "contribute to our blog", "write for our blog",
    ]

    # Only the first few generated queries are executed, to bound API/HTTP calls.
    MAX_QUERIES = 4
    # The plain-HTTP fallback fetches pages one at a time, so its workload is capped.
    FALLBACK_SCRAPE_LIMIT = 5

    _HTTP_HEADERS = {"User-Agent": "Mozilla/5.0 ALwrityBacklinkBot/1.0"}
    _EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
    _CONTACT_URL_RE = re.compile(
        r"(https?://[^\s\"'<>]*(?:contact|about|team|write-for-us|guest-post)[^\s\"'<>]*)",
        re.IGNORECASE,
    )
    # Applied to lower-cased text; any match marks the page as link-selling spam.
    _SPAM_RE = re.compile(
        r"buy\s+links?|cheap\s+backlinks?|\bpbn\b|private\s+blog\s+network"
    )

    def __init__(self, user_id: Optional[str] = None):
        """Create a scraper.

        Args:
            user_id: Optional user identifier, used when recording Exa usage.
        """
        self.user_id = user_id
        self._exa_svc = None  # Lazily created ExaService (see _get_exa_sdk).

    # -- Public API --

    async def deep_discover(
        self, keyword: str, max_results: int = 15
    ) -> Dict[str, Any]:
        """Discover guest-post opportunities using Exa, falling back to DuckDuckGo.

        Args:
            keyword: Topic to search guest-post opportunities for.
            max_results: Upper bound on the number of opportunities returned.

        Returns:
            A dict with ``keyword``, ``source`` ("exa" or "duckduckgo"),
            ``total_found`` and ``opportunities`` (sorted by quality score,
            best first).
        """
        if self._is_exa_available():
            logger.info(f"[BacklinkScraper] Using Exa for keyword: {keyword}")
            return await self._discover_with_exa(keyword, max_results)
        logger.info(f"[BacklinkScraper] Exa unavailable, falling back to DuckDuckGo for: {keyword}")
        return await self._discover_with_duckduckgo(keyword, max_results)

    def scrape_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Fetch full page content for a list of URLs using Exa get_contents.

        Falls back to a plain HTTP scrape when Exa is unavailable or the Exa
        call fails. This method is synchronous and performs blocking network
        I/O; async callers should run it in an executor.

        Args:
            urls: URLs to fetch.

        Returns:
            One dict per fetched page with ``url``, ``title``, ``text``,
            ``highlights`` and ``summary`` keys. Pages that could not be
            fetched are omitted.
        """
        if not urls:
            return []
        exa = self._get_exa_sdk()
        if not exa:
            return self._scrape_urls_fallback(urls)
        try:
            result = exa.get_contents(urls, text={"max_characters": 5000})
            return self._parse_get_contents_result(result)
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Exa get_contents failed: {e}")
            return self._scrape_urls_fallback(urls)

    # -- Availability --

    def _is_exa_available(self) -> bool:
        """Return True when an initialised Exa SDK client can be obtained."""
        return self._get_exa_sdk() is not None

    def _get_exa_sdk(self):
        """Get the Exa SDK instance via ExaService, or None when unavailable.

        Never raises: an import or initialisation failure is logged and
        reported as "unavailable", so the DuckDuckGo/HTTP fallback paths keep
        working when Exa cannot be loaded.

        NOTE(review): ExaService is constructed without ``self.user_id``, so
        any per-user API key handling must happen inside ExaService - confirm.
        """
        try:
            if self._exa_svc is None:
                from services.research.exa_service import ExaService

                svc = ExaService()
                svc._try_initialize()
                # Cache only after successful initialisation so a transient
                # failure is retried on the next call.
                self._exa_svc = svc
            return self._exa_svc.exa if self._exa_svc.enabled else None
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Exa SDK unavailable: {e}")
            return None

    # -- Preflight & Usage Tracking --

    def _preflight_subscription_check(self, user_id: str) -> bool:
        """Check Exa usage limits. Returns True if allowed.

        Fails open: a missing user id, a missing DB session, or any error
        during the check all result in True.

        NOTE(review): nothing in this class calls this method; presumably the
        API layer invokes it before discovery - confirm.
        """
        if not user_id:
            return True
        try:
            from services.database import get_session_for_user
            from services.subscription import PricingService
            from models.subscription_models import APIProvider

            db = get_session_for_user(user_id)
            if not db:
                return True
            try:
                pricing = PricingService(db)
                allowed, _, _ = pricing.check_usage_limits(
                    user_id=user_id, provider=APIProvider.EXA, tokens_requested=0,
                )
                return allowed
            finally:
                db.close()
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Preflight check failed: {e}")
            return True

    def _track_exa_usage(self, user_id: str, cost: float = 0.005):
        """Record one Exa call and its cost for the user's current billing period.

        Best effort: a missing user id or DB session is a no-op, and any error
        is logged rather than raised. The statement is an UPDATE only, so it
        changes nothing when no matching ``usage_summaries`` row exists.

        Args:
            user_id: User whose usage summary is updated.
            cost: Cost added to ``exa_cost`` and ``total_cost``.
        """
        if not user_id:
            return
        try:
            from services.database import get_session_for_user
            from services.subscription import PricingService
            from sqlalchemy import text as sql_text

            db = get_session_for_user(user_id)
            if not db:
                return
            try:
                pricing = PricingService(db)
                period = pricing.get_current_billing_period(user_id)
                db.execute(sql_text("""
                    UPDATE usage_summaries
                    SET exa_calls = COALESCE(exa_calls, 0) + 1,
                        exa_cost = COALESCE(exa_cost, 0) + :cost,
                        total_calls = total_calls + 1,
                        total_cost = total_cost + :cost
                    WHERE user_id = :user_id AND billing_period = :period
                """), {"cost": cost, "user_id": user_id, "period": period})
                db.commit()
            finally:
                db.close()
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Usage tracking failed: {e}")

    # -- Shared helpers --

    @staticmethod
    def _build_result(
        keyword: str, source: str, opportunities: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Assemble the response envelope returned by the discovery methods."""
        return {
            "keyword": keyword,
            "source": source,
            "total_found": len(opportunities),
            "opportunities": opportunities,
        }

    def _merge_rows(
        self, dedup: Dict[str, Dict[str, Any]], rows: List[Dict[str, Any]], limit: int
    ) -> None:
        """Add rows to ``dedup`` keyed by normalised URL, stopping at ``limit`` entries.

        Rows with an unusable URL, or a URL already present, are skipped.
        """
        for row in rows:
            if len(dedup) >= limit:
                return
            norm_url = self._normalize_url(row.get("url", ""))
            if norm_url and norm_url not in dedup:
                dedup[norm_url] = row

    # -- Exa Discovery --

    async def _discover_with_exa(self, keyword: str, max_results: int) -> Dict[str, Any]:
        """Run the guest-post queries through Exa and score the results.

        Falls back to DuckDuckGo when no Exa client is available. An empty
        keyword or a non-positive ``max_results`` yields an empty result
        without any network call.
        """
        queries = self._generate_search_queries(keyword)[: self.MAX_QUERIES]
        if not queries or max_results <= 0:
            return self._build_result(keyword, "exa", [])

        exa = self._get_exa_sdk()
        if not exa:
            return await self._discover_with_duckduckgo(keyword, max_results)

        # Ceiling division over the queries actually executed, so the
        # combined result set can reach max_results.
        results_per_query = max(1, -(-max_results // len(queries)))
        dedup: Dict[str, Dict[str, Any]] = {}

        for query in queries:
            rows = await self._exa_search_and_contents(exa, query, results_per_query)
            self._merge_rows(dedup, rows, max_results)
            if len(dedup) >= max_results:
                break  # Enough leads; skip the remaining (billable) searches.

        opportunities = self._build_enriched_opportunities(dedup, keyword, "exa")
        self._track_exa_usage(self.user_id)
        return self._build_result(keyword, "exa", opportunities)

    async def _exa_search_and_contents(
        self, exa, query: str, num_results: int
    ) -> List[Dict[str, Any]]:
        """Run Exa search_and_contents in executor to avoid blocking.

        Returns an empty list when the Exa call fails (the error is logged).
        """
        loop = asyncio.get_running_loop()
        try:
            result = await loop.run_in_executor(
                None,
                lambda: exa.search_and_contents(
                    query,
                    type="auto",
                    num_results=num_results,
                    text={"max_characters": 3000},
                    highlights={"num_sentences": 3, "highlights_per_url": 3},
                ),
            )
            return self._parse_search_and_contents_result(result)
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Exa search_and_contents failed: {e}")
            return []

    @staticmethod
    def _result_to_row(item, include_search_meta: bool) -> Dict[str, Any]:
        """Convert one Exa result object into a plain dict.

        Missing or ``None`` attributes are normalised to empty values so that
        downstream scoring can rely on strings and lists.
        """
        row: Dict[str, Any] = {
            "url": getattr(item, "url", "") or "",
            "title": getattr(item, "title", "") or "",
            "text": getattr(item, "text", "") or "",
            "highlights": getattr(item, "highlights", None) or [],
            "summary": getattr(item, "summary", "") or "",
        }
        if include_search_meta:
            score = getattr(item, "score", None)
            row["score"] = 0.5 if score is None else score
            # Accept both the snake_case and camelCase attribute spellings.
            published = getattr(item, "published_date", None)
            if published is None:
                published = getattr(item, "publishedDate", None)
            row["published_date"] = published
        return row

    def _parse_search_and_contents_result(self, result) -> List[Dict[str, Any]]:
        """Flatten an Exa search_and_contents response into row dicts."""
        results = getattr(result, "results", None) or []
        return [self._result_to_row(r, include_search_meta=True) for r in results]

    def _parse_get_contents_result(self, result) -> List[Dict[str, Any]]:
        """Flatten an Exa get_contents response into row dicts."""
        results = getattr(result, "results", None) or []
        return [self._result_to_row(r, include_search_meta=False) for r in results]

    # -- DuckDuckGo Fallback Discovery --

    async def _discover_with_duckduckgo(self, keyword: str, max_results: int) -> Dict[str, Any]:
        """Discover opportunities via DuckDuckGo HTML search plus page scraping.

        Blocking HTTP work is pushed to the default executor so the event
        loop stays responsive. An empty keyword or a non-positive
        ``max_results`` yields an empty result without any network call.
        """
        queries = self._generate_search_queries(keyword)[: self.MAX_QUERIES]
        if not queries or max_results <= 0:
            return self._build_result(keyword, "duckduckgo", [])

        loop = asyncio.get_running_loop()
        dedup: Dict[str, Dict[str, Any]] = {}

        for index, query in enumerate(queries):
            if index:
                # Be polite to DuckDuckGo between consecutive searches.
                await asyncio.sleep(0.4)
            rows = await loop.run_in_executor(None, self._duckduckgo_search, query)
            self._merge_rows(dedup, rows, max_results)
            if len(dedup) >= max_results:
                break

        # Scrape discovered URLs with Exa get_contents (or fallback)
        scraped = await loop.run_in_executor(None, self.scrape_urls, list(dedup))
        scraped_map = {self._normalize_url(s.get("url", "")): s for s in scraped}

        # Merge DDG results with scraped content; the search snippet and
        # title fill in whatever the scrape left empty.
        merged: Dict[str, Dict[str, Any]] = {}
        for norm_url, ddg_row in dedup.items():
            full = scraped_map.get(norm_url, {})
            snippet = ddg_row.get("snippet", "")
            merged[norm_url] = {
                "url": norm_url,
                "title": full.get("title") or ddg_row.get("title", ""),
                "text": full.get("text") or "",
                "highlights": full.get("highlights") or ddg_row.get("highlights") or [],
                "summary": full.get("summary") or snippet,
                "snippet": snippet,
                "score": 0.5,
            }

        opportunities = self._build_enriched_opportunities(merged, keyword, "duckduckgo")
        return self._build_result(keyword, "duckduckgo", opportunities)

    @staticmethod
    def _unwrap_ddg_redirect(href: str) -> str:
        """Return the real target of a DuckDuckGo ``/l/?uddg=...`` redirect link.

        Hrefs that are not DuckDuckGo redirects are returned unchanged.
        """
        from urllib.parse import parse_qs

        candidate = f"https:{href}" if href.startswith("//") else href
        try:
            parsed = urlparse(candidate)
        except ValueError:
            return href
        host = (parsed.hostname or "").lower()
        is_ddg = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
        if is_ddg and parsed.path.startswith("/l/"):
            targets = parse_qs(parsed.query).get("uddg")
            if targets and targets[0]:
                return targets[0]
        return href

    def _duckduckgo_search(self, query: str, retries: int = 2) -> List[Dict[str, Any]]:
        """Query DuckDuckGo's HTML endpoint and parse up to 10 results.

        Synchronous and blocking; retried with a linear back-off.

        Args:
            query: Search query text.
            retries: Number of additional attempts after the first failure.

        Returns:
            Dicts with ``url``, ``title``, ``snippet`` and ``highlights``
            keys, or an empty list when every attempt fails.
        """
        for attempt in range(retries + 1):
            try:
                resp = requests.get(
                    "https://duckduckgo.com/html/",
                    params={"q": query},
                    headers=self._HTTP_HEADERS,
                    timeout=12,
                )
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")
                results = []
                for result in soup.select("div.result")[:10]:
                    anchor = result.select_one("a.result__a")
                    if not anchor or not anchor.get("href"):
                        continue
                    snippet_el = (
                        result.select_one("a.result__snippet")
                        or result.select_one("div.result__snippet")
                    )
                    results.append({
                        # Result links are redirect URLs; store the real target.
                        "url": self._unwrap_ddg_redirect(anchor.get("href")),
                        "title": anchor.get_text(strip=True),
                        "snippet": snippet_el.get_text(" ", strip=True) if snippet_el else "",
                        "highlights": [],
                    })
                return results
            except Exception as e:
                if attempt == retries:
                    logger.warning(f"[BacklinkScraper] DuckDuckGo search failed: {e}")
                    return []
                time.sleep(0.6 * (attempt + 1))
        return []

    def _scrape_urls_fallback(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Basic HTTP scrape when Exa is unavailable.

        Only the first ``FALLBACK_SCRAPE_LIMIT`` URLs are fetched. Pages that
        fail to load are skipped (and logged at debug level).
        """
        results = []
        for url in urls[: self.FALLBACK_SCRAPE_LIMIT]:
            try:
                resp = requests.get(url, headers=self._HTTP_HEADERS, timeout=15)
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")
                # Drop boilerplate so the text reflects the page's main content.
                for tag in soup(["script", "style", "nav", "footer", "header"]):
                    tag.decompose()
                text = soup.get_text(separator=" ", strip=True)
                title = soup.title.get_text(strip=True) if soup.title else ""
                results.append({
                    "url": url,
                    "title": title,
                    "text": text[:5000],
                    "highlights": [],
                    "summary": "",
                })
            except Exception as e:
                logger.debug(f"[BacklinkScraper] Fallback scrape failed for {url}: {e}")
                continue
        return results

    # -- Enrichment Pipeline --

    def _build_enriched_opportunities(
        self, dedup: Dict[str, Dict[str, Any]], keyword: str, source: str
    ) -> List[Dict[str, Any]]:
        """Turn raw rows into scored opportunity dicts, best quality first.

        Args:
            dedup: Raw rows keyed by normalised URL.
            keyword: Search keyword (currently unused by the scoring).
            source: Value stored as ``discovery_source`` on every opportunity.
        """
        opportunities = []
        for norm_url, row in dedup.items():
            # Upstream values may be None; normalise before any string work.
            text = row.get("text") or ""
            title = row.get("title") or row.get("snippet") or ""
            quality = self._score_quality(text, title)
            contacts = self._extract_contacts(text)

            opportunities.append({
                "url": norm_url,
                "domain": self._extract_domain(norm_url),
                "page_title": title,
                "snippet": row.get("snippet") or (text[:300] if text else ""),
                "full_text": text[:5000],
                "email": contacts.get("email"),
                "contact_page": contacts.get("contact_page"),
                "confidence_score": min(1.0, quality + 0.1),
                "quality_score": quality,
                "word_count": len(text.split()),
                "has_guest_post_guidelines": self._check_guest_post_signals(text),
                "discovery_source": source,
            })
        opportunities.sort(key=lambda x: x["quality_score"], reverse=True)
        return opportunities

    def _extract_domain(self, url: str) -> str:
        """Return the network location (host[:port]) of ``url``, or ``url`` if unparsable."""
        try:
            return urlparse(url).netloc
        except Exception:
            return url

    def _normalize_url(self, url: str) -> str:
        """Canonicalise a URL for de-duplication.

        Strips whitespace and backticks, upgrades protocol-relative URLs to
        https, and drops the fragment and trailing slashes. Returns "" for
        empty input or anything that is not http(s).
        """
        u = (url or "").strip().strip("`")
        if not u:
            return ""
        if u.startswith("//"):
            u = f"https:{u}"
        if not re.match(r"^https?://", u):
            return ""
        return u.split("#")[0].rstrip("/")

    def _extract_contacts(self, text: str) -> Dict[str, Optional[str]]:
        """Find the first email address and contact-like URL in ``text``.

        Returns:
            A dict with ``email`` and ``contact_page`` keys; each is None
            when nothing was found.
        """
        result: Dict[str, Optional[str]] = {"email": None, "contact_page": None}
        if not text:
            return result
        email_match = self._EMAIL_RE.search(text)
        if email_match:
            result["email"] = email_match.group(0)
        contact_match = self._CONTACT_URL_RE.search(text)
        if contact_match:
            # URLs lifted from prose often carry trailing sentence punctuation.
            result["contact_page"] = contact_match.group(1).rstrip(".,;:)]}").rstrip("/")
        return result

    def _score_quality(self, text: str, title: str) -> float:
        """Score a page from 0.0 to 1.0 as an outreach target.

        Starts at 0.3, adds up to 0.3 for content length, up to 0.3 for
        guest-post cues in the title and first 2000 characters, and subtracts
        0.3 when link-selling spam phrases appear in that same window.
        """
        text = text or ""
        title = title or ""
        score = 0.3
        wc = len(text.split())
        if wc > 2000:
            score += 0.3
        elif wc > 800:
            score += 0.2
        elif wc > 200:
            score += 0.1
        hay = f"{title} {text[:2000]}".lower()
        cues_found = sum(1 for cue in self.GUEST_POST_KEYWORDS if cue in hay)
        score += min(0.3, cues_found * 0.06)
        # Spam phrases are regular expressions, so they must be searched,
        # not substring-tested.
        if self._SPAM_RE.search(hay):
            score -= 0.3
        return max(0.0, min(1.0, score))

    def _check_guest_post_signals(self, text: str) -> bool:
        """Return True when ``text`` mentions guest-post or submission guidelines."""
        if not text:
            return False
        hay = text.lower()
        guidelines = [
            "guest post guidelines", "submission guidelines",
            "write for us", "guest post", "submit a guest post",
            "guest contributor guidelines", "contributor guidelines",
        ]
        return any(g in hay for g in guidelines)

    def _generate_search_queries(self, keyword: str) -> List[str]:
        """Expand ``keyword`` into guest-post search queries (empty list for a blank keyword)."""
        kw = (keyword or "").strip()
        if not kw:
            return []
        return [
            f"{kw} write for us",
            f"{kw} guest post",
            f"{kw} submit guest post",
            f"{kw} guest contributor",
            f"{kw} become a guest blogger",
            f"{kw} add guest post",
            f"{kw} guest post opportunities",
            f"{kw} submit article",
        ]
|
||||
Reference in New Issue
Block a user