463 lines
18 KiB
Python
463 lines
18 KiB
Python
"""Deep website scraper for backlink outreach discovery.
|
|
|
|
Orchestrates Exa neural search + DuckDuckGo fallback to find guest-post
|
|
opportunities with full-page content extraction and quality scoring.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
from urllib.parse import quote, urlparse
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from loguru import logger
|
|
|
|
|
|
class BacklinkOutreachScraper:
    """Scrapes websites for backlink outreach opportunities using Exa + DuckDuckGo.

    Discovery issues a few "<keyword> write for us"-style queries, removes
    duplicate URLs, and enriches every hit with a quality score, contact
    details and guest-post signals. Exa is used when its SDK is available;
    otherwise results come from DuckDuckGo's HTML endpoint and the pages are
    scraped afterwards.
    """

    GUEST_POST_KEYWORDS = [
        "write for us", "guest post", "submit guest post",
        "guest contributor", "become a guest blogger", "guest bloggers wanted",
        "add guest post", "submit article", "guest post opportunities",
        "contribute to our blog", "write for our blog",
    ]

    # Only the first few generated query variations are executed per run.
    MAX_QUERIES_PER_RUN = 4

    # Phrases that mark link-selling pages; matched against lower-cased text.
    _SPAM_PATTERN = re.compile(
        r"buy\s+links?|cheap\s+backlinks?|\bpbns?\b|private\s+blog\s+network"
    )

    def __init__(self, user_id: Optional[str] = None):
        """Store the user whose Exa usage is tracked; the Exa service is created lazily."""
        self.user_id = user_id
        self._exa_svc = None

    # -- Public API --

    async def deep_discover(
        self,
        keyword: str,
        max_results: int = 15,
        scrape_timeout_seconds: float = 15.0,
        scrape_max_concurrency: int = 5,
    ) -> Dict[str, Any]:
        """Discover guest-post opportunities using Exa, falling back to DuckDuckGo.

        Args:
            keyword: Topic the search queries are built from.
            max_results: Upper bound on the number of opportunities returned.
            scrape_timeout_seconds: Per-request timeout for the HTTP scrape fallback.
            scrape_max_concurrency: Maximum simultaneous requests in the HTTP
                scrape fallback.

        Returns:
            A dict with the keys ``keyword``, ``source``, ``total_found`` and
            ``opportunities`` (sorted by descending quality score).
        """
        if self._is_exa_available():
            logger.info(f"[BacklinkScraper] Using Exa for keyword: {keyword}")
            return await self._discover_with_exa(
                keyword,
                max_results,
                scrape_timeout_seconds=scrape_timeout_seconds,
                scrape_max_concurrency=scrape_max_concurrency,
            )
        logger.info(f"[BacklinkScraper] Exa unavailable, falling back to DuckDuckGo for: {keyword}")
        return await self._discover_with_duckduckgo(
            keyword,
            max_results,
            scrape_timeout_seconds=scrape_timeout_seconds,
            scrape_max_concurrency=scrape_max_concurrency,
        )

    async def scrape_urls(
        self,
        urls: List[str],
        timeout_seconds: float = 15.0,
        max_concurrency: int = 5,
    ) -> List[Dict[str, Any]]:
        """Fetch full page content with non-blocking fallbacks and bounded concurrency.

        Exa ``get_contents`` is tried first (in the default executor so the
        event loop is not blocked). If the SDK is unavailable or the call
        fails, pages are fetched directly over HTTP.

        Returns:
            One dict per successfully fetched page with the keys ``url``,
            ``title``, ``text``, ``highlights`` and ``summary``.
        """
        if not urls:
            return []

        # An import or initialisation failure in the Exa service must not
        # abort scraping; the plain HTTP path below is the safety net.
        try:
            exa = self._get_exa_sdk()
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Exa SDK unavailable for scraping: {e}")
            exa = None

        if not exa:
            return await self._scrape_urls_fallback(
                urls, timeout_seconds=timeout_seconds, max_concurrency=max_concurrency
            )

        loop = asyncio.get_running_loop()
        try:
            result = await loop.run_in_executor(
                None, lambda: exa.get_contents(urls, text={"max_characters": 5000})
            )
            return self._parse_get_contents_result(result)
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Exa get_contents failed: {e}")
            return await self._scrape_urls_fallback(
                urls, timeout_seconds=timeout_seconds, max_concurrency=max_concurrency
            )

    # -- Availability --

    def _is_exa_available(self) -> bool:
        """Return True when an Exa SDK instance can be obtained without error."""
        try:
            return self._get_exa_sdk() is not None
        except Exception:
            return False

    def _get_exa_sdk(self):
        """Return the Exa SDK instance from ExaService, or None when it is disabled.

        The service is imported and initialised on first use and cached on the
        instance afterwards. Import or initialisation errors propagate to the
        caller.
        """
        if self._exa_svc is None:
            from services.research.exa_service import ExaService

            self._exa_svc = ExaService()
            self._exa_svc._try_initialize()
        return self._exa_svc.exa if self._exa_svc.enabled else None

    # -- Preflight & Usage Tracking --

    def _preflight_subscription_check(self, user_id: str) -> bool:
        """Check Exa usage limits. Returns True if allowed.

        The check fails open: a missing user id, a missing DB session, or any
        error while checking all return True. Nothing in this class calls this
        method; callers are expected to invoke it themselves if they want the
        limit enforced.
        """
        if not user_id:
            return True
        try:
            from services.database import get_session_for_user
            from services.subscription import PricingService
            from models.subscription_models import APIProvider

            db = get_session_for_user(user_id)
            if not db:
                return True
            try:
                pricing = PricingService(db)
                allowed, _, _ = pricing.check_usage_limits(
                    user_id=user_id, provider=APIProvider.EXA, tokens_requested=0,
                )
                return allowed
            finally:
                db.close()
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Preflight check failed: {e}")
            return True

    def _track_exa_usage(self, user_id: str, cost: float = 0.005):
        """Record Exa usage after successful search.

        Increments the Exa and total call/cost counters of the user's current
        billing period. Best effort: failures are logged and never raised.
        NOTE(review): this is a synchronous DB call and is invoked from async
        code in this class - confirm that blocking the loop here is acceptable.
        """
        if not user_id:
            return
        try:
            from services.database import get_session_for_user
            from services.subscription import PricingService
            from sqlalchemy import text as sql_text

            db = get_session_for_user(user_id)
            if not db:
                return
            try:
                pricing = PricingService(db)
                period = pricing.get_current_billing_period(user_id)
                outcome = db.execute(sql_text("""
                    UPDATE usage_summaries
                    SET exa_calls = COALESCE(exa_calls, 0) + 1,
                        exa_cost = COALESCE(exa_cost, 0) + :cost,
                        total_calls = total_calls + 1,
                        total_cost = total_cost + :cost
                    WHERE user_id = :user_id AND billing_period = :period
                """), {"cost": cost, "user_id": user_id, "period": period})
                db.commit()
                # An UPDATE that matches nothing would otherwise vanish silently.
                if getattr(outcome, "rowcount", None) == 0:
                    logger.debug(
                        f"[BacklinkScraper] No usage summary row updated for period: {period}"
                    )
            finally:
                db.close()
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Usage tracking failed: {e}")

    # -- Exa Discovery --

    async def _discover_with_exa(
        self,
        keyword: str,
        max_results: int,
        scrape_timeout_seconds: float = 15.0,
        scrape_max_concurrency: int = 5,
    ) -> Dict[str, Any]:
        """Run the Exa discovery pipeline for ``keyword``.

        Falls back to DuckDuckGo (forwarding the scrape settings) when the SDK
        is not available. Never returns more than ``max_results`` entries and
        stops issuing queries as soon as that many unique URLs are collected.
        """
        exa = self._get_exa_sdk()
        if not exa:
            return await self._discover_with_duckduckgo(
                keyword,
                max_results,
                scrape_timeout_seconds=scrape_timeout_seconds,
                scrape_max_concurrency=scrape_max_concurrency,
            )

        queries = self._generate_search_queries(keyword)[: self.MAX_QUERIES_PER_RUN]
        if not queries or max_results <= 0:
            # Blank keyword or nothing requested: no Exa call is made, so no
            # usage is recorded either.
            return {"keyword": keyword, "source": "exa", "total_found": 0, "opportunities": []}

        # Ceiling division over the queries that are actually executed, so
        # the run can reach max_results.
        results_per_query = max(1, (max_results + len(queries) - 1) // len(queries))

        dedup: Dict[str, Dict[str, Any]] = {}
        for query in queries:
            if len(dedup) >= max_results:
                break
            rows = await self._exa_search_and_contents(exa, query, results_per_query)
            self._collect_unique(dedup, rows, max_results)

        opportunities = self._build_enriched_opportunities(dedup, keyword, "exa")
        self._track_exa_usage(self.user_id)

        return {
            "keyword": keyword,
            "source": "exa",
            "total_found": len(opportunities),
            "opportunities": opportunities,
        }

    def _collect_unique(
        self, dedup: Dict[str, Dict[str, Any]], rows: List[Dict[str, Any]], limit: int
    ) -> None:
        """Add rows to ``dedup`` keyed by normalized URL until ``limit`` entries exist."""
        for row in rows:
            if len(dedup) >= limit:
                break
            norm_url = self._normalize_url(row.get("url", ""))
            if not norm_url or norm_url in dedup:
                continue
            dedup[norm_url] = row

    async def _exa_search_and_contents(
        self, exa, query: str, num_results: int
    ) -> List[Dict[str, Any]]:
        """Run Exa search_and_contents in executor to avoid blocking.

        Returns an empty list (after logging a warning) when the call fails.
        """
        loop = asyncio.get_running_loop()
        try:
            result = await loop.run_in_executor(
                None,
                lambda: exa.search_and_contents(
                    query,
                    type="auto",
                    num_results=num_results,
                    text={"max_characters": 3000},
                    highlights={"num_sentences": 3, "highlights_per_url": 3},
                ),
            )
            return self._parse_search_and_contents_result(result)
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Exa search_and_contents failed: {e}")
            return []

    @staticmethod
    def _content_fields(item) -> Dict[str, Any]:
        """Read the shared content attributes of a result object, mapping None to empty values."""
        return {
            "url": getattr(item, "url", None) or "",
            "title": getattr(item, "title", None) or "",
            "text": getattr(item, "text", None) or "",
            "highlights": getattr(item, "highlights", None) or [],
            "summary": getattr(item, "summary", None) or "",
        }

    def _parse_search_and_contents_result(self, result) -> List[Dict[str, Any]]:
        """Convert a search_and_contents response object into plain row dicts."""
        rows = []
        for item in getattr(result, "results", None) or []:
            row = self._content_fields(item)
            score = getattr(item, "score", None)
            row["score"] = 0.5 if score is None else score
            # Accept both attribute spellings of the publication date.
            row["published_date"] = (
                getattr(item, "published_date", None) or getattr(item, "publishedDate", None)
            )
            rows.append(row)
        return rows

    def _parse_get_contents_result(self, result) -> List[Dict[str, Any]]:
        """Convert a get_contents response object into plain row dicts."""
        return [self._content_fields(item) for item in getattr(result, "results", None) or []]

    # -- DuckDuckGo Fallback Discovery --

    async def _discover_with_duckduckgo(
        self,
        keyword: str,
        max_results: int,
        scrape_timeout_seconds: float = 15.0,
        scrape_max_concurrency: int = 5,
    ) -> Dict[str, Any]:
        """Discover opportunities via DuckDuckGo HTML search, then scrape each hit.

        Never returns more than ``max_results`` entries.
        """
        queries = self._generate_search_queries(keyword)[: self.MAX_QUERIES_PER_RUN]
        dedup: Dict[str, Dict[str, Any]] = {}

        if queries and max_results > 0:
            async with httpx.AsyncClient(timeout=httpx.Timeout(12.0), follow_redirects=True) as client:
                for position, query in enumerate(queries):
                    rows = await self._duckduckgo_search(query, client=client)
                    self._collect_unique(dedup, rows, max_results)
                    if len(dedup) >= max_results:
                        break
                    # Pause between consecutive searches; not needed after the last one.
                    if position < len(queries) - 1:
                        await asyncio.sleep(0.4)

        # Scrape discovered URLs with Exa get_contents (or fallback)
        scraped = await self.scrape_urls(
            list(dedup),
            timeout_seconds=scrape_timeout_seconds,
            max_concurrency=scrape_max_concurrency,
        )
        scraped_map = {self._normalize_url(s.get("url", "")): s for s in scraped}

        # Merge DDG results with scraped content. `or` is used instead of
        # dict.get defaults so that empty scraped fields fall back to the
        # search-result values.
        merged = {}
        for norm_url, ddg_row in dedup.items():
            full = scraped_map.get(norm_url, {})
            snippet = ddg_row.get("snippet") or ""
            merged[norm_url] = {
                "url": norm_url,
                "title": full.get("title") or ddg_row.get("title") or "",
                "text": full.get("text") or "",
                "highlights": full.get("highlights") or ddg_row.get("highlights") or [],
                "summary": full.get("summary") or snippet,
                "snippet": snippet,
                "score": 0.5,
            }

        opportunities = self._build_enriched_opportunities(merged, keyword, "duckduckgo")

        return {
            "keyword": keyword,
            "source": "duckduckgo",
            "total_found": len(opportunities),
            "opportunities": opportunities,
        }

    @staticmethod
    def _unwrap_ddg_redirect(href: str) -> str:
        """Return the target of a DuckDuckGo ``/l/?uddg=<url>`` redirect link.

        Any other href is returned unchanged (stripped of surrounding whitespace).
        """
        from urllib.parse import parse_qs

        candidate = (href or "").strip()
        if not candidate:
            return ""
        probe = f"https:{candidate}" if candidate.startswith("//") else candidate
        try:
            parsed = urlparse(probe)
        except ValueError:
            return candidate
        host = (parsed.hostname or "").lower()
        is_ddg_host = host in ("", "duckduckgo.com") or host.endswith(".duckduckgo.com")
        if is_ddg_host and parsed.path.rstrip("/") == "/l":
            # parse_qs already percent-decodes the embedded target URL.
            targets = parse_qs(parsed.query).get("uddg", [])
            if targets and targets[0]:
                return targets[0]
        return candidate

    async def _duckduckgo_search(
        self,
        query: str,
        retries: int = 2,
        client: "Optional[httpx.AsyncClient]" = None,
    ) -> List[Dict[str, Any]]:
        """Search DuckDuckGo's HTML endpoint and return up to 10 result rows.

        Each row has the keys ``url``, ``title``, ``snippet`` and ``highlights``.
        HTTP errors are retried ``retries`` times with a growing delay; an
        empty list is returned once the attempts are exhausted. A temporary
        client is created when ``client`` is not supplied.
        """
        encoded = quote(query)
        url = f"https://duckduckgo.com/html/?q={encoded}"
        headers = {"User-Agent": "Mozilla/5.0 ALwrityBacklinkBot/1.0"}

        async def _request(active_client: "httpx.AsyncClient") -> List[Dict[str, Any]]:
            for attempt in range(retries + 1):
                try:
                    resp = await active_client.get(url, headers=headers)
                    resp.raise_for_status()
                    soup = BeautifulSoup(resp.text, "html.parser")
                    results = []
                    for result in soup.select("div.result")[:10]:
                        anchor = result.select_one("a.result__a")
                        snippet_el = result.select_one("a.result__snippet") or result.select_one("div.result__snippet")
                        if not anchor or not anchor.get("href"):
                            continue
                        results.append({
                            # Result links may be DDG redirect wrappers; keep the real target.
                            "url": self._unwrap_ddg_redirect(anchor.get("href")),
                            "title": anchor.get_text(strip=True),
                            "snippet": snippet_el.get_text(" ", strip=True) if snippet_el else "",
                            "highlights": [],
                        })
                    return results
                except (httpx.HTTPError, httpx.TimeoutException):
                    if attempt == retries:
                        return []
                    await asyncio.sleep(0.6 * (attempt + 1))
            return []

        if client is not None:
            return await _request(client)

        async with httpx.AsyncClient(timeout=httpx.Timeout(12.0), follow_redirects=True) as owned_client:
            return await _request(owned_client)

    async def _scrape_urls_fallback(
        self,
        urls: List[str],
        timeout_seconds: float = 15.0,
        max_concurrency: int = 5,
    ) -> List[Dict[str, Any]]:
        """Basic async HTTP scrape when Exa is unavailable.

        Pages are fetched concurrently (bounded by ``max_concurrency``), stripped
        of script/style/nav/footer/header elements and truncated to 5000
        characters of text. URLs that fail for any reason are skipped so one
        bad page cannot fail the whole batch.
        """
        if not urls:
            return []

        headers = {"User-Agent": "Mozilla/5.0 ALwrityBacklinkBot/1.0"}
        semaphore = asyncio.Semaphore(max(1, max_concurrency))
        timeout = httpx.Timeout(timeout_seconds)

        async def scrape_one(client: "httpx.AsyncClient", url: str) -> Optional[Dict[str, Any]]:
            async with semaphore:
                try:
                    resp = await client.get(url, headers=headers)
                    resp.raise_for_status()
                    soup = BeautifulSoup(resp.text, "html.parser")
                    for tag in soup(["script", "style", "nav", "footer", "header"]):
                        tag.decompose()
                    text = soup.get_text(separator=" ", strip=True)
                    title = soup.title.get_text(strip=True) if soup.title else ""
                    return {"url": url, "title": title, "text": text[:5000], "highlights": [], "summary": ""}
                except (httpx.HTTPError, httpx.TimeoutException, httpx.InvalidURL):
                    return None
                except Exception as e:
                    # Per-URL best-effort boundary: log and skip this page only.
                    logger.debug(f"[BacklinkScraper] Skipping {url}: {e}")
                    return None

        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
            scraped = await asyncio.gather(*(scrape_one(client, url) for url in urls))
        return [row for row in scraped if row]

    # -- Enrichment Pipeline --

    def _build_enriched_opportunities(
        self, dedup: Dict[str, Dict[str, Any]], keyword: str, source: str
    ) -> List[Dict[str, Any]]:
        """Turn deduplicated rows into scored opportunity dicts, best quality first.

        ``keyword`` is accepted for interface compatibility and is not used in
        the scoring.
        """
        opportunities = []
        for norm_url, row in dedup.items():
            # Upstream sources may deliver None for missing fields.
            text = row.get("text") or ""
            title = row.get("title") or row.get("snippet") or ""
            quality = self._score_quality(text, title)
            contacts = self._extract_contacts(text)

            opportunities.append({
                "url": norm_url,
                "domain": self._extract_domain(norm_url),
                "page_title": title,
                "snippet": row.get("snippet") or (text[:300] if text else ""),
                "full_text": text[:5000],
                "email": contacts.get("email"),
                "contact_page": contacts.get("contact_page"),
                "confidence_score": min(1.0, quality + 0.1),
                "quality_score": quality,
                "word_count": len(text.split()),
                "has_guest_post_guidelines": self._check_guest_post_signals(text),
                "discovery_source": source,
            })
        opportunities.sort(key=lambda x: x["quality_score"], reverse=True)
        return opportunities

    def _extract_domain(self, url: str) -> str:
        """Return the network location of ``url``, or ``url`` itself if it cannot be parsed."""
        try:
            return urlparse(url).netloc
        except (ValueError, TypeError, AttributeError):
            return url

    def _normalize_url(self, url: str) -> str:
        """Return a canonical http(s) URL without fragment or trailing slash.

        Protocol-relative URLs get an ``https:`` scheme. Anything that is not
        an http(s) URL yields an empty string.
        """
        u = (url or "").strip().strip("`")
        if not u:
            return ""
        if u.startswith("//"):
            u = f"https:{u}"
        # URL schemes are case-insensitive.
        if not re.match(r"^https?://", u, re.IGNORECASE):
            return ""
        return u.split("#")[0].rstrip("/")

    def _extract_contacts(self, text: str) -> Dict[str, Optional[str]]:
        """Find the first e-mail address and the first contact-like URL in ``text``."""
        result: Dict[str, Optional[str]] = {"email": None, "contact_page": None}
        if not text:
            return result
        email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
        if email_match:
            result["email"] = email_match.group(0)
        contact_match = re.search(
            r"(https?://[^\s\"'<>]*(?:contact|about|team|write-for-us|guest-post)[^\s\"'<>]*)",
            text, re.IGNORECASE,
        )
        if contact_match:
            result["contact_page"] = contact_match.group(1).rstrip("/")
        return result

    def _score_quality(self, text: str, title: str) -> float:
        """Score a page between 0.0 and 1.0.

        Starts at 0.3, adds up to 0.3 for word count (>200, >800, >2000 words),
        adds 0.06 per guest-post keyword found in the title plus the first 2000
        characters (capped at 0.3), and subtracts 0.3 when link-selling
        phrases appear in that same span.
        """
        text = text or ""
        title = title or ""
        score = 0.3

        wc = len(text.split())
        if wc > 2000:
            score += 0.3
        elif wc > 800:
            score += 0.2
        elif wc > 200:
            score += 0.1

        hay = f"{title} {text[:2000]}".lower()
        cues_found = sum(1 for cue in self.GUEST_POST_KEYWORDS if cue in hay)
        score += min(0.3, cues_found * 0.06)

        # The spam phrases are regular expressions and must be searched, not
        # compared as literal substrings.
        if self._SPAM_PATTERN.search(hay):
            score -= 0.3
        return max(0.0, min(1.0, score))

    def _check_guest_post_signals(self, text: str) -> bool:
        """Return True when ``text`` mentions guest-post or submission guidelines."""
        if not text:
            return False
        hay = text.lower()
        guidelines = [
            "guest post guidelines", "submission guidelines",
            "write for us", "guest post", "submit a guest post",
            "guest contributor guidelines", "contributor guidelines",
        ]
        return any(g in hay for g in guidelines)

    def _generate_search_queries(self, keyword: str) -> List[str]:
        """Build the guest-post search query variations; empty list for a blank keyword."""
        kw = (keyword or "").strip()
        if not kw:
            return []
        return [
            f"{kw} write for us",
            f"{kw} guest post",
            f"{kw} submit guest post",
            f"{kw} guest contributor",
            f"{kw} become a guest blogger",
            f"{kw} add guest post",
            f"{kw} guest post opportunities",
            f"{kw} submit article",
        ]
|