Merge remote-tracking branch 'origin/codex/refactor-backlink-outreach-services-for-async-support'

# Conflicts: # backend/routers/backlink_outreach.py
2026-06-03 18:49:01 +05:30
parent 57e46a20f8 55b7209554
commit 7749b4db0e
3 changed files with 202 additions and 105 deletions
--- a/backend/services/backlink_outreach_scraper.py
+++ b/backend/services/backlink_outreach_scraper.py
@@ -8,11 +8,10 @@ from __future__ import annotations

 import asyncio
 import re
-import time
 from typing import Any, Dict, List, Optional
-from urllib.parse import urlparse
+from urllib.parse import quote, urlparse

-import requests
+import httpx
 from bs4 import BeautifulSoup
 from loguru import logger

@@ -34,26 +33,47 @@ class BacklinkOutreachScraper:
    # -- Public API --

    async def deep_discover(
-        self, keyword: str, max_results: int = 15
+        self,
+        keyword: str,
+        max_results: int = 15,
+        scrape_timeout_seconds: float = 15.0,
+        scrape_max_concurrency: int = 5,
    ) -> Dict[str, Any]:
        """Discover guest-post opportunities using Exa, falling back to DuckDuckGo."""
        if self._is_exa_available():
            logger.info(f"[BacklinkScraper] Using Exa for keyword: {keyword}")
            return await self._discover_with_exa(keyword, max_results)
        logger.info(f"[BacklinkScraper] Exa unavailable, falling back to DuckDuckGo for: {keyword}")
-        return await self._discover_with_duckduckgo(keyword, max_results)
+        return await self._discover_with_duckduckgo(
+            keyword,
+            max_results,
+            scrape_timeout_seconds=scrape_timeout_seconds,
+            scrape_max_concurrency=scrape_max_concurrency,
+        )

-    def scrape_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
-        """Fetch full page content for a list of URLs using Exa get_contents."""
+    async def scrape_urls(
+        self,
+        urls: List[str],
+        timeout_seconds: float = 15.0,
+        max_concurrency: int = 5,
+    ) -> List[Dict[str, Any]]:
+        """Fetch full page content with non-blocking fallbacks and bounded concurrency."""
        exa = self._get_exa_sdk()
        if not exa:
-            return self._scrape_urls_fallback(urls)
+            return await self._scrape_urls_fallback(
+                urls, timeout_seconds=timeout_seconds, max_concurrency=max_concurrency
+            )
+        loop = asyncio.get_running_loop()
        try:
-            result = exa.get_contents(urls, text={"max_characters": 5000})
+            result = await loop.run_in_executor(
+                None, lambda: exa.get_contents(urls, text={"max_characters": 5000})
+            )
            return self._parse_get_contents_result(result)
        except Exception as e:
            logger.warning(f"[BacklinkScraper] Exa get_contents failed: {e}")
-            return self._scrape_urls_fallback(urls)
+            return await self._scrape_urls_fallback(
+                urls, timeout_seconds=timeout_seconds, max_concurrency=max_concurrency
+            )

    # -- Availability --

@@ -207,24 +227,35 @@ class BacklinkOutreachScraper:

    # -- DuckDuckGo Fallback Discovery --

-    async def _discover_with_duckduckgo(self, keyword: str, max_results: int) -> Dict[str, Any]:
+    async def _discover_with_duckduckgo(
+        self,
+        keyword: str,
+        max_results: int,
+        scrape_timeout_seconds: float = 15.0,
+        scrape_max_concurrency: int = 5,
+    ) -> Dict[str, Any]:
        queries = self._generate_search_queries(keyword)
        dedup: Dict[str, Dict[str, Any]] = {}

-        for query in queries[:4]:
-            rows = self._duckduckgo_search(query)
-            for row in rows:
-                norm_url = self._normalize_url(row.get("url", ""))
-                if not norm_url or norm_url in dedup:
-                    continue
-                dedup[norm_url] = row
-            if len(dedup) >= max_results:
-                break
-            time.sleep(0.4)
+        async with httpx.AsyncClient(timeout=httpx.Timeout(12.0), follow_redirects=True) as client:
+            for query in queries[:4]:
+                rows = await self._duckduckgo_search(query, client=client)
+                for row in rows:
+                    norm_url = self._normalize_url(row.get("url", ""))
+                    if not norm_url or norm_url in dedup:
+                        continue
+                    dedup[norm_url] = row
+                if len(dedup) >= max_results:
+                    break
+                await asyncio.sleep(0.4)

        # Scrape discovered URLs with Exa get_contents (or fallback)
        urls_to_scrape = list(dedup.keys())[:max_results]
-        scraped = self.scrape_urls(urls_to_scrape)
+        scraped = await self.scrape_urls(
+            urls_to_scrape,
+            timeout_seconds=scrape_timeout_seconds,
+            max_concurrency=scrape_max_concurrency,
+        )
        scraped_map = {self._normalize_url(s.get("url", "")): s for s in scraped}

        # Merge DDG results with scraped content
@@ -250,51 +281,76 @@ class BacklinkOutreachScraper:
            "opportunities": opportunities,
        }

-    def _duckduckgo_search(self, query: str, retries: int = 2) -> List[Dict[str, Any]]:
-        encoded = requests.utils.quote(query)
+    async def _duckduckgo_search(
+        self,
+        query: str,
+        retries: int = 2,
+        client: Optional[httpx.AsyncClient] = None,
+    ) -> List[Dict[str, Any]]:
+        encoded = quote(query)
        url = f"https://duckduckgo.com/html/?q={encoded}"
        headers = {"User-Agent": "Mozilla/5.0 ALwrityBacklinkBot/1.0"}
-        for attempt in range(retries + 1):
-            try:
-                resp = requests.get(url, headers=headers, timeout=12)
-                resp.raise_for_status()
-                soup = BeautifulSoup(resp.text, "html.parser")
-                results = []
-                for result in soup.select("div.result")[:10]:
-                    anchor = result.select_one("a.result__a")
-                    snippet_el = result.select_one("a.result__snippet") or result.select_one("div.result__snippet")
-                    if not anchor or not anchor.get("href"):
-                        continue
-                    results.append({
-                        "url": anchor.get("href"),
-                        "title": anchor.get_text(strip=True),
-                        "snippet": snippet_el.get_text(" ", strip=True) if snippet_el else "",
-                        "highlights": [],
-                    })
-                return results
-            except Exception:
-                if attempt == retries:
-                    return []
-                time.sleep(0.6 * (attempt + 1))
-        return []

-    def _scrape_urls_fallback(self, urls: List[str]) -> List[Dict[str, Any]]:
-        """Basic HTTP scrape when Exa is unavailable."""
-        results = []
+        async def _request(active_client: httpx.AsyncClient) -> List[Dict[str, Any]]:
+            for attempt in range(retries + 1):
+                try:
+                    resp = await active_client.get(url, headers=headers)
+                    resp.raise_for_status()
+                    soup = BeautifulSoup(resp.text, "html.parser")
+                    results = []
+                    for result in soup.select("div.result")[:10]:
+                        anchor = result.select_one("a.result__a")
+                        snippet_el = result.select_one("a.result__snippet") or result.select_one("div.result__snippet")
+                        if not anchor or not anchor.get("href"):
+                            continue
+                        results.append({
+                            "url": anchor.get("href"),
+                            "title": anchor.get_text(strip=True),
+                            "snippet": snippet_el.get_text(" ", strip=True) if snippet_el else "",
+                            "highlights": [],
+                        })
+                    return results
+                except (httpx.HTTPError, httpx.TimeoutException):
+                    if attempt == retries:
+                        return []
+                    await asyncio.sleep(0.6 * (attempt + 1))
+            return []
+
+        if client is not None:
+            return await _request(client)
+
+        async with httpx.AsyncClient(timeout=httpx.Timeout(12.0), follow_redirects=True) as owned_client:
+            return await _request(owned_client)
+
+    async def _scrape_urls_fallback(
+        self,
+        urls: List[str],
+        timeout_seconds: float = 15.0,
+        max_concurrency: int = 5,
+    ) -> List[Dict[str, Any]]:
+        """Basic async HTTP scrape when Exa is unavailable."""
        headers = {"User-Agent": "Mozilla/5.0 ALwrityBacklinkBot/1.0"}
-        for url in urls[:5]:
-            try:
-                resp = requests.get(url, headers=headers, timeout=15)
-                resp.raise_for_status()
-                soup = BeautifulSoup(resp.text, "html.parser")
-                for tag in soup(["script", "style", "nav", "footer", "header"]):
-                    tag.decompose()
-                text = soup.get_text(separator=" ", strip=True)
-                title = soup.title.get_text(strip=True) if soup.title else ""
-                results.append({"url": url, "title": title, "text": text[:5000], "highlights": [], "summary": ""})
-            except Exception:
-                continue
-        return results
+        semaphore = asyncio.Semaphore(max(1, max_concurrency))
+        timeout = httpx.Timeout(timeout_seconds)
+
+        async def scrape_one(client: httpx.AsyncClient, url: str) -> Optional[Dict[str, Any]]:
+            async with semaphore:
+                try:
+                    resp = await client.get(url, headers=headers)
+                    resp.raise_for_status()
+                    soup = BeautifulSoup(resp.text, "html.parser")
+                    for tag in soup(["script", "style", "nav", "footer", "header"]):
+                        tag.decompose()
+                    text = soup.get_text(separator=" ", strip=True)
+                    title = soup.title.get_text(strip=True) if soup.title else ""
+                    return {"url": url, "title": title, "text": text[:5000], "highlights": [], "summary": ""}
+                except (httpx.HTTPError, httpx.TimeoutException):
+                    return None
+
+        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+            tasks = [scrape_one(client, url) for url in urls]
+            scraped = await asyncio.gather(*tasks)
+        return [row for row in scraped if row]

    # -- Enrichment Pipeline --

--- a/backend/services/backlink_outreach_service.py
+++ b/backend/services/backlink_outreach_service.py
@@ -4,10 +4,11 @@ from __future__ import annotations

 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
+from urllib.parse import quote
+import asyncio
 import re
-import time

-import requests
+import httpx
 from bs4 import BeautifulSoup

 import csv
@@ -55,51 +56,67 @@ class BacklinkOutreachService:
            f"{normalized} + 'Submit article'",
        ]

-    def search_for_urls(self, query: str, timeout_seconds: int = 12, retries: int = 2) -> List[SearchResult]:
-        encoded_query = requests.utils.quote(query)
+    async def search_for_urls(
+        self,
+        query: str,
+        timeout_seconds: int = 12,
+        retries: int = 2,
+        client: Optional[httpx.AsyncClient] = None,
+    ) -> List[SearchResult]:
+        """Search DuckDuckGo HTML using a non-blocking HTTP client."""
+        encoded_query = quote(query)
        url = f"https://duckduckgo.com/html/?q={encoded_query}"
        headers = {"User-Agent": "Mozilla/5.0 ALwrityBacklinkBot/1.0"}

-        for attempt in range(retries + 1):
-            try:
-                response = requests.get(url, headers=headers, timeout=timeout_seconds)
-                response.raise_for_status()
-                soup = BeautifulSoup(response.text, "html.parser")
-                rows: List[SearchResult] = []
-                for result in soup.select("div.result")[:10]:
-                    anchor = result.select_one("a.result__a")
-                    snippet = result.select_one("a.result__snippet") or result.select_one("div.result__snippet")
-                    if not anchor or not anchor.get("href"):
-                        continue
-                    rows.append(
-                        SearchResult(
-                            url=anchor.get("href"),
-                            title=anchor.get_text(strip=True),
-                            snippet=snippet.get_text(" ", strip=True) if snippet else "",
+        async def _request(active_client: httpx.AsyncClient) -> List[SearchResult]:
+            for attempt in range(retries + 1):
+                try:
+                    response = await active_client.get(url, headers=headers)
+                    response.raise_for_status()
+                    soup = BeautifulSoup(response.text, "html.parser")
+                    rows: List[SearchResult] = []
+                    for result in soup.select("div.result")[:10]:
+                        anchor = result.select_one("a.result__a")
+                        snippet = result.select_one("a.result__snippet") or result.select_one("div.result__snippet")
+                        if not anchor or not anchor.get("href"):
+                            continue
+                        rows.append(
+                            SearchResult(
+                                url=anchor.get("href"),
+                                title=anchor.get_text(strip=True),
+                                snippet=snippet.get_text(" ", strip=True) if snippet else "",
+                            )
                        )
-                    )
-                return rows
-            except Exception:
-                if attempt == retries:
-                    return []
-                time.sleep(0.6 * (attempt + 1))
-        return []
+                    return rows
+                except (httpx.HTTPError, httpx.TimeoutException):
+                    if attempt == retries:
+                        return []
+                    await asyncio.sleep(0.6 * (attempt + 1))
+            return []

-    def discover_opportunities(self, keyword: str, max_results: int = 10) -> Dict[str, Any]:
+        if client is not None:
+            return await _request(client)
+
+        timeout = httpx.Timeout(timeout_seconds)
+        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as owned_client:
+            return await _request(owned_client)
+
+    async def discover_opportunities_async(self, keyword: str, max_results: int = 10) -> Dict[str, Any]:
        queries = self.generate_guest_post_queries(keyword)[:4]
        dedup: Dict[str, SearchResult] = {}

-        for query in queries:
-            for result in self.search_for_urls(query):
-                normalized_url = self._normalize_url(result.url)
-                if not normalized_url or normalized_url in dedup:
-                    continue
-                dedup[normalized_url] = result
+        async with httpx.AsyncClient(timeout=httpx.Timeout(12.0), follow_redirects=True) as client:
+            for query in queries:
+                for result in await self.search_for_urls(query, client=client):
+                    normalized_url = self._normalize_url(result.url)
+                    if not normalized_url or normalized_url in dedup:
+                        continue
+                    dedup[normalized_url] = result
+                    if len(dedup) >= max_results:
+                        break
                if len(dedup) >= max_results:
                    break
-            if len(dedup) >= max_results:
-                break
-            time.sleep(0.4)
+                await asyncio.sleep(0.4)

        opportunities: List[OpportunityRecord] = []
        for normalized_url, row in dedup.items():
@@ -118,6 +135,10 @@ class BacklinkOutreachService:

        return {"keyword": keyword, "queries": queries, "opportunities": opportunities}

+    def discover_opportunities(self, keyword: str, max_results: int = 10) -> Dict[str, Any]:
+        """Synchronous compatibility wrapper for non-async callers."""
+        return asyncio.run(self.discover_opportunities_async(keyword, max_results))
+
    def _normalize_url(self, url: str) -> str:
        u = (url or "").strip()
        if not u:
@@ -323,11 +344,23 @@ class BacklinkOutreachService:
            writer.writerows([{k: self._sanitize_csv_value(v) for k, v in row.items()}])
        return output.getvalue()

-    async def deep_discover(self, keyword: str, max_results: int = 15) -> Dict[str, Any]:
+    async def deep_discover(
+        self,
+        keyword: str,
+        max_results: int = 15,
+        user_id: Optional[str] = None,
+        scrape_timeout_seconds: float = 15.0,
+        scrape_max_concurrency: int = 5,
+    ) -> Dict[str, Any]:
        """Enhanced discovery using Exa neural search + DuckDuckGo with full-page scraping."""
        from services.backlink_outreach_scraper import BacklinkOutreachScraper
-        scraper = BacklinkOutreachScraper(user_id=self._user_id if hasattr(self, '_user_id') else None)
-        return await scraper.deep_discover(keyword, max_results)
+        scraper = BacklinkOutreachScraper(user_id=user_id)
+        return await scraper.deep_discover(
+            keyword,
+            max_results,
+            scrape_timeout_seconds=scrape_timeout_seconds,
+            scrape_max_concurrency=scrape_max_concurrency,
+        )

    def get_migration_coverage(self) -> Dict[str, Any]:
        implemented = [