Files
ALwrity/backend/services/research/google_search_service.py
ajaysi 923fa671fe feat: ContentGuardianAgent, onboarding UX, Team Activity action wiring, docs, agent help modal
ContentGuardianAgent consolidation:
- Merge 3 duplicate classes into single source in specialized/content_guardian.py
- Watchdog audit_committee() with heuristic scoring, coverage gaps, overlaps, alerts
- Remove misleading rejection_rate() helper; use acceptance_rate directly
- Integrate audit + alerts + trend signals into today_workflow_service.py

Team Activity page:
- QualityAuditPanel: health ring, per-agent critiques, coverage gaps, overlaps
- TrendSignalsPanel: opportunity cards with urgency/impact/coverage bars
- AlertBanner: persistent dismiss via POST /alerts/{id}/mark-read
- AgentHelpModal: dialog showing all 8 agents with descriptions, tools, schedule
- QualityAuditPanel action buttons: Fill gap -> /content-planning, Resolve overlap, View CTA on alerts/issues
- TrendSignalsPanel action buttons: Create content from this trend -> /blog-writer with trend context state

Onboarding system:
- Step 4 validation: no auto-pass via basic_ready; requires persona data or explicit progression
- Step 5 validation: logs warning on auto-pass without integration data
- OnboardingCompletionService: single DB session, transactional task creation, upsert pattern
- Business-without-website: nullable website_url on SIFIndexingTask and MarketTrendsTask
- DeepCompetitorAnalysisExecutor: 5-min timeout, 10-competitor cap, asyncio.wait_for
- Persona generation: async with 30s timeout, falls back to scheduler
- OnboardingProgressService.reset_onboarding(): resets session + pauses all DB tasks
- OnboardingControlService.reset_onboarding(): also cancels APScheduler jobs
- FinalStep TaskSchedulingPanel: shows scheduled/failed tasks after completion, 8s auto-redirect
- onboarding_completed agent activity event logged to feed

Documentation:
- docs-site/features/onboarding/: overview, steps, scheduler-tasks, technical-reference (4 pages)
- docs-site/mkdocs.yml: added Onboarding System nav section
- docs-site/features/sif-agents/: overview, agent-directory, committee-system, content-guardian (4 pages)
- docs-site/features/team-activity/: overview, quality-audit, trend-signals, alert-system (4 pages)
- docs-site/features/todays-workflow/: updated overview, technical-architecture, workflow-guide, api-reference
2026-06-01 12:24:31 +05:30

508 lines
19 KiB
Python

"""
Google Search Service for ALwrity
This service provides real-time industry research using Google Custom Search API,
replacing the mock research system with actual web search capabilities.
Key Features:
- Industry-specific search queries
- Source credibility scoring and ranking
- Content extraction and insight generation
- Real-time information from the last month
- Fallback mechanisms for API failures
Dependencies:
- aiohttp (async HTTP requests to the Google Custom Search JSON API)
- loguru (for logging)
- os (for environment variables)
Author: ALwrity Team
Version: 1.0
Last Updated: January 2025
"""
import os
import json
import asyncio
import aiohttp
from typing import Dict, List, Optional, Any
from datetime import datetime, timedelta
from loguru import logger
class GoogleSearchService:
    """
    Service for conducting real industry research using Google Custom Search API.

    The service builds an industry-flavoured query, calls the Custom Search
    JSON endpoint through aiohttp, scores every hit for relevance and
    credibility with local heuristics, and pulls short "insight" sentences
    and numeric statistics out of the returned snippets.
    """

    # Upper bound, in seconds, for one Custom Search HTTP round trip.
    REQUEST_TIMEOUT_SECONDS = 30

    # Extra query terms per industry; keys are lower-case so the lookup is
    # case-insensitive.
    _INDUSTRY_PATTERNS = {
        "technology": ("trends", "innovations", "developments", "insights"),
        "healthcare": ("advances", "research", "treatments", "studies"),
        "finance": ("market analysis", "trends", "reports", "insights"),
        "marketing": ("strategies", "trends", "best practices", "case studies"),
        "education": ("innovations", "trends", "research", "best practices"),
    }
    _DEFAULT_INDUSTRY_TERMS = ("trends", "insights", "developments")

    # Domains scored as high / medium authority. A URL matches when its host
    # is the domain itself or one of its sub-domains.
    _HIGH_AUTHORITY_DOMAINS = (
        "harvard.edu", "stanford.edu", "mit.edu", "berkeley.edu",  # Academic
        "forbes.com", "bloomberg.com", "reuters.com", "wsj.com",   # Business
        "nature.com", "science.org", "ieee.org", "acm.org",        # Scientific
    )
    _MEDIUM_AUTHORITY_DOMAINS = (
        "linkedin.com", "medium.com", "substack.com", "techcrunch.com",
        "venturebeat.com", "wired.com", "theverge.com",
    )
    # Domains that earn the credibility bonus.
    _CREDIBLE_DOMAINS = _HIGH_AUTHORITY_DOMAINS + (
        "linkedin.com", "medium.com", "substack.com",              # Professional
    )

    # Host-name labels used to categorise a source, checked in this order.
    _SOURCE_CATEGORIES = (
        ("academic", frozenset({"edu", "harvard", "stanford", "mit"})),
        ("business_news", frozenset({"forbes", "bloomberg", "reuters", "wsj"})),
        ("professional_platform", frozenset({"linkedin", "medium", "substack"})),
        ("research_scientific", frozenset({"nature", "science", "ieee", "acm"})),
    )

    def __init__(self):
        """
        Initialize the Google Search Service with API credentials.

        Raises:
            ValueError: If GOOGLE_SEARCH_API_KEY or GOOGLE_SEARCH_ENGINE_ID
                is not set in the environment.
        """
        self.api_key = os.getenv("GOOGLE_SEARCH_API_KEY")
        self.search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
        self.base_url = "https://www.googleapis.com/customsearch/v1"
        self.enabled = bool(self.api_key and self.search_engine_id)
        if not self.enabled:
            raise ValueError("Google Search API credentials not configured. Please set GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables.")
        logger.info("Google Search Service initialized successfully")

    async def search_industry_trends(
        self,
        topic: str,
        industry: str,
        max_results: int = 10
    ) -> Dict[str, Any]:
        """
        Search for current industry trends and insights.

        Args:
            topic: The specific topic to research
            industry: The industry context for the search
            max_results: Maximum number of search results to return

        Returns:
            Dictionary with the keys "sources" (processed, ranked results),
            "key_insights", "statistics", "grounding_enabled",
            "search_query" and "timestamp" (naive UTC, ISO-8601).

        Raises:
            RuntimeError: If the service is not enabled or any step of the
                search fails; the original error is chained as the cause.
        """
        from datetime import timezone

        if not getattr(self, "enabled", False):
            raise RuntimeError("Google Search Service is not enabled. Please configure API credentials.")
        try:
            # Construct industry-specific search query
            search_query = self._build_search_query(topic, industry)
            logger.info(f"Searching for: {search_query}")
            # Perform the search
            search_results = await self.perform_search(search_query, max_results)
            # Process and rank results
            processed_results = await self._process_search_results(search_results, topic, industry)
            # Extract insights and statistics
            insights = await self._extract_insights(processed_results, topic, industry)
            logger.info(f"Search completed successfully. Found {len(processed_results)} relevant sources.")
            # Same naive-UTC string datetime.utcnow() produced, without the
            # call that is deprecated since Python 3.12.
            timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
            return {
                "sources": processed_results,
                "key_insights": insights["insights"],
                "statistics": insights["statistics"],
                "grounding_enabled": True,
                "search_query": search_query,
                "timestamp": timestamp
            }
        except Exception as e:
            logger.error(f"Google search failed: {str(e)}")
            raise RuntimeError(f"Google search failed: {str(e)}") from e

    def _build_search_query(self, topic: str, industry: str) -> str:
        """
        Build an optimized search query for industry research.

        The query is the topic, the industry, the current year, a few fixed
        freshness terms and the first two industry-specific terms. Empty
        components and repeated terms (compared case-insensitively) are
        dropped so the query never contains e.g. "trends" twice.

        Args:
            topic: The specific topic to research
            industry: The industry context (matched case-insensitively)

        Returns:
            Optimized search query string
        """
        current_year = datetime.now().year
        industry_key = (industry or "").strip().lower()
        industry_terms = self._INDUSTRY_PATTERNS.get(industry_key, self._DEFAULT_INDUSTRY_TERMS)
        query_components = [
            topic,
            industry,
            f"{current_year}",
            "latest",
            "trends",
            "insights",
            *industry_terms[:2],
        ]
        unique_components = []
        seen = set()
        for component in query_components:
            cleaned = (component or "").strip()
            key = cleaned.lower()
            if cleaned and key not in seen:
                seen.add(key)
                unique_components.append(cleaned)
        return " ".join(unique_components)

    async def perform_search(self, query: str, max_results: int, **overrides) -> List[Dict[str, Any]]:
        """
        Perform the actual Google Custom Search API call.

        Args:
            query: The search query to execute
            max_results: Maximum number of results to return; values below 1
                return an empty list without contacting the API
            **overrides: Override or disable default params.
                Pass `param=None` to remove a default param entirely.
                Pass `param=value` to override its value.

        Returns:
            Raw search results from Google API (the "items" array, or an
            empty list when the response has none)

        Raises:
            RuntimeError: If the API answers with a non-200 status.
        """
        if max_results < 1:
            # The API rejects num < 1; asking for nothing yields nothing.
            return []
        params = {
            "key": self.api_key,
            "cx": self.search_engine_id,
            "q": query,
            "num": min(max_results, 10),  # Google CSE max is 10 per request
            "dateRestrict": "m1",  # Last month
            "sort": "date",  # Sort by date for current information
            "safe": "active",  # Safe search for professional content
        }
        # Apply overrides: None removes the key, non-None overrides the value
        for name, value in overrides.items():
            if value is None:
                params.pop(name, None)
            else:
                params[name] = value
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"Google Search API error: {response.status} - {error_text}")
                    raise RuntimeError(f"Search API returned status {response.status}")
                data = await response.json()
                return data.get("items", [])

    async def _process_search_results(
        self,
        raw_results: List[Dict[str, Any]],
        topic: str,
        industry: str
    ) -> List[Dict[str, Any]]:
        """
        Process and rank search results by relevance and credibility.

        Results that cannot be processed are logged and skipped rather than
        failing the whole batch.

        Args:
            raw_results: Raw search results from Google API
            topic: The research topic for relevance scoring
            industry: The industry context for relevance scoring

        Returns:
            Processed results, sorted by the mean of relevance and
            credibility score, best first
        """
        processed_results = []
        for result in raw_results:
            try:
                title = result.get("title", "")
                url = result.get("link", "")
                snippet = result.get("snippet", "")
                processed_results.append({
                    "title": title,
                    "url": url,
                    "content": snippet,
                    "relevance_score": self._calculate_relevance_score(title, snippet, topic, industry),
                    "credibility_score": self._calculate_credibility_score(url, title),
                    "domain_authority": self._calculate_domain_authority(url),
                    "publication_date": self._extract_publication_date(result),
                    "source_type": self._categorize_source(url, title),
                    "raw_result": result
                })
            except Exception as e:
                # Best effort: one malformed item must not sink the batch.
                logger.warning(f"Failed to process search result: {str(e)}")
                continue
        # Sort by combined score (relevance + credibility)
        processed_results.sort(
            key=lambda item: (item["relevance_score"] + item["credibility_score"]) / 2,
            reverse=True
        )
        return processed_results

    @staticmethod
    def _match_ratio(words: List[str], text: str) -> float:
        """Return the share of `words` found in `text` (0.0 when `words` is empty)."""
        if not words:
            return 0.0
        hits = sum(1 for word in words if word in text)
        return min(hits / len(words), 1.0)

    def _calculate_relevance_score(self, title: str, snippet: str, topic: str, industry: str) -> float:
        """
        Calculate relevance score based on topic and industry alignment.

        Matching is a case-insensitive substring test against the title and
        snippet. An empty topic or industry contributes 0 to the score
        instead of raising ZeroDivisionError.

        Args:
            title: The title of the search result
            snippet: The snippet/description of the result
            topic: The research topic
            industry: The industry context

        Returns:
            Relevance score between 0.0 and 1.0
        """
        text = f"{title} {snippet}".lower()
        # Topic relevance (40% of score)
        topic_score = self._match_ratio(topic.lower().split(), text) * 0.4
        # Industry relevance (30% of score)
        industry_score = self._match_ratio(industry.lower().split(), text) * 0.3
        # Content quality indicators (30% of score)
        quality_indicators = [
            "research", "study", "analysis", "report", "insights",
            "trends", "data", "statistics", "findings", "expert"
        ]
        quality_score = self._match_ratio(quality_indicators, text) * 0.3
        return round(topic_score + industry_score + quality_score, 3)

    @staticmethod
    def _domain_matches(domain: str, candidates: tuple) -> bool:
        """Return True when `domain` equals a candidate or is one of its sub-domains."""
        return any(
            domain == candidate or domain.endswith("." + candidate)
            for candidate in candidates
        )

    def _calculate_credibility_score(self, url: str, title: str) -> float:
        """
        Calculate credibility score based on URL and title analysis.

        Starts from 0.5, adds 0.3 when the host belongs to a known credible
        domain and up to 0.2 for credibility keywords in the title.

        Args:
            url: The URL of the source
            title: The title of the content

        Returns:
            Credibility score between 0.0 and 1.0
        """
        score = 0.5  # Base score
        # Exact/sub-domain match so look-alike hosts such as
        # "forbes.com.evil.example" or "summit.edu" earn no bonus.
        if self._domain_matches(self._extract_domain(url), self._CREDIBLE_DOMAINS):
            score += 0.3
        # Title credibility indicators
        credible_indicators = [
            "research", "study", "analysis", "report", "insights",
            "expert", "professional", "industry", "trends"
        ]
        title_lower = title.lower()
        credible_matches = sum(1 for indicator in credible_indicators if indicator in title_lower)
        score += min(credible_matches * 0.1, 0.2)
        return round(min(score, 1.0), 3)

    def _calculate_domain_authority(self, url: str) -> float:
        """
        Calculate domain authority based on URL analysis.

        Args:
            url: The URL to analyze

        Returns:
            0.9 for high-authority domains, 0.7 for medium-authority
            domains, 0.5 for everything else
        """
        domain = self._extract_domain(url)
        if self._domain_matches(domain, self._HIGH_AUTHORITY_DOMAINS):
            return 0.9
        if self._domain_matches(domain, self._MEDIUM_AUTHORITY_DOMAINS):
            return 0.7
        # Basic scoring for other domains
        return 0.5

    def _extract_domain(self, url: str) -> str:
        """
        Extract the lower-case host name from a URL.

        Port and user-info are stripped. URLs without a scheme (for example
        "example.org/page") are still resolved to their host.

        Args:
            url: The URL to analyze

        Returns:
            The host name, "" when there is none, or the lower-cased input
            when the URL cannot be parsed
        """
        from urllib.parse import urlparse

        candidate = (url or "").strip()
        if not candidate:
            return ""
        try:
            parsed = urlparse(candidate)
            if not parsed.netloc:
                # No "//" authority marker: re-parse as a network location.
                parsed = urlparse("//" + candidate)
            host = parsed.hostname
        except ValueError:
            # urlparse rejects e.g. malformed IPv6 literals.
            return candidate.lower()
        return (host or "").lower()

    def _extract_publication_date(self, result: Dict[str, Any]) -> Optional[str]:
        """
        Extract publication date from search result if available.

        Looks at the "pagemap", "metatags" and "date" fields of the result,
        in that order. A string value is returned as is; a dict (and any
        "metatags" dict or list of dicts nested inside it, which is where
        Custom Search places page meta tags) is scanned for the keys "date",
        "pubdate" and "article:published_time".

        Args:
            result: One raw search result item

        Returns:
            The first non-empty date string found, or None
        """
        date_keys = ("date", "pubdate", "article:published_time")
        for field in ("pagemap", "metatags", "date"):
            value = result.get(field)
            if isinstance(value, str):
                if value:
                    return value
                continue
            if isinstance(value, dict):
                containers = [value]
                nested = value.get("metatags")
                if isinstance(nested, list):
                    containers.extend(nested)
                elif isinstance(nested, dict):
                    containers.append(nested)
            elif isinstance(value, list):
                containers = value
            else:
                continue
            for container in containers:
                if not isinstance(container, dict):
                    continue
                for date_key in date_keys:
                    found = container.get(date_key)
                    if isinstance(found, str) and found:
                        return found
        return None

    def _categorize_source(self, url: str, title: str) -> str:
        """
        Categorize the source type based on URL and title.

        Domain keywords are compared with whole host-name labels (the parts
        between dots), so "summit.com" is not mistaken for "mit".

        Args:
            url: The URL of the source
            title: The title of the content

        Returns:
            One of "academic", "business_news", "professional_platform",
            "research_scientific", "industry_report" or "general"
        """
        labels = set(self._extract_domain(url).split("."))
        for category, keywords in self._SOURCE_CATEGORIES:
            if labels & keywords:
                return category
        # Industry reports are recognised from the title only.
        title_lower = title.lower()
        if any(report in title_lower for report in ["report", "study", "analysis", "research"]):
            return "industry_report"
        return "general"

    async def _extract_insights(
        self,
        sources: List[Dict[str, Any]],
        topic: str,
        industry: str
    ) -> Dict[str, List[str]]:
        """
        Extract key insights and statistics from search results.

        Only the first five sources are inspected. For every insight verb
        found in a snippet, the first sentence containing it is kept.
        Statistics are percentages, dollar amounts, "N million/billion" and
        "N out of M" figures, with thousands separators and decimals kept
        intact. Both lists are de-duplicated and capped at ten entries.

        Args:
            sources: Processed search results
            topic: The research topic (currently unused)
            industry: The industry context (currently unused)

        Returns:
            Dictionary containing insights and statistics
        """
        import re

        insight_patterns = [
            "shows", "indicates", "suggests", "reveals", "demonstrates",
            "highlights", "emphasizes", "points to", "suggests that"
        ]
        # One alternation so overlapping figures ("$1.5 billion") are
        # reported once, in text order.
        number = r"\d+(?:,\d{3})*(?:\.\d+)?"
        stat_regex = re.compile(
            rf"\${number}(?:\s?(?:million|billion))?"  # Dollar amounts
            rf"|{number}\s?(?:%|percent)"              # Percentages
            rf"|{number}\s(?:million|billion)"         # Millions / billions
            rf"|{number} out of {number}",             # Ratios
            re.IGNORECASE,
        )

        insights = []
        statistics = []
        seen_insights = set()
        seen_statistics = set()
        for source in sources[:5]:  # Top 5 most relevant sources
            content = source.get("content") or ""
            content_lower = content.lower()
            sentences = content.split(". ")
            for pattern in insight_patterns:
                if pattern not in content_lower:
                    continue
                # Keep the first sentence containing the pattern, once.
                for sentence in sentences:
                    if pattern in sentence.lower():
                        insight = sentence.strip()
                        if insight not in seen_insights:
                            seen_insights.add(insight)
                            insights.append(insight)
                        break
            for match in stat_regex.finditer(content):
                statistic = match.group(0)
                if statistic not in seen_statistics:
                    seen_statistics.add(statistic)
                    statistics.append(statistic)
        return {
            "insights": insights[:10],      # Top 10 insights
            "statistics": statistics[:10]   # Top 10 statistics
        }

    async def test_api_connection(self) -> Dict[str, Any]:
        """
        Test the Google Search API connection.

        Returns:
            Status dictionary; "status" is "success" or "error", and the
            error case carries the failure text under "error"

        Raises:
            RuntimeError: If the service is not enabled.
        """
        if not getattr(self, "enabled", False):
            raise RuntimeError("Google Search Service is not enabled. Please configure API credentials.")
        try:
            # Perform a simple test search
            test_query = "AI technology trends 2024"
            test_results = await self.perform_search(test_query, 1)
        except Exception as e:
            return {
                "status": "error",
                "message": f"Google Search API connection failed: {str(e)}",
                "enabled": False,
                "error": str(e)
            }
        return {
            "status": "success",
            "message": "Google Search API connection successful",
            "enabled": True,
            "test_results_count": len(test_results),
            "api_key_configured": bool(self.api_key),
            "search_engine_configured": bool(self.search_engine_id)
        }