chore: bulk commit of local changes across blog writer, SEO dashboard, scheduler, docs-site, and frontend

This commit is contained in:
ajaysi
2026-06-05 12:40:04 +05:30
parent b894bc0abb
commit e54aaa7a3e
74 changed files with 5667 additions and 996 deletions

View File

@@ -40,8 +40,10 @@ class GroundingContextEngine:
}
# Temporal relevance patterns
cy = str(datetime.now().year)
ny = str(datetime.now().year + 1)
self.temporal_patterns = {
'recent': ['2024', '2025', 'latest', 'new', 'recent', 'current', 'updated'],
'recent': [cy, ny, 'latest', 'new', 'recent', 'current', 'updated'],
'trending': ['trend', 'emerging', 'growing', 'increasing', 'rising'],
'evergreen': ['fundamental', 'basic', 'principles', 'foundation', 'core']
}

View File

@@ -137,6 +137,15 @@ class KeywordCurator:
lines.append(f"### Competitive advantage signal (must weave into narrative): {content_gap[0]}")
lines.append(" → This is your primary differentiation hook. Surface it prominently in the unique value section.")
lines.append("")
lines.append("### SUGGESTED SECTION → KEYWORD MAPPING")
lines.append("Map each outline section's keyword focus according to its narrative role:")
lines.append("- Hook / Introduction → lead with primary and trending keywords for timeliness & relevance")
lines.append("- Problem / Pain Point → anchor on secondary and long-tail keywords (informational intent)")
lines.append("- Solution / How-To → weave in primary and secondary keywords for solution-oriented search")
lines.append("- Comparison / Analysis → embed semantic keywords to prevent topical drift into tangents")
lines.append("- Case Studies / Evidence → surface content gap keywords as differentiation proof points")
lines.append("- Future / Trends → leverage trending and content gap keywords for forward-looking authority")
lines.append("")
lines.append("GUIDELINE: Treat these as the primary keyword anchors. You may include closely related")
lines.append("intent-matching variations where natural, but avoid inserting every raw research keyword.")
@@ -176,7 +185,11 @@ class KeywordCurator:
slot_key: Optional[str] = None,
) -> List[str]:
"""
Pick up to N items from a keyword list.
Pick up to N items from a keyword list with diversity sampling.
When the raw list is significantly larger than the limit, selects
evenly-spaced entries to capture semantic diversity rather than
just the first N entries.
Args:
data: The raw keyword_analysis dict.
@@ -184,11 +197,24 @@ class KeywordCurator:
slot_key: The internal slot name for looking up the limit.
Falls back to source_key if not provided.
Returns:
Sliced list of at most N strings.
List of at most N strings with diversity sampling.
"""
limit_key = slot_key or source_key
limit = self.SLOTS.get(limit_key, 5)
raw: Any = data.get(source_key, [])
if not isinstance(raw, list):
return []
return raw[:limit]
if len(raw) <= limit:
return raw
if len(raw) <= limit * 2:
return raw[:limit]
indices = set()
if limit >= 2:
indices.add(0)
indices.add(len(raw) - 1)
step = (len(raw) - 1) / max(limit - 1, 1)
for i in range(1, limit - 1):
indices.add(int(round(i * step)))
else:
indices.add(0)
return [raw[i] for i in sorted(indices) if i < len(raw)][:limit]

View File

@@ -124,7 +124,8 @@ class OutlineGenerator:
content_angle_titles = self.title_generator.extract_content_angle_titles(research)
# Combine AI-generated titles with content angles (full primary keywords for title variety)
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
research_topic = getattr(request, 'topic', '') or ''
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords, research_topic)
logger.info(f"Generated optimized outline with {len(balanced_sections)} sections and {len(title_options)} title options")
@@ -224,7 +225,8 @@ class OutlineGenerator:
content_angle_titles = self.title_generator.extract_content_angle_titles(research)
# Combine AI-generated titles with content angles (full primary keywords for title variety)
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
research_topic = getattr(request, 'topic', '') or ''
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords, research_topic)
await task_manager.update_progress(task_id, "✅ Outline generation and optimization completed successfully!")

View File

@@ -36,12 +36,56 @@ class PromptBuilder:
competitor_text = ', '.join(research.competitor_analysis.get('top_competitors', [])) if research and research.competitor_analysis else "Not available"
opportunity_text = ', '.join(research.competitor_analysis.get('opportunities', [])) if research and research.competitor_analysis else "Not available"
advantages_text = ', '.join(research.competitor_analysis.get('competitive_advantages', [])) if research and research.competitor_analysis else "Not available"
competitor_headings_text = ', '.join(research.competitor_analysis.get('competitor_headings', [])[:3]) if research and research.competitor_analysis and research.competitor_analysis.get('competitor_headings') else ""
# Extract additional UI-mapped context fields
analysis_insights_text = (research.keyword_analysis.get('analysis_insights', '') or '') if research and research.keyword_analysis else ''
market_positioning_text = (research.competitor_analysis.get('market_positioning', '') or '') if research and research.competitor_analysis else ''
difficulty_score = research.keyword_analysis.get('difficulty', None) if research and research.keyword_analysis else None
# Extract top 3 authoritative source excerpts as factual data points
source_excerpts_text = ""
if sources:
sorted_sources = sorted(
[s for s in sources if (s.excerpt or s.summary)],
key=lambda s: s.credibility_score or 0.8, reverse=True
)[:3]
excerpts = []
for i, src in enumerate(sorted_sources, 1):
excerpt = src.excerpt or src.summary or ""
if len(excerpt) > 300:
excerpt = excerpt[:297] + "..."
excerpts.append(f" {i}. \"{src.title}\"{excerpt}")
if excerpts:
source_excerpts_text = "FACTUAL DATA POINTS FROM RESEARCH:\n" + "\n".join(excerpts)
# Extract recency: newest source publication date
newest_date_str = ""
if sources:
valid_dates = [s.published_at for s in sources if s.published_at]
if valid_dates:
try:
parsed = [d for d in valid_dates if d[:4].isdigit()]
if parsed:
sorted_dates = sorted(parsed, reverse=True)
newest_date_str = f"Most Recent Source: {sorted_dates[0]}"
except Exception:
pass
# Extract top grounding evidence snippets as verified data points
grounding_evidence_text = ""
if research and research.grounding_metadata and research.grounding_metadata.grounding_supports:
supports = research.grounding_metadata.grounding_supports
top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:3]
if top_supports:
evidence_parts = []
for i, s in enumerate(top_supports, 1):
text = s.segment_text[:250]
if len(s.segment_text) > 250:
text += "..."
evidence_parts.append(f" {i}. {text}")
grounding_evidence_text = "VERIFIED EVIDENCE (high-confidence snippets):\n" + "\n".join(evidence_parts)
# Build selected angle prominence section
if selected_content_angle and selected_content_angle.strip():
selected_angle_section = f"""
@@ -106,8 +150,14 @@ Top Competitors: {competitor_text}
Market Opportunities: {opportunity_text}
Competitive Advantages: {advantages_text}
{f"Market Positioning: {market_positioning_text}" if market_positioning_text else ""}
{f"Competitor Headings (AVOID duplicating): {competitor_headings_text}" if competitor_headings_text else ""}
RESEARCH SOURCES: {len(sources)} authoritative sources available
{newest_date_str}
{source_excerpts_text}
{grounding_evidence_text}
{f"CUSTOM INSTRUCTIONS: {custom_instructions}" if custom_instructions else ""}

View File

@@ -54,58 +54,58 @@ class TitleGenerator:
Returns:
Formatted title string
"""
if not angle or len(angle.strip()) < 10: # Too short to be a good title
if not angle or len(angle.strip()) < 10:
return ""
# Clean up the angle
cleaned_angle = angle.strip()
# Capitalize first letter of each sentence and proper nouns
sentences = cleaned_angle.split('. ')
formatted_sentences = []
for sentence in sentences:
if sentence.strip():
# Use title case for better formatting
formatted_sentence = sentence.strip().title()
formatted_sentences.append(formatted_sentence)
formatted_title = '. '.join(formatted_sentences)
# Ensure it ends with proper punctuation
if not formatted_title.endswith(('.', '!', '?')):
formatted_title += '.'
# Use sentence case: capitalize first letter, rest as-is
if cleaned_angle:
cleaned_angle = cleaned_angle[0].upper() + cleaned_angle[1:]
# Limit length to reasonable blog title size
if len(formatted_title) > 200:
formatted_title = formatted_title[:197] + "..."
if len(cleaned_angle) > 120:
cleaned_angle = cleaned_angle[:117] + "..."
return formatted_title
return cleaned_angle
def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str]) -> List[str]:
def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str], research_topic: str = "") -> List[str]:
"""
Combine AI-generated titles with content angle titles, ensuring variety and quality.
AI titles (proper SEO titles generated by LLM) take priority.
Content angle titles (long-format descriptions) are used as fallback.
The research topic is the last resort when nothing else exists.
Args:
ai_titles: AI-generated title options
content_angle_titles: Titles derived from content angles
ai_titles: AI-generated title options (proper blog titles, 50-65 chars)
content_angle_titles: Titles derived from content angles (longer, descriptive)
primary_keywords: Primary keywords for fallback generation
research_topic: Original user research topic as ultimate fallback
Returns:
Combined list of title options (max 6 total)
"""
all_titles = []
# Add content angle titles first (these are research-based and valuable)
for title in content_angle_titles[:3]: # Limit to top 3 content angles
if title and title not in all_titles:
all_titles.append(title)
# Add AI-generated titles
# 1. AI-generated titles first (proper SEO titles from LLM)
for title in ai_titles:
if title and title not in all_titles:
all_titles.append(title)
# Note: Removed fallback titles as requested - only use research and AI-generated titles
# 2. Content angle titles as fallback (research-based, but verbose)
for title in content_angle_titles[:3]:
if title and title not in all_titles:
all_titles.append(title)
# 3. Research topic as last resort when nothing was generated
if not all_titles and research_topic:
all_titles.append(research_topic)
# 4. Primary keyword fallback as absolute last resort
if not all_titles and primary_keywords:
kw = primary_keywords[0]
all_titles.append(kw)
# Limit to 6 titles maximum for UI usability
final_titles = all_titles[:6]
@@ -115,9 +115,10 @@ class TitleGenerator:
def generate_fallback_titles(self, primary_keywords: List[str]) -> List[str]:
"""Generate fallback titles when AI generation fails."""
from datetime import datetime
primary_keyword = primary_keywords[0] if primary_keywords else "Topic"
return [
f"The Complete Guide to {primary_keyword}",
f"{primary_keyword}: Everything You Need to Know",
f"How to Master {primary_keyword} in 2024"
f"How to Master {primary_keyword} in {datetime.now().year}"
]

View File

@@ -432,7 +432,7 @@ class ResearchDataFilter:
'how to', 'guide', 'tutorial', 'steps', 'process', 'method',
'best practices', 'tips', 'strategies', 'techniques', 'approach',
'comparison', 'vs', 'versus', 'difference', 'pros and cons',
'trends', 'future', '2024', '2025', 'emerging', 'new'
'trends', 'future', str(datetime.now().year), str(datetime.now().year + 1), 'emerging', 'new'
]
for indicator in actionable_indicators:

View File

@@ -720,7 +720,7 @@ class ResearchService:
url=src.get("url", ""),
excerpt=src.get("content", "")[:500] if src.get("content") else f"Source from {src.get('title', 'web')}",
credibility_score=float(src.get("credibility_score", 0.8)),
published_at=str(src.get("publication_date", "2024-01-01")),
published_at=str(src.get("publication_date", f"{datetime.now().year}-01-01")),
index=src.get("index"),
source_type=src.get("type", "web")
)

View File

@@ -6,6 +6,7 @@ Different strategies for executing research based on depth and focus.
from abc import ABC, abstractmethod
from typing import Dict, Any
from datetime import datetime
from loguru import logger
from models.blog_models import BlogResearchRequest, ResearchMode, ResearchConfig
@@ -87,7 +88,7 @@ Provide analysis in this EXACT format:
- For each: Quote/claim, source URL, published date, metric/context.
REQUIREMENTS:
- Every claim MUST include a source URL (authoritative, recent: 2024-2025 preferred).
- Every claim MUST include a source URL (authoritative, recent: {datetime.now().year}-{datetime.now().year + 1} preferred).
- Use concrete numbers, dates, outcomes; avoid generic advice.
- Keep bullets tight and scannable for spoken narration."""
return prompt.strip()
@@ -116,7 +117,7 @@ Research Topic: "{topic}"{date_filter}{source_filter}
Provide COMPLETE analysis in this EXACT format:
## WHAT'S CHANGED (2024-2025)
## WHAT'S CHANGED ({datetime.now().year}-{datetime.now().year + 1})
[5-7 concise trend bullets with numbers + source URLs]
## PROOF & NUMBERS
@@ -151,7 +152,7 @@ Primary (3), Secondary (8-10), Long-tail (5-7) with intent hints.
VERIFICATION REQUIREMENTS:
- Minimum 2 authoritative sources per major claim.
- Prefer industry reports > research papers > news > blogs.
- 2024-2025 data strongly preferred.
- {datetime.now().year}-{datetime.now().year + 1} data strongly preferred.
- All numbers must include timeframe and methodology.
- Every bullet must be concise for spoken narration and actionable for {target_audience}."""
return prompt.strip()
@@ -213,7 +214,7 @@ REQUIREMENTS:
- Cite all claims with authoritative source URLs
- Include specific numbers, dates, examples
- Focus on actionable insights for {target_audience}
- Use 2024-2025 data when available"""
- Use {datetime.now().year}-{datetime.now().year + 1} data when available"""
return prompt.strip()

View File

@@ -36,6 +36,8 @@ from models.podcast_models import PodcastProject
from models.research_models import ResearchProject
# Video Studio models
from models.video_models import VideoGenerationTask
# YouTube Creator task models
from models.youtube_task_models import YouTubeVideoTask
# Bing Analytics models
from models.bing_analytics_models import Base as BingAnalyticsBase

View File

@@ -47,6 +47,10 @@ class GSCBrainstormService:
if not site_url:
sites = self.gsc_service.get_site_list(user_id)
if not sites:
logger.info(f"No GSC sites found for user {user_id} — falling back to AI-only brainstorm")
fallback = self._generate_ai_only_brainstorm(user_id, keywords, None, None, None)
if fallback:
return fallback
return {
"error": "No GSC sites found. Make sure your site is verified in Google Search Console.",
"content_opportunities": [],
@@ -70,6 +74,10 @@ class GSCBrainstormService:
)
if "error" in analytics:
logger.info(f"GSC analytics error for user {user_id}: {analytics.get('error')} — falling back to AI-only brainstorm")
fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
if fallback:
return fallback
return {
"error": analytics.get("error", "Failed to fetch GSC data"),
"content_opportunities": [],
@@ -88,6 +96,10 @@ class GSCBrainstormService:
pages_data = self._parse_page_rows(page_rows)
if not keywords_data:
logger.info(f"No GSC keyword data for user {user_id} — falling back to AI-only brainstorm")
fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
if fallback:
return fallback
return {
"error": "No keyword data available for the selected period. This usually means your site is new to GSC or hasn't received search traffic yet.",
"content_opportunities": [],
@@ -110,6 +122,10 @@ class GSCBrainstormService:
logger.info(f"After topic filter: {len(keywords_data)} keywords, {len(pages_data)} pages")
if not keywords_data:
logger.info(f"No GSC keywords matched topic '{keywords}' for user {user_id} — falling back to AI-only brainstorm")
fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
if fallback:
return fallback
return {
"error": "No GSC keywords matched your topic. Try a broader research topic or check your GSC data.",
"content_opportunities": [],
@@ -155,6 +171,128 @@ class GSCBrainstormService:
"summary": summary,
}
# ------------------------------------------------------------------ #
# AI-only fallback (when GSC has no data)
# ------------------------------------------------------------------ #
def _generate_ai_only_brainstorm(
self,
user_id: str,
keywords: str,
site_url: Optional[str],
start_date: Optional[str],
end_date: Optional[str],
) -> Optional[Dict[str, Any]]:
"""
Generate topic ideas using AI alone when GSC data is unavailable.
Returns a brainstorm-shaped result with empty GSC-specific arrays
but populated ai_recommendations.
Args:
user_id: Forwarded to llm_text_gen and included in the warning log on failure.
keywords: The user's topic text; interpolated verbatim into the prompt (twice).
site_url: Echoed into summary["site_url"]; None becomes "".
start_date: Echoed into summary["date_range"]["start"]; None becomes "".
end_date: Echoed into summary["date_range"]["end"]; None becomes "".
Returns:
A dict with empty "content_opportunities", "keyword_gaps", "quick_wins"
and "page_opportunities" lists, "ai_recommendations" set to the parsed
LLM response, and a zero-filled "summary" carrying an explanatory "note".
None when the LLM returns a falsy result, when the parsed response is
falsy, or when any exception is raised (the exception is logged, not
propagated).
"""
try:
# The prompt is an f-string: only {keywords} is interpolated. The doubled
# braces ({{ and }}) are f-string escapes, so the JSON template is sent
# with literal single braces.
prompt = f"""You are an expert content strategist helping a blog writer brainstorm topic ideas.
The user is interested in writing about: "{keywords}"
Since they are a new or early-stage website, there is no Google Search Console data available yet.
Generate compelling blog post ideas they can write RIGHT NOW to start building traffic.
For each suggestion include:
1. A specific, compelling blog post TITLE (not a vague topic)
2. The primary keyword it should target
3. Why this topic will perform well (search demand, competition level, timing)
4. The recommended content format (how-to, listicle, comparison, pillar page, etc.)
5. Estimated difficulty level (Easy / Medium / Hard)
Return your response in this EXACT JSON format (no markdown, no code fences):
{{
"immediate_opportunities": [
{{
"title": "Specific Blog Post Title",
"keyword": "primary target keyword",
"reason": "Why this will perform well",
"format": "How-To Guide | Listicle | Comparison | Pillar Page | etc.",
"estimated_impact": "Beginner-friendly traffic opportunity"
}}
],
"content_strategy": [
{{
"title": "Pillar Content Title",
"keyword": "target keyword",
"reason": "Strategic importance for building topical authority",
"format": "Pillar Page | Ultimate Guide | Resource",
"estimated_impact": "Foundation for long-term organic growth"
}}
],
"long_term_strategy": [
{{
"title": "Authority Building Title",
"keyword": "target keyword",
"reason": "Establishes expertise and captures high-intent traffic over time",
"format": "Research-Backed Analysis | Expert Roundup | Original Study",
"estimated_impact": "Compound traffic growth over 6-12 months"
}}
]
}}
IMPORTANT:
- Provide 3-5 items in each category
- All suggestions MUST relate to the user's interest in "{keywords}"
- Titles should be specific, compelling, and SEO-aware
- Prioritize topics with clear search intent and realistic ranking potential for a new site
- Include a mix of easy wins (long-tail, low competition) and strategic pillar content
- For estimated_impact, describe the opportunity type (not click numbers since we lack data)"""
system_prompt = (
"You are an expert content strategist specializing in SEO and blog topic generation. "
"You help new websites identify high-potential content topics even without search console data. "
"You always respond with valid JSON matching the requested format exactly."
)
# NOTE(review): flow_type presumably labels this call for routing or usage
# tracking inside llm_text_gen — confirm against that helper.
result = llm_text_gen(
prompt=prompt,
system_prompt=system_prompt,
user_id=user_id,
flow_type="gsc_brainstorm_fallback",
)
if result:
# Parsing is delegated to this class's _parse_ai_response; a falsy parse
# result falls through to the final `return None`.
parsed = self._parse_ai_response(result)
if parsed:
return {
# GSC-derived sections stay empty: there is no GSC data on this path.
"content_opportunities": [],
"keyword_gaps": [],
"quick_wins": [],
"page_opportunities": [],
"ai_recommendations": parsed,
"summary": {
"site_url": site_url or "",
"date_range": {
"start": start_date or "",
"end": end_date or "",
},
# All metrics are zero-filled placeholders, not measured values.
"total_keywords_analyzed": 0,
"total_impressions": 0,
"total_clicks": 0,
"avg_ctr": 0,
"avg_position": 0,
"ctr_vs_benchmark": 0,
"health_score": 0,
"keyword_distribution": {
"positions_1_3": 0,
"positions_4_10": 0,
"positions_11_20": 0,
"positions_21_plus": 0,
},
"top_keywords": [],
"top_pages": [],
"note": "AI-generated suggestions based on your topic. No GSC data was available — these are strategic recommendations, not data-driven insights."
},
}
# Best-effort fallback: any failure is logged as a warning and reported as None,
# which lets callers fall back to their own error payload.
except Exception as e:
logger.warning(f"AI-only brainstorm fallback failed for user {user_id}: {e}")
return None
# ------------------------------------------------------------------ #
# Data parsing helpers
# ------------------------------------------------------------------ #

View File

@@ -188,7 +188,6 @@ class GSCService:
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
# Check if table exists first to avoid error on fresh DB
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='gsc_credentials'")
if not cursor.fetchone():
return None
@@ -204,7 +203,6 @@ class GSCService:
credentials_data = json.loads(result[0])
# Check for required fields, but allow connection without refresh token
required_fields = ['token_uri', 'client_id', 'client_secret']
missing_fields = [field for field in required_fields if not credentials_data.get(field)]
@@ -214,7 +212,6 @@ class GSCService:
credentials = Credentials.from_authorized_user_info(credentials_data, self.scopes)
# Refresh token if needed and possible
if credentials.expired:
if credentials.refresh_token:
try:
@@ -222,9 +219,11 @@ class GSCService:
self.save_user_credentials(user_id, credentials)
except Exception as e:
logger.error(f"Failed to refresh GSC token for user {user_id}: {e}")
self.clear_incomplete_credentials(user_id)
return None
else:
logger.warning(f"GSC token expired for user {user_id} but no refresh token available - user needs to re-authorize")
self.clear_incomplete_credentials(user_id)
return None
return credentials
@@ -288,7 +287,6 @@ class GSCService:
try:
logger.info(f"Handling GSC OAuth callback with state: {state[:20]}...")
# Extract user_id from state
if ':' not in state:
logger.error(f"Invalid GSC state format: {state}")
return False
@@ -300,17 +298,19 @@ class GSCService:
logger.error(f"User database not found for user {user_id}")
return False
# Verify state in user's DB (but don't delete yet — delete after successful token exchange)
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('SELECT user_id FROM gsc_oauth_states WHERE state = ?', (state,))
result = cursor.fetchone()
if not result:
logger.error(f"Invalid or expired GSC OAuth state for user {user_id}")
return False
# Exchange code for credentials
# Verify state in user's DB (best effort — if missing, attempt code exchange anyway)
state_valid = False
try:
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('SELECT user_id FROM gsc_oauth_states WHERE state = ?', (state,))
state_valid = cursor.fetchone() is not None
except Exception as state_err:
logger.warning(f"State verification query failed, proceeding anyway: {state_err}")
if not state_valid:
logger.warning(f"GSC OAuth state not found in DB for user {user_id} — will attempt code exchange without state verification")
if not self.client_config:
logger.error("Cannot handle callback: Client configuration not loaded")
return False
@@ -324,21 +324,30 @@ class GSCService:
flow.fetch_token(code=authorization_code)
credentials = flow.credentials
if not credentials or not credentials.token:
logger.error(f"Token exchange returned empty credentials for user {user_id}")
return False
# State consumed successfully — clean up
try:
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
conn.commit()
except Exception as cleanup_err:
logger.warning(f"Failed to clean up OAuth state: {cleanup_err}")
# Clean up state if it was valid
if state_valid:
try:
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
conn.commit()
except Exception as cleanup_err:
logger.warning(f"Failed to clean up OAuth state: {cleanup_err}")
# Save credentials
return self.save_user_credentials(user_id, credentials)
result = self.save_user_credentials(user_id, credentials)
if result:
logger.info(f"GSC OAuth callback succeeded for user {user_id} (state_valid={state_valid})")
else:
logger.error(f"GSC OAuth callback: token exchange succeeded but failed to save credentials for user {user_id}")
return result
except Exception as e:
logger.error(f"Error handling GSC OAuth callback: {e}")
logger.error(f"Error handling GSC OAuth callback for user {user_id if 'user_id' in dir() else 'unknown'}: {e}")
return False
@@ -726,6 +735,8 @@ class GSCService:
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('DELETE FROM gsc_credentials WHERE user_id = ?', (user_id,))
cursor.execute('DELETE FROM gsc_data_cache WHERE user_id = ?', (user_id,))
cursor.execute('DELETE FROM gsc_oauth_states WHERE user_id = ?', (user_id,))
conn.commit()
logger.info(f"Cleared incomplete GSC credentials for user: {user_id}")

View File

@@ -66,12 +66,19 @@ class WixAuthService:
response.raise_for_status()
return response.json()
def get_site_info(self, access_token: str) -> Dict[str, Any]:
def get_site_info(self, access_token: str, meta_site_id: Optional[str] = None) -> Dict[str, Any]:
headers = {
'Authorization': f'Bearer {access_token}',
'Content-Type': 'application/json'
'Content-Type': 'application/json',
}
if self.client_id:
headers['wix-client-id'] = self.client_id
if meta_site_id:
headers['wix-site-id'] = meta_site_id
response = requests.get(f"{self.base_url}/sites/v1/site", headers=headers)
if response.status_code == 404:
logger.warning("Wix site info not found (404) — user may not have a published site or token lacks sites scope")
return {"_no_site": True, "error": "No Wix site found for this account"}
response.raise_for_status()
return response.json()

View File

@@ -295,39 +295,39 @@ def create_blog_post(
wix_logger.log_token_info(token_length, has_blog_scope, meta_site_id)
# Convert markdown to Ricos
ricos_content = convert_content_to_ricos(content, None)
# PRIMARY: Use Wix Ricos Documents API for best formatting support (tables, complex markdown, etc.)
# FALLBACK: Use custom parser if Wix API fails
ricos_content = None
try:
logger.info("Converting markdown via Wix Ricos Documents API...")
ricos_content = convert_via_wix_api(content, access_token, base_url)
logger.info(f"Wix API conversion succeeded: {len(ricos_content.get('nodes', []))} nodes")
except Exception as e:
logger.warning(f"Wix API conversion failed, falling back to custom parser: {e}")
if not ricos_content or not isinstance(ricos_content, dict) or 'nodes' not in ricos_content:
logger.info("Using custom markdown parser for Ricos conversion")
ricos_content = convert_content_to_ricos(content, None)
nodes_count = len(ricos_content.get('nodes', []))
wix_logger.log_ricos_conversion(nodes_count)
# Validate Ricos content structure
# Per Wix Blog API documentation: richContent should ONLY contain 'nodes'
# The example in docs shows: { nodes: [...] } - no type, id, metadata, or documentStyle
if not isinstance(ricos_content, dict):
logger.error(f"richContent is not a dict: {type(ricos_content)}")
logger.error(f"richContent is not a dict: {type(ricos_content)}")
raise ValueError("richContent must be a dictionary object")
if 'nodes' not in ricos_content or not isinstance(ricos_content['nodes'], list):
logger.error(f"richContent.nodes is missing or not a list: {ricos_content.get('nodes', 'MISSING')}")
logger.error(f"richContent.nodes is missing or not a list: {ricos_content.get('nodes', 'MISSING')}")
raise ValueError("richContent must contain a 'nodes' array")
# Remove type and id fields (not expected by Blog API)
# NOTE: metadata is optional - Wix UPDATE endpoint example shows it, but CREATE example doesn't
# We'll keep it minimal (nodes only) for CREATE to match the recipe example
fields_to_remove = ['type', 'id']
for field in fields_to_remove:
# Remove top-level fields not expected by Blog API CREATE endpoint
# (Wix API converter may include type, id, metadata, documentStyle — strip them)
for field in ['type', 'id', 'metadata', 'documentStyle']:
if field in ricos_content:
logger.debug(f"Removing '{field}' field from richContent (Blog API doesn't expect this)")
logger.debug(f"Removing '{field}' from richContent for Blog API compatibility")
del ricos_content[field]
# Remove metadata and documentStyle - Blog API CREATE endpoint example shows only 'nodes'
# (UPDATE endpoint shows metadata, but we're using CREATE)
if 'metadata' in ricos_content:
logger.debug("Removing 'metadata' from richContent (CREATE endpoint expects only 'nodes')")
del ricos_content['metadata']
if 'documentStyle' in ricos_content:
logger.debug("Removing 'documentStyle' from richContent (CREATE endpoint expects only 'nodes')")
del ricos_content['documentStyle']
# Ensure we only have 'nodes' in richContent for CREATE endpoint
ricos_content = {'nodes': ricos_content['nodes']}

View File

@@ -708,7 +708,48 @@ class SIFIntegrationService:
themes = adv_insights.get('augmented_themes', [])
if themes:
text_content += f"Augmented Themes: {', '.join(themes[:5])}. "
freshness = adv_insights.get('freshness', {})
if freshness:
text_content += (f"Content Freshness Score: {freshness.get('freshness_score', 'N/A')}. "
f"Publishing Velocity: {freshness.get('publishing_velocity', 0)}/week. "
f"Trend: {freshness.get('publishing_trend', 'unknown')}. "
f"Last 30d: {freshness.get('publishing_recency', {}).get('last_30d', 0)} pages. ")
link_health = adv_insights.get('link_health', {})
if link_health and 'error' not in link_health:
text_content += (f"Internal Links: {link_health.get('internal_link_count', 0)}. "
f"External Links: {link_health.get('external_link_count', 0)}. "
f"Nofollow: {link_health.get('nofollow_link_count', 0)}. "
f"Avg Links/Page: {link_health.get('avg_links_per_page', 0)}. ")
redirects = adv_insights.get('redirect_audit', {})
if redirects and 'error' not in redirects:
text_content += (f"Redirects: {redirects.get('total_redirects', 0)} total, "
f"{redirects.get('multi_hop_chains', 0)} multi-hop. ")
image_seo = adv_insights.get('image_seo', {})
if image_seo and 'error' not in image_seo:
text_content += (f"Images: {image_seo.get('total_images', 0)} total, "
f"Alt Coverage: {image_seo.get('alt_coverage_percentage', 0)}%. ")
url_struct = adv_insights.get('url_structure', {})
if url_struct:
text_content += (f"URL Structure: {url_struct.get('total_urls_analyzed', 0)} URLs, "
f"Avg Depth: {url_struct.get('directory_depth', {}).get('average_depth', 0)}. "
f"Params: {url_struct.get('parameter_usage', {}).get('percentage_with_params', 0)}%. ")
robots = adv_insights.get('robots_txt', {})
if robots and robots.get('success'):
text_content += (f"Robots.txt: {robots.get('total_directives', 0)} directives, "
f"Compliance: {robots.get('compliance_score', 0)}/100. "
f"Issues: {len(robots.get('issues', []))}. ")
budget = adv_insights.get('crawl_budget', {})
if budget and budget.get('success'):
text_content += (f"Crawl Budget: {budget.get('pages_crawled', 0)} crawled of {budget.get('sitemap_total_urls', 0)} URLs. "
f"Waste: {budget.get('waste_percentage', 0)}%. "
f"Score: {budget.get('optimization_score', 0)}. ")
# Add Technical SEO overview
tech_audit = dashboard_data.get('technical_seo_audit', {})
if tech_audit:

View File

@@ -370,6 +370,136 @@ class FailureDetectionService:
"last_failure": task.last_failure.isoformat() if task.last_failure else None
})
# Check onboarding full website analysis tasks
from models.website_analysis_monitoring_models import OnboardingFullWebsiteAnalysisTask
onboarding_tasks = self.db.query(OnboardingFullWebsiteAnalysisTask).filter(
OnboardingFullWebsiteAnalysisTask.status == "needs_intervention"
)
if user_id:
onboarding_tasks = onboarding_tasks.filter(OnboardingFullWebsiteAnalysisTask.user_id == user_id)
for task in onboarding_tasks.all():
pattern = self.analyze_task_failures(task.id, "onboarding_full_website_analysis", task.user_id)
tasks_needing_intervention.append({
"task_id": task.id,
"task_type": "onboarding_full_website_analysis",
"user_id": task.user_id,
"website_url": task.website_url,
"failure_pattern": {
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
"recent_failures": pattern.recent_failures if pattern else 0,
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
"error_patterns": pattern.error_patterns if pattern else [],
},
"failure_reason": task.failure_reason,
"last_failure": task.last_failure.isoformat() if task.last_failure else None
})
# Check deep competitor analysis tasks
from models.website_analysis_monitoring_models import DeepCompetitorAnalysisTask
competitor_tasks = self.db.query(DeepCompetitorAnalysisTask).filter(
DeepCompetitorAnalysisTask.status == "needs_intervention"
)
if user_id:
competitor_tasks = competitor_tasks.filter(DeepCompetitorAnalysisTask.user_id == user_id)
for task in competitor_tasks.all():
pattern = self.analyze_task_failures(task.id, "deep_competitor_analysis", task.user_id)
tasks_needing_intervention.append({
"task_id": task.id,
"task_type": "deep_competitor_analysis",
"user_id": task.user_id,
"website_url": task.website_url,
"failure_pattern": {
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
"recent_failures": pattern.recent_failures if pattern else 0,
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
"error_patterns": pattern.error_patterns if pattern else [],
},
"failure_reason": task.failure_reason,
"last_failure": task.last_failure.isoformat() if task.last_failure else None
})
# Check SIF indexing tasks
from models.website_analysis_monitoring_models import SIFIndexingTask
sif_tasks = self.db.query(SIFIndexingTask).filter(
SIFIndexingTask.status == "needs_intervention"
)
if user_id:
sif_tasks = sif_tasks.filter(SIFIndexingTask.user_id == user_id)
for task in sif_tasks.all():
pattern = self.analyze_task_failures(task.id, "sif_indexing", task.user_id)
tasks_needing_intervention.append({
"task_id": task.id,
"task_type": "sif_indexing",
"user_id": task.user_id,
"website_url": task.website_url,
"failure_pattern": {
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
"recent_failures": pattern.recent_failures if pattern else 0,
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
"error_patterns": pattern.error_patterns if pattern else [],
},
"failure_reason": task.failure_reason,
"last_failure": task.last_failure.isoformat() if task.last_failure else None
})
# Check market trends tasks
from models.website_analysis_monitoring_models import MarketTrendsTask
trends_tasks = self.db.query(MarketTrendsTask).filter(
MarketTrendsTask.status == "needs_intervention"
)
if user_id:
trends_tasks = trends_tasks.filter(MarketTrendsTask.user_id == user_id)
for task in trends_tasks.all():
pattern = self.analyze_task_failures(task.id, "market_trends", task.user_id)
tasks_needing_intervention.append({
"task_id": task.id,
"task_type": "market_trends",
"user_id": task.user_id,
"website_url": task.website_url,
"failure_pattern": {
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
"recent_failures": pattern.recent_failures if pattern else 0,
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
"error_patterns": pattern.error_patterns if pattern else [],
},
"failure_reason": task.failure_reason,
"last_failure": task.last_failure.isoformat() if task.last_failure else None
})
# Check advertools tasks (paused tasks may also need attention)
from models.website_analysis_monitoring_models import AdvertoolsTask
advertools_tasks = self.db.query(AdvertoolsTask).filter(
AdvertoolsTask.status.in_(["needs_intervention", "failed"])
)
if user_id:
advertools_tasks = advertools_tasks.filter(AdvertoolsTask.user_id == user_id)
for task in advertools_tasks.all():
pattern = self.analyze_task_failures(task.id, "advertools", task.user_id)
tasks_needing_intervention.append({
"task_id": task.id,
"task_type": "advertools",
"user_id": task.user_id,
"website_url": task.website_url,
"failure_pattern": {
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
"recent_failures": pattern.recent_failures if pattern else 0,
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
"error_patterns": pattern.error_patterns if pattern else [],
},
"failure_reason": task.failure_reason,
"last_failure": task.last_failure.isoformat() if task.last_failure else None
})
return tasks_needing_intervention
except Exception as e:

View File

@@ -1,6 +1,7 @@
import asyncio
from datetime import datetime, timedelta
from typing import Any, Dict, List
from urllib.parse import urlparse
from loguru import logger
from sqlalchemy.orm import Session
from sqlalchemy import text
@@ -63,27 +64,66 @@ class AdvertoolsExecutor:
result = {}
if task_type == 'content_audit':
# Phase 1: Audit content themes using sample URLs from sitemap
# First, get the sitemap to find recent URLs
# Phase 1: Get sitemap analysis (freshness, URL structure, pillars)
sitemap_result = await self.advertools_service.analyze_sitemap(effective_url)
audit_urls = []
url_structure = {}
freshness = {}
if sitemap_result.get('success'):
# Use the sample URLs returned by the service
audit_urls = sitemap_result.get('metrics', {}).get('audit_sample_urls', [])
metrics = sitemap_result.get('metrics', {})
audit_urls = metrics.get('audit_sample_urls', [])
url_structure = metrics.get('url_structure', {})
freshness = {
"freshness_score": metrics.get('freshness_score'),
"publishing_velocity": metrics.get('publishing_velocity'),
"stale_content_percentage": metrics.get('stale_content_percentage'),
"publishing_recency": metrics.get('publishing_recency'),
"publishing_trend": metrics.get('publishing_trend'),
}
if not audit_urls:
# Fallback to homepage if sitemap fails or empty
audit_urls = [website_url]
# Run the audit on the sample
result = await self.advertools_service.audit_content(audit_urls)
# Phase 2: Theme analysis via content audit
audit_result = await self.advertools_service.audit_content(audit_urls)
# Phase 3: Site structure analysis (links, redirects, image SEO)
site_domain = urlparse(website_url).netloc or website_url
structure_result = await self.advertools_service.analyze_site_structure(
audit_urls, site_domain=site_domain
)
# Phase 4: Robots.txt compliance analysis
robots_result = await self.advertools_service.analyze_robots_txt(website_url)
# Phase 5: Crawl budget analysis
budget_result = await self.advertools_service.analyze_crawl_budget(
effective_url, site_domain
)
# Merge results
result = {
"success": audit_result.get('success', False) or structure_result.get('success', False),
"themes": audit_result.get('themes', []),
"page_count": audit_result.get('page_count', 0),
"avg_word_count": audit_result.get('avg_word_count', 0),
"link_health": structure_result.get('link_health', {}),
"redirect_audit": structure_result.get('redirect_audit', {}),
"image_seo": structure_result.get('image_seo', {}),
"page_status": structure_result.get('page_status', {}),
"url_structure": url_structure,
"freshness": freshness,
"robots_txt": robots_result,
"crawl_budget": budget_result,
"timestamp": datetime.utcnow().isoformat()
}
if result.get('success'):
await self._update_persona_augmentation(user_id, website_url, result, db)
elif task_type == 'site_health':
# Phase 1: Check site health (freshness, velocity)
# Site health: freshness, velocity, URL structure
result = await self.advertools_service.analyze_sitemap(effective_url)
if result.get('success'):
@@ -157,7 +197,8 @@ class AdvertoolsExecutor:
async def _update_persona_augmentation(self, user_id: str, website_url: str, audit_result: Dict[str, Any], db: Session):
"""
Updates the user's Brand Persona with discovered themes from the content audit.
Updates the user's Brand Persona with discovered themes, site structure,
link health, and redirect data from the content audit.
"""
try:
session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
@@ -170,18 +211,40 @@ class AdvertoolsExecutor:
self.logger.warning(f"No website analysis found for user {user_id}")
return
# Update brand_analysis with augmented themes
current_brand = analysis.brand_analysis or {}
# Add or update the 'augmented_themes' field
# Core themes
current_brand['augmented_themes'] = audit_result.get('themes', [])
# Link health
current_brand['link_health'] = audit_result.get('link_health', {})
# Redirect audit
current_brand['redirect_audit'] = audit_result.get('redirect_audit', {})
# Image SEO
current_brand['image_seo'] = audit_result.get('image_seo', {})
# Page status distribution
current_brand['page_status'] = audit_result.get('page_status', {})
# URL structure analysis
current_brand['url_structure'] = audit_result.get('url_structure', {})
# Freshness
current_brand['freshness'] = audit_result.get('freshness', {})
# Robots.txt compliance
current_brand['robots_txt'] = audit_result.get('robots_txt', {})
# Crawl budget analysis
current_brand['crawl_budget'] = audit_result.get('crawl_budget', {})
current_brand['last_advertools_audit'] = datetime.utcnow().isoformat()
# Force SQLAlchemy to detect change in JSON field
from sqlalchemy.orm.attributes import flag_modified
flag_modified(analysis, "brand_analysis")
# Also update content_strategy_insights if relevant
if 'avg_word_count' in audit_result:
current_strategy = analysis.content_strategy_insights or {}
current_strategy['avg_content_length'] = audit_result['avg_word_count']
@@ -196,7 +259,8 @@ class AdvertoolsExecutor:
async def _update_site_health_metrics(self, user_id: str, website_url: str, health_result: Dict[str, Any], db: Session):
"""
Updates the WebsiteAnalysis with site health metrics (velocity, freshness).
Updates the WebsiteAnalysis with site health metrics (velocity, freshness,
URL structure analysis, freshness score).
"""
try:
session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
@@ -207,7 +271,6 @@ class AdvertoolsExecutor:
if not analysis:
return
# Update seo_audit with health metrics
current_seo = analysis.seo_audit or {}
metrics = health_result.get('metrics', {})
@@ -216,7 +279,11 @@ class AdvertoolsExecutor:
"publishing_velocity": metrics.get('publishing_velocity'),
"stale_content_count": metrics.get('stale_content_count'),
"stale_content_percentage": metrics.get('stale_content_percentage'),
"top_pillars": metrics.get('top_pillars')
"freshness_score": metrics.get('freshness_score'),
"publishing_recency": metrics.get('publishing_recency'),
"publishing_trend": metrics.get('publishing_trend'),
"top_pillars": metrics.get('top_pillars'),
"url_structure": metrics.get('url_structure', {})
}
current_seo['last_advertools_health_check'] = datetime.utcnow().isoformat()

View File

@@ -1,12 +1,18 @@
import advertools as adv
import pandas as pd
import asyncio
from typing import Dict, Any, List, Optional
from typing import Dict, Any, List, Optional, Tuple
from datetime import datetime, timedelta
from loguru import logger
import json
import os
import tempfile
from urllib.parse import urlparse
from collections import Counter
import urllib.request
import urllib.error
import socket
import re
class AdvertoolsService:
"""
@@ -19,51 +25,58 @@ class AdvertoolsService:
async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
"""
Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.
Analyzes a website's sitemap to extract metrics on publishing velocity, freshness,
URL structure patterns, and topic distribution.
"""
try:
self.logger.info(f"Analyzing sitemap: {sitemap_url}")
# advertools sitemap_to_df is blocking, run in executor
loop = asyncio.get_event_loop()
df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
if df is None or df.empty:
return {"success": False, "error": "Sitemap is empty or could not be parsed."}
# Convert lastmod to datetime
if 'lastmod' in df.columns:
df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
total_urls = len(df)
# Handle potential empty datetime columns
if 'lastmod' in df.columns and not df['lastmod'].isna().all():
now = datetime.now(df['lastmod'].dt.tz)
thirty_days_ago = now - timedelta(days=30)
recent_urls = df[df['lastmod'] > thirty_days_ago]
six_months_ago = now - timedelta(days=180)
stale_urls = df[df['lastmod'] < six_months_ago]
publishing_velocity = len(recent_urls) / 4.0 # URLs per week
stale_count = len(stale_urls)
else:
publishing_velocity = 0
stale_count = 0
# --- Content Freshness Scoring ---
freshness = self._compute_freshness(df)
# Enhanced Content Pillars (Top folder patterns - 3 levels deep)
def extract_hierarchy(url: str):
try:
parts = urlparse(url).path.strip('/').split('/')
if not parts or not parts[0]: return "home"
return "/".join(parts[:2]) # Capture top 2 segments
except:
return "other"
# --- URL Structure Analysis ---
url_structure = {}
if 'loc' in df.columns:
url_structure = await self._analyze_url_structure(df['loc'].tolist())
# --- Content Pillars via url_to_df ---
pillars = {}
url_df = None
try:
url_df = adv.url_to_df(df['loc'])
if url_df is not None and not url_df.empty:
dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
if dir_cols:
pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
for col in dir_cols[1:3]:
mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan')
pillar_series = pillar_series + "/" + url_df[col].where(mask, "")
pillars = pillar_series.value_counts().head(15).to_dict()
except Exception:
fallback_pillars = {}
if 'loc' in df.columns:
def extract_hierarchy(url: str):
try:
parts = urlparse(url).path.strip('/').split('/')
if not parts or not parts[0]: return "home"
return "/".join(parts[:2])
except:
return "other"
fallback_pillars = df['loc'].apply(extract_hierarchy).value_counts().head(15).to_dict()
pillars = fallback_pillars
df['pillar'] = df['loc'].apply(extract_hierarchy)
pillars = df['pillar'].value_counts().head(15).to_dict()
# Return a sample of URLs for auditing (top 15 most recent if available)
# Sample URLs for auditing (top 15 most recent)
audit_urls = []
if 'lastmod' in df.columns and not df['lastmod'].isna().all():
audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
@@ -74,10 +87,14 @@ class AdvertoolsService:
"success": True,
"metrics": {
"total_urls": total_urls,
"publishing_velocity": round(publishing_velocity, 2),
"stale_content_count": stale_count,
"stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
"publishing_velocity": freshness.get("publishing_velocity"),
"stale_content_count": freshness.get("stale_count"),
"stale_content_percentage": freshness.get("stale_percentage"),
"freshness_score": freshness.get("freshness_score"),
"publishing_recency": freshness.get("publishing_recency"),
"publishing_trend": freshness.get("publishing_trend"),
"top_pillars": pillars,
"url_structure": url_structure,
"audit_sample_urls": audit_urls
},
"timestamp": datetime.utcnow().isoformat()
@@ -86,6 +103,146 @@ class AdvertoolsService:
self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
return {"success": False, "error": str(e)}
def _compute_freshness(self, df: pd.DataFrame) -> Dict[str, Any]:
    """
    Summarise how recently the sitemap's URLs were modified.

    Reads only the ``lastmod`` column of ``df``. Rows whose ``lastmod`` is
    missing never match a time window, but they still count toward the total
    used for the percentage and ratio calculations.

    Returns a dict with:
        publishing_velocity: URLs modified in the last 90 days, per week (90d / 13).
        stale_count / stale_percentage: URLs not modified for more than 180 days.
        freshness_score: 0-100 blend of non-stale share (50), 30-day share (30)
            and velocity capped at 10 URLs/week (20).
        publishing_recency: counts for the last 24h / 7d / 30d / 90d.
        publishing_trend: "increasing" / "decreasing" / "stable" (last 30 days vs
            the 30 days before, with a 10% tolerance), or "unknown" without dates.
    """
    # Neutral answer used whenever there is no usable modification date.
    if 'lastmod' not in df.columns or df['lastmod'].isna().all():
        return {
            "publishing_velocity": 0,
            "stale_count": 0,
            "stale_percentage": 0,
            "freshness_score": 0,
            "publishing_recency": {},
            "publishing_trend": "unknown"
        }

    stamps = df['lastmod']
    # "now" is taken in the column's own timezone so the comparisons below are valid.
    now = datetime.now(stamps.dropna().dt.tz)

    def cutoff(days: int):
        return now - timedelta(days=days)

    def modified_since(days: int) -> int:
        return int((stamps > cutoff(days)).sum())

    row_total = len(df)
    in_30d = modified_since(30)
    in_90d = modified_since(90)

    stale_total = int((stamps < cutoff(180)).sum())
    if row_total > 0:
        stale_share = round((stale_total / row_total) * 100, 2)
    else:
        stale_share = 0

    # 90 days is treated as 13 weeks.
    weekly_rate = round(in_90d / 13.0, 2) if in_90d else 0

    # Weighted blend: 50 non-stale share, 30 recent share, 20 capped velocity.
    non_stale_ratio = 1.0 - (stale_share / 100.0)
    recency_ratio = in_30d / max(row_total, 1)
    velocity_score = min(weekly_rate / 10.0, 1.0)
    score = round((non_stale_ratio * 50 + recency_ratio * 30 + velocity_score * 20), 1)

    windows = {
        "last_24h": modified_since(1),
        "last_7d": modified_since(7),
        "last_30d": in_30d,
        "last_90d": in_90d,
    }

    # Trend: the last 30 days against the 30 days immediately before them.
    previous_30d = int(((stamps <= cutoff(30)) & (stamps > cutoff(60))).sum())
    if in_30d > previous_30d * 1.1:
        trend = "increasing"
    elif in_30d < previous_30d * 0.9:
        trend = "decreasing"
    else:
        trend = "stable"

    return {
        "publishing_velocity": weekly_rate,
        "stale_count": stale_total,
        "stale_percentage": stale_share,
        "freshness_score": score,
        "publishing_recency": windows,
        "publishing_trend": trend
    }
async def _analyze_url_structure(self, urls: List[str]) -> Dict[str, Any]:
    """
    Analyze URL patterns for parameter bloat, directory depth, and host/protocol mix.

    The URLs are split into components with ``adv.url_to_df`` (run in the default
    executor because it is a blocking call) and summarised into:

        total_urls_analyzed: number of rows in the parsed table.
        parameter_usage: count/percentage of URLs with a query string, plus the
            ten most common parameter names (counted over distinct query strings).
        directory_depth: average, maximum and distribution of leading non-empty
            ``dir_*`` columns per URL.
        protocols: URL count per scheme.
        subdomains: most frequent host and the number of distinct hosts.

    Best-effort: any failure is logged as a warning and ``{}`` is returned, which
    is also the result for an empty ``urls`` list.
    """
    if not urls:
        return {}
    try:
        from collections import Counter

        loop = asyncio.get_event_loop()
        url_df = await loop.run_in_executor(None, lambda: adv.url_to_df(urls))

        if url_df is None or url_df.empty:
            return {}

        total = len(url_df)

        # Query param analysis. The column is looked up defensively so a missing
        # 'query' column degrades to "no parameters" instead of discarding the
        # whole analysis through the except branch.
        if 'query' in url_df.columns:
            has_query = url_df['query'].notna() & (url_df['query'] != '')
        else:
            has_query = pd.Series(False, index=url_df.index)
        param_count = int(has_query.sum())
        param_percentage = round((param_count / total) * 100, 2) if total > 0 else 0

        # Parameter names, counted once per distinct query string.
        param_frequency = {}
        if param_count > 0:
            all_params = []
            for q in url_df.loc[has_query, 'query'].dropna().unique():
                for pair in str(q).split('&'):
                    key = pair.partition('=')[0]
                    if key:  # "a=1&&b=2" yields an empty pair; do not count it
                        all_params.append(key)
            param_frequency = dict(Counter(all_params).most_common(10))

        # Directory depth: index of the first empty dir_* column for each URL.
        dir_cols = [c for c in url_df.columns if c.startswith('dir_')]

        def count_depth(row):
            for i, col in enumerate(dir_cols):
                val = row[col]
                if pd.isna(val) or str(val) == 'nan' or str(val).strip() == '':
                    return i
            return len(dir_cols)

        depths = url_df.apply(count_depth, axis=1)
        avg_depth = float(round(depths.mean(), 1)) if not depths.empty else 0
        max_depth = int(depths.max()) if not depths.empty else 0
        depth_distribution = depths.value_counts().sort_index().head(10).to_dict()
        depth_distribution = {str(k): int(v) for k, v in depth_distribution.items()}

        # Protocol consistency
        schemes = url_df['scheme'].value_counts().to_dict() if 'scheme' in url_df.columns else {}

        # Host analysis. value_counts() is indexed by host, so the number of
        # distinct hosts is its length; calling nunique() on it would count
        # distinct *frequencies* instead (two hosts with 5 URLs each -> 1).
        netloc_counts = url_df['netloc'].value_counts() if 'netloc' in url_df.columns else None
        unique_subdomains = int(len(netloc_counts)) if netloc_counts is not None else 0
        primary_domain = netloc_counts.index[0] if netloc_counts is not None and not netloc_counts.empty else ""

        return {
            "total_urls_analyzed": total,
            "parameter_usage": {
                "urls_with_params": param_count,
                "percentage_with_params": param_percentage,
                "top_parameters": param_frequency
            },
            "directory_depth": {
                "average_depth": avg_depth,
                "max_depth": max_depth,
                "distribution": depth_distribution
            },
            "protocols": {str(k): int(v) for k, v in schemes.items()},
            "subdomains": {
                "primary": primary_domain,
                "unique_count": unique_subdomains
            }
        }
    except Exception as e:
        self.logger.warning(f"URL structure analysis failed: {e}")
        return {}
async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
"""
Performs a shallow crawl and theme analysis using word frequency.
@@ -153,6 +310,512 @@ class AdvertoolsService:
except Exception as e:
self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
async def analyze_site_structure(self, url_list: List[str], site_domain: Optional[str] = None) -> Dict[str, Any]:
    """
    Crawl a set of pages with link following and summarise site-structure signals.

    The crawl (``adv.crawl``, run in the default executor) is capped at 50 pages
    and depth 3 and written to a temporary ``.jsonl`` file that is always removed
    afterwards. The crawl table is then summarised with ``adv.crawlytics``.

    Args:
        url_list: Seed URLs for the crawl. An empty list returns a failure
            result without starting a crawl.
        site_domain: When given, the crawl is restricted to this domain and
            links containing it are classified as internal.

    Returns:
        On success ``{"success": True, "page_count", "link_health",
        "redirect_audit", "image_seo", "timestamp"}`` plus ``page_status``,
        ``missing_titles`` and ``missing_descriptions`` when the crawl table has
        the matching columns. Each section is best-effort: a failing section is
        reported as ``{"error": ...}`` and does not abort the others. On a crawl
        failure: ``{"success": False, "error": ...}``.
    """
    import re
    from collections import Counter

    if not url_list:
        return {"success": False, "error": "No URLs provided for site structure analysis."}

    temp_file = None
    try:
        self.logger.info(f"Analyzing site structure for {len(url_list)} URLs, domain={site_domain}")

        # Only the path is needed; the crawler writes into the file itself.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name

        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=url_list,
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[site_domain] if site_domain else None,
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 50,
                'DOWNLOAD_TIMEOUT': 30,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
                'DEPTH_LIMIT': 3,
            }
        ))

        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Site structure crawl produced no output."}

        crawl_df = pd.read_json(temp_file, lines=True)
        page_count = len(crawl_df)

        result = {"success": True, "page_count": page_count}

        # --- Link Health via crawlytics ---
        try:
            # The domain is used as a regular expression, so it is escaped:
            # an unescaped "example.com" would also match "exampleXcom".
            internal_regex = re.escape(site_domain) if site_domain else None
            link_df = adv.crawlytics.links(crawl_df, internal_url_regex=internal_regex)
            if link_df is not None and not link_df.empty:
                total_links = len(link_df)
                internal_links = int(link_df['internal'].sum()) if 'internal' in link_df.columns else 0
                external_links = total_links - internal_links
                nofollow_links = int(link_df['nofollow'].sum()) if 'nofollow' in link_df.columns else 0

                # Links per page: rows are grouped by index level 0.
                # NOTE(review): assumes links() keeps the crawl row index for
                # every link of a page - confirm for the installed advertools.
                links_per_page = link_df.groupby(level=0).size()
                avg_links_per_page = float(round(links_per_page.mean(), 1)) if not links_per_page.empty else 0

                # Most common anchor words (internal links only, words > 2 chars)
                anchor_texts = []
                if 'text' in link_df.columns and 'internal' in link_df.columns:
                    internal_anchors = link_df.loc[link_df['internal'].eq(True), 'text'].dropna()
                    for t in internal_anchors:
                        if isinstance(t, str) and t.strip():
                            anchor_texts.extend(w for w in t.split() if len(w) > 2)

                top_anchors = dict(Counter(anchor_texts).most_common(15)) if anchor_texts else {}

                result["link_health"] = {
                    "total_links_found": total_links,
                    "internal_link_count": internal_links,
                    "external_link_count": external_links,
                    "internal_link_percentage": round((internal_links / total_links) * 100, 1) if total_links > 0 else 0,
                    "nofollow_link_count": nofollow_links,
                    "avg_links_per_page": avg_links_per_page,
                    "top_anchor_words": top_anchors
                }
            else:
                result["link_health"] = {"error": "No links found in crawl data"}
        except Exception as e:
            self.logger.warning(f"Link analysis failed: {e}")
            result["link_health"] = {"error": str(e)}

        # --- Redirect Chain Audit via crawlytics ---
        try:
            redirect_df = adv.crawlytics.redirects(crawl_df)
            if redirect_df is not None and not redirect_df.empty:
                has_times = 'redirect_times' in redirect_df.columns
                redirect_statuses = redirect_df['status'].value_counts().to_dict() if 'status' in redirect_df.columns else {}
                multi_hop = redirect_df[redirect_df['redirect_times'] > 1] if has_times else redirect_df.iloc[0:0]

                # Chains are counted by distinct index value, not by distinct
                # 'redirect_times' values (which only says how many different
                # chain lengths exist) or by row count.
                # NOTE(review): assumes the redirects table has one row per URL
                # in a chain, with all rows of a chain sharing the crawl row
                # index - confirm for the installed advertools. Under that
                # layout "total_redirects" below is a row count and
                # "affected_pages" holds crawl row indices, as before.
                result["redirect_audit"] = {
                    "total_redirects": int(len(redirect_df)),
                    "unique_chains": int(redirect_df.index.nunique()),
                    "status_distribution": {str(k): int(v) for k, v in redirect_statuses.items()},
                    "multi_hop_chains": int(multi_hop.index.nunique()),
                    "affected_pages": multi_hop.index.unique().tolist() if not multi_hop.empty else []
                }
            else:
                result["redirect_audit"] = {"total_redirects": 0, "note": "No redirects detected"}
        except Exception as e:
            self.logger.warning(f"Redirect analysis failed: {e}")
            result["redirect_audit"] = {"error": str(e)}

        # --- Image SEO overview via crawlytics ---
        # The key is always set, mirroring link_health / redirect_audit, so
        # consumers can tell "no images" and "analysis failed" apart.
        try:
            img_df = adv.crawlytics.images(crawl_df)
            if img_df is not None and not img_df.empty:
                total_images = len(img_df)
                missing_alt = int(img_df['img_alt'].isna().sum()) if 'img_alt' in img_df.columns else 0
                alt_coverage = round(((total_images - missing_alt) / total_images) * 100, 1) if total_images > 0 else 0
                result["image_seo"] = {
                    "total_images": total_images,
                    "missing_alt_count": missing_alt,
                    "alt_coverage_percentage": alt_coverage
                }
            else:
                result["image_seo"] = {"total_images": 0, "note": "No images detected"}
        except Exception as e:
            self.logger.warning(f"Image analysis failed: {e}")
            result["image_seo"] = {"error": str(e)}

        # --- Page-level metrics ---
        if 'status' in crawl_df.columns:
            status_dist = crawl_df['status'].value_counts().to_dict()
            result["page_status"] = {str(k): int(v) for k, v in status_dist.items()}

        if 'title' in crawl_df.columns:
            result["missing_titles"] = int(crawl_df['title'].isna().sum())

        if 'meta_desc' in crawl_df.columns:
            result["missing_descriptions"] = int(crawl_df['meta_desc'].isna().sum())

        result["timestamp"] = datetime.utcnow().isoformat()
        return result

    except Exception as e:
        self.logger.error(f"Failed to analyze site structure: {str(e)}")
        return {"success": False, "error": str(e)}
    finally:
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
async def analyze_robots_txt(self, website_url: str) -> Dict[str, Any]:
    """
    Fetch and analyze robots.txt for compliance issues.

    The file is loaded with ``adv.robotstxt_to_df``; when that fails or returns
    a table without recognisable rule/value columns, the manual parser
    (``self._parse_robots_txt_manual``) is used instead. Both calls are blocking
    and run in the default executor.

    Two table layouts are understood:
      * rule in ``rule`` or ``directive``; value in ``value``,
        ``directive_value`` or ``content``;
      * the user agent either in a ``user_agent`` column, or given by the most
        recent ``User-agent`` row above a rule.

    Args:
        website_url: Any URL of the site; only scheme and host are used. A bare
            domain such as ``example.com`` is assumed to be HTTPS.

    Returns:
        A dict with ``success``, ``url``, ``accessible``, ``total_directives``,
        ``user_agents_found``, ``has_sitemap_directive``, ``sitemap_urls``,
        ``has_crawl_delay``, ``disallow_rules``, ``issues`` and
        ``compliance_score`` (100 minus 30/15/5 per critical/warning/info issue,
        floored at 0). On failure ``success`` is False and ``error`` is set.
    """
    # Pre-set so the error payload can always report a URL.
    robots_url = website_url
    try:
        self.logger.info(f"Analyzing robots.txt for {website_url}")

        parsed = urlparse(website_url)
        if not parsed.scheme or not parsed.netloc:
            # "example.com" parses with an empty host; without this the URL
            # below would become ":///robots.txt".
            parsed = urlparse(f"https://{website_url.lstrip('/')}")
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

        result = {
            "success": True,
            "url": robots_url,
            "accessible": True,
            "total_directives": 0,
            "user_agents_found": [],
            "has_sitemap_directive": False,
            "sitemap_urls": [],
            "has_crawl_delay": False,
            "disallow_rules": [],
            "issues": [],
            "compliance_score": 100,
        }

        def pick_columns(frame):
            """Return (rule_column, value_column); (None, None) if unusable."""
            if frame is None or frame.empty:
                return None, None
            rule = next((c for c in ('rule', 'directive') if c in frame.columns), None)
            value = next((c for c in ('value', 'directive_value', 'content') if c in frame.columns), None)
            if rule is None or value is None:
                return None, None
            return rule, value

        loop = asyncio.get_event_loop()
        robots_df = None
        try:
            robots_df = await loop.run_in_executor(
                None, lambda: adv.robotstxt_to_df(robots_url)
            )
            if robots_df is None or robots_df.empty:
                raise ValueError("Empty result from robotstxt_to_df")
        except Exception as adv_err:
            self.logger.warning(f"adv.robotstxt_to_df failed, using manual fallback: {adv_err}")
            robots_df = None

        rule_col, value_col = pick_columns(robots_df)
        if rule_col is None:
            # Either the call failed or the table cannot be interpreted.
            robots_df = await loop.run_in_executor(
                None, lambda: self._parse_robots_txt_manual(robots_url)
            )
            rule_col, value_col = pick_columns(robots_df)

        if rule_col is None:
            result["success"] = False
            result["error"] = "Could not fetch or parse robots.txt"
            result["accessible"] = False
            return result

        result["total_directives"] = len(robots_df)

        rules = robots_df[rule_col].astype(str).str.strip().str.lower().tolist()
        # Missing values become "" so they are not reported as the path "nan".
        values = robots_df[value_col].fillna('').astype(str).str.strip().tolist()
        if 'user_agent' in robots_df.columns:
            row_agents = robots_df['user_agent'].fillna('*').astype(str).tolist()
        else:
            row_agents = None

        seen_agents = []
        current_ua = '*'
        has_disallow_all = False
        for position, (rule, value) in enumerate(zip(rules, values)):
            if rule == 'user-agent':
                current_ua = value or '*'
                if current_ua not in seen_agents:
                    seen_agents.append(current_ua)
                continue
            agent = row_agents[position] if row_agents is not None else current_ua
            if rule == 'disallow':
                if value:
                    result["disallow_rules"].append({"user_agent": agent, "path": value})
                # Only a block on the wildcard agent shuts out every crawler;
                # "Disallow: /" for one named bot is not a site-wide block.
                if value == '/' and agent.strip() == '*':
                    has_disallow_all = True
            elif rule == 'sitemap':
                result["has_sitemap_directive"] = True
                if value and value not in result["sitemap_urls"]:
                    result["sitemap_urls"].append(value)
            elif rule == 'crawl-delay':
                result["has_crawl_delay"] = True

        if row_agents is not None:
            result["user_agents_found"] = robots_df['user_agent'].dropna().unique().tolist()
        else:
            result["user_agents_found"] = seen_agents

        if has_disallow_all:
            result["issues"].append({
                "severity": "critical", "code": "DISALLOW_ALL",
                "detail": "robots.txt disallows all user agents from all paths (Disallow: /)"
            })
        if not result["has_sitemap_directive"]:
            result["issues"].append({
                "severity": "warning", "code": "NO_SITEMAP",
                "detail": "No Sitemap directive found — search engines may miss pages"
            })
        if not result["has_crawl_delay"]:
            result["issues"].append({
                "severity": "info", "code": "NO_CRAWL_DELAY",
                "detail": "No Crawl-delay directive set — not critical for most sites"
            })

        penalties = {"critical": 30, "warning": 15, "info": 5}
        for issue in result["issues"]:
            result["compliance_score"] -= penalties.get(issue["severity"], 0)
        result["compliance_score"] = max(result["compliance_score"], 0)

        return result

    except Exception as e:
        self.logger.error(f"Robots.txt analysis failed: {e}")
        return {"success": False, "error": str(e), "url": robots_url}
def _parse_robots_txt_manual(self, url: str) -> pd.DataFrame:
    """
    Fallback: manually fetch and parse robots.txt.

    Downloads ``url`` (15 s timeout, browser-like User-Agent header) and turns
    every ``directive: value`` line into one row.

    Returns:
        A DataFrame with columns ``user_agent``, ``rule`` and ``value``; empty
        when the download fails or no directive is found. ``User-agent`` lines
        are not rows themselves: they set the ``user_agent`` of the rows that
        follow (default ``"*"``). Consecutive ``User-agent`` lines are not
        merged into a group; only the last one is attached to the rules.

    Failures are logged as warnings and never raised.
    """
    records = []
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as resp:
            # utf-8-sig drops a leading byte-order mark, which would otherwise
            # become part of the first directive name.
            content = resp.read().decode("utf-8-sig", errors="replace")

        current_ua = "*"
        for raw_line in content.splitlines():
            # "#" starts a comment that runs to the end of the line; this
            # covers both full-line and trailing comments.
            line = raw_line.split("#", 1)[0].strip()
            if not line:
                continue

            directive, sep, value = line.partition(":")
            directive = directive.strip()
            value = value.strip()

            if directive.lower() == "user-agent":
                current_ua = value or "*"
                continue

            # Lines without a colon or without a directive name carry no rule.
            if not sep or not directive:
                continue

            records.append({
                "user_agent": current_ua,
                "rule": directive,
                "value": value,
            })
    except Exception as e:
        self.logger.warning(f"Manual robots.txt fetch failed: {e}")

    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records)
async def analyze_crawl_budget(self, sitemap_url: str, site_domain: str) -> Dict[str, Any]:
    """
    Analyze crawl budget by comparing sitemap inventory against actual crawl results.
    Estimates budget utilization, waste from redirects/errors, and optimization score.

    The sitemap is loaded with ``adv.sitemap_to_df`` and a bounded sample crawl
    (at most 30 pages, depth 2) is run with ``adv.crawl`` into a temporary
    JSON-lines file that is always removed afterwards.

    Args:
        sitemap_url: URL of the sitemap to inventory.
        site_domain: Bare domain ("example.com") or a full URL of the site.

    Returns:
        On success, a dict with ``success=True`` and the keys
        ``sitemap_total_urls``, ``pages_crawled``, ``crawl_coverage_percentage``,
        ``status_distribution``, ``wasted_crawl_requests``, ``waste_percentage``,
        ``depth_distribution``, ``urls_with_parameters`` and
        ``optimization_score``. On any failure,
        ``{"success": False, "error": <message>}``.
    """
    from urllib.parse import urlparse  # local: only used to normalise site_domain

    temp_file = None
    try:
        self.logger.info(f"Analyzing crawl budget for {site_domain}")
        loop = asyncio.get_event_loop()
        sitemap_df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
        sitemap_total = len(sitemap_df) if sitemap_df is not None and not sitemap_df.empty else 0

        if site_domain.startswith("http"):
            start_url = site_domain
            # allowed_domains expects bare host names, not URLs; passing the
            # full URL through would not match any followed link.
            crawl_domain = urlparse(site_domain).hostname or site_domain
        else:
            start_url = f"https://{site_domain}"
            crawl_domain = site_domain

        # Reserve a unique path; the crawler writes to it after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=[start_url],
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[crawl_domain],
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 30,
                'DOWNLOAD_TIMEOUT': 15,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
                'DEPTH_LIMIT': 2,
            }
        ))
        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Crawl produced no output"}

        crawl_df = pd.read_json(temp_file, lines=True)
        crawled_count = len(crawl_df)

        status_dist = {}
        wasted = 0
        if 'status' in crawl_df.columns:
            # Coerce before counting: one missing status turns the column into
            # floats, and keys like "200.0" cannot be parsed back with int().
            codes = pd.to_numeric(crawl_df['status'], errors='coerce').dropna().astype(int)
            counts = codes.value_counts()
            status_dist = {str(code): int(n) for code, n in counts.items()}
            # Anything outside 2xx (redirects, client/server errors) counts as waste.
            non_2xx = (counts.index < 200) | (counts.index >= 300)
            wasted = int(counts[non_2xx].sum())

        # NOTE(review): the ratio exceeds 1 when more pages are crawled than the
        # sitemap lists (e.g. an empty sitemap); confirm whether it should be capped.
        budget_usage_ratio = round(crawled_count / max(sitemap_total, 1), 3)
        waste_ratio = round(wasted / max(crawled_count, 1), 3)

        depth_dist = {}
        if 'depth' in crawl_df.columns:
            raw = crawl_df['depth'].value_counts().sort_index().to_dict()
            depth_dist = {str(k): int(v) for k, v in raw.items()}

        param_count = 0
        url_col = next((c for c in ('url', 'response_url') if c in crawl_df.columns), None)
        if url_col:
            # regex=False: a bare '?' is an invalid regular expression and would
            # make str.contains raise instead of counting query strings.
            param_count = int(crawl_df[url_col].astype(str).str.contains('?', regex=False).sum())

        optimization_score = max(0, round(100 - (waste_ratio * 100) - (budget_usage_ratio * 20), 1))
        return {
            "success": True,
            "sitemap_total_urls": sitemap_total,
            "pages_crawled": crawled_count,
            "crawl_coverage_percentage": round(budget_usage_ratio * 100, 1),
            "status_distribution": status_dist,
            "wasted_crawl_requests": int(wasted),
            "waste_percentage": round(waste_ratio * 100, 1),
            "depth_distribution": depth_dist,
            "urls_with_parameters": int(param_count),
            "optimization_score": optimization_score,
        }
    except Exception as e:
        self.logger.error(f"Crawl budget analysis failed: {e}")
        return {"success": False, "error": str(e)}
    finally:
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the result.
                pass
async def sitemap_compare(self, sitemap_a: str, sitemap_b: str) -> Dict[str, Any]:
    """
    Compare two sitemaps for competitive content gap analysis.
    Analyzes URL count, freshness, directory pillars, and identifies
    patterns unique to each sitemap.

    Args:
        sitemap_a: URL of the first sitemap.
        sitemap_b: URL of the second sitemap.

    Returns:
        On success, a dict with ``success=True``, per-sitemap URL totals,
        ``url_count_diff``, ``ratio``, ``pillars_a``/``pillars_b`` (top 20
        leading directory paths with URL counts), ``shared_pillars``,
        ``unique_to_a``, ``unique_to_b``, ``freshness_comparison`` and
        ``overlap_score`` (shared pillars as a percentage of all pillars).
        If either sitemap is empty, only the totals are filled in.
        On failure, ``{"success": False, "error": <message>}``.
    """
    try:
        self.logger.info(f"Comparing sitemaps: {sitemap_a} vs {sitemap_b}")
        loop = asyncio.get_event_loop()
        df_a = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_a))
        df_b = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_b))
        total_a = len(df_a) if df_a is not None and not df_a.empty else 0
        total_b = len(df_b) if df_b is not None and not df_b.empty else 0
        result = {
            "success": True,
            "sitemap_a": {"url": sitemap_a, "total_urls": total_a},
            "sitemap_b": {"url": sitemap_b, "total_urls": total_b},
            "url_count_diff": total_a - total_b,
            "ratio": round(total_a / max(total_b, 1), 2),
            "pillars_a": {},
            "pillars_b": {},
            "shared_pillars": [],
            "unique_to_a": [],
            "unique_to_b": [],
            "freshness_comparison": {},
            "overlap_score": 0,
        }
        if total_a == 0 or total_b == 0:
            return result

        def extract_pillars(df: pd.DataFrame) -> dict:
            """Count URLs per leading directory path, keeping the 20 most common."""
            if 'loc' not in df.columns:
                # Nothing to group on; both strategies below need the URL column.
                return {}
            pillars = {}
            try:
                url_df = adv.url_to_df(df['loc'])
                if url_df is not None and not url_df.empty:
                    dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
                    if dir_cols:
                        pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
                        for col in dir_cols[1:3]:
                            mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan')
                            pillar_series = pillar_series + "/" + url_df[col].where(mask, "").astype(str)
                        # Missing deeper levels leave dangling separators
                        # ("blog//"); trim them so keys match the fallback form.
                        pillar_series = pillar_series.str.rstrip("/")
                        pillars = pillar_series.value_counts().head(20).to_dict()
            except Exception as exc:
                self.logger.warning(f"Pillar extraction via url_to_df failed, using fallback: {exc}")
            if not pillars:
                # Fallback: group by the first path segment only.
                seen = {}
                for url in df['loc'].dropna():
                    parts = urlparse(url).path.strip('/').split('/')
                    key = parts[0] if parts and parts[0] else "home"
                    seen[key] = seen.get(key, 0) + 1
                pillars = dict(sorted(seen.items(), key=lambda x: x[1], reverse=True)[:20])
            return pillars

        def compute_freshness_stats(df: pd.DataFrame) -> dict:
            """Summarise how many URLs carry a parseable lastmod and how many are recent."""
            stats = {"has_lastmod": False, "recent_30d": 0, "total_with_dates": 0}
            if 'lastmod' in df.columns:
                lm = pd.to_datetime(df['lastmod'], errors='coerce', utc=True).dropna()
                if not lm.empty:
                    stats["has_lastmod"] = True
                    stats["total_with_dates"] = int(len(lm))
                    cutoff = datetime.now(lm.dt.tz) - timedelta(days=30)
                    stats["recent_30d"] = int((lm > cutoff).sum())
            return stats

        pillars_a = extract_pillars(df_a)
        pillars_b = extract_pillars(df_b)
        result["pillars_a"] = pillars_a
        result["pillars_b"] = pillars_b

        set_a = set(pillars_a)
        set_b = set(pillars_b)
        shared = set_a & set_b
        result["shared_pillars"] = sorted(shared)
        result["unique_to_a"] = sorted(set_a - set_b)
        result["unique_to_b"] = sorted(set_b - set_a)
        # max(..., 1) avoids dividing by zero when neither side produced pillars.
        total_keys = max(len(set_a | set_b), 1)
        result["overlap_score"] = round((len(shared) / total_keys) * 100, 1)

        result["freshness_comparison"] = {
            "a": compute_freshness_stats(df_a),
            "b": compute_freshness_stats(df_b),
        }
        return result
    except Exception as e:
        self.logger.error(f"Sitemap comparison failed: {e}")
        return {"success": False, "error": str(e)}
async def compare_crawl_results(self, result_a: Dict[str, Any], result_b: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compare two crawl analysis result dicts to surface changes over time.
    Useful for tracking SEO improvements between scheduled executions.

    Args:
        result_a: The earlier (baseline) result dict.
        result_b: The later result dict; changes are reported as ``b - a``.

    Returns:
        A dict with ``success=True`` and:
        ``page_count_change``; ``status_distribution_changes`` (status code as
        string -> count delta, non-zero only); ``link_health_changes`` and
        ``redirect_changes`` (``"link_<key>"`` / ``"redirect_<key>"`` -> delta
        rounded to 2 decimals, numeric fields only); ``new_issues`` and
        ``resolved_issues``, which are currently always empty lists.
        On failure, ``{"success": False, "error": <message>}``.
    """
    try:
        def _section(result: Any, key: str) -> dict:
            """Return result[key] when it is a dict, otherwise an empty dict."""
            value = result.get(key) if isinstance(result, dict) else None
            return value if isinstance(value, dict) else {}

        def _safe_diff(d_a: dict, d_b: dict, prefix: str) -> dict:
            """Delta of every numeric field present in either dict."""
            changes = {}
            for k in d_a.keys() | d_b.keys():
                va = d_a.get(k, 0)
                vb = d_b.get(k, 0)
                if isinstance(va, (int, float)) and isinstance(vb, (int, float)):
                    change = round(vb - va, 2)
                    if change != 0:
                        changes[f"{prefix}_{k}"] = change
            return changes

        diff = {
            "success": True,
            "page_count_change": 0,
            "status_distribution_changes": {},
            "link_health_changes": {},
            "redirect_changes": {},
            "new_issues": [],
            "resolved_issues": [],
        }

        pc_a = (result_a.get("page_count") if isinstance(result_a, dict) else 0) or 0
        pc_b = (result_b.get("page_count") if isinstance(result_b, dict) else 0) or 0
        diff["page_count_change"] = pc_b - pc_a

        # Status codes may be ints in a fresh result and strings after a JSON
        # round trip; normalise to str so 200 and "200" are the same code and
        # sorting never compares int with str.
        sd_a = {str(k): v for k, v in _section(result_a, "page_status").items()}
        sd_b = {str(k): v for k, v in _section(result_b, "page_status").items()}
        for code in sorted(sd_a.keys() | sd_b.keys()):
            va = sd_a.get(code, 0)
            vb = sd_b.get(code, 0)
            if not (isinstance(va, (int, float)) and isinstance(vb, (int, float))):
                continue  # skip malformed counts instead of failing the whole diff
            change = vb - va
            if change != 0:
                diff["status_distribution_changes"][code] = change

        diff["link_health_changes"] = _safe_diff(
            _section(result_a, "link_health"), _section(result_b, "link_health"), "link"
        )
        diff["redirect_changes"] = _safe_diff(
            _section(result_a, "redirect_audit"), _section(result_b, "redirect_audit"), "redirect"
        )
        return diff
    except Exception as e:
        self.logger.error(f"Crawl comparison failed: {e}")
        return {"success": False, "error": str(e)}
async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
"""
Analyzes linking patterns and social media presence using unique temporary files.

View File

@@ -454,14 +454,12 @@ class SEODashboardService:
def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
"""Fetch Advertools-based insights from WebsiteAnalysis and AdvertoolsTasks."""
try:
# 1. Get augmented persona themes from WebsiteAnalysis
session = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
if not session:
return {}
analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
# 2. Get latest tasks status
tasks = self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all()
audit_status = "pending"
@@ -479,6 +477,14 @@ class SEODashboardService:
return {
"augmented_themes": brand_analysis.get('augmented_themes', []),
"link_health": brand_analysis.get('link_health', {}),
"redirect_audit": brand_analysis.get('redirect_audit', {}),
"image_seo": brand_analysis.get('image_seo', {}),
"page_status": brand_analysis.get('page_status', {}),
"url_structure": brand_analysis.get('url_structure', {}),
"freshness": brand_analysis.get('freshness', {}),
"robots_txt": brand_analysis.get('robots_txt', {}),
"crawl_budget": brand_analysis.get('crawl_budget', {}),
"last_audit": brand_analysis.get('last_advertools_audit'),
"site_health": seo_audit.get('site_health', {}),
"last_health_check": seo_audit.get('last_advertools_health_check'),

View File

@@ -378,7 +378,48 @@ class SIFIntegrationService:
themes = adv_insights.get('augmented_themes', [])
if themes:
text_content += f"Augmented Themes: {', '.join(themes[:5])}. "
freshness = adv_insights.get('freshness', {})
if freshness:
text_content += (f"Content Freshness Score: {freshness.get('freshness_score', 'N/A')}. "
f"Publishing Velocity: {freshness.get('publishing_velocity', 0)}/week. "
f"Trend: {freshness.get('publishing_trend', 'unknown')}. "
f"Last 30d: {freshness.get('publishing_recency', {}).get('last_30d', 0)} pages. ")
link_health = adv_insights.get('link_health', {})
if link_health and 'error' not in link_health:
text_content += (f"Internal Links: {link_health.get('internal_link_count', 0)}. "
f"External Links: {link_health.get('external_link_count', 0)}. "
f"Nofollow: {link_health.get('nofollow_link_count', 0)}. "
f"Avg Links/Page: {link_health.get('avg_links_per_page', 0)}. ")
redirects = adv_insights.get('redirect_audit', {})
if redirects and 'error' not in redirects:
text_content += (f"Redirects: {redirects.get('total_redirects', 0)} total, "
f"{redirects.get('multi_hop_chains', 0)} multi-hop. ")
image_seo = adv_insights.get('image_seo', {})
if image_seo and 'error' not in image_seo:
text_content += (f"Images: {image_seo.get('total_images', 0)} total, "
f"Alt Coverage: {image_seo.get('alt_coverage_percentage', 0)}%. ")
url_struct = adv_insights.get('url_structure', {})
if url_struct:
text_content += (f"URL Structure: {url_struct.get('total_urls_analyzed', 0)} URLs, "
f"Avg Depth: {url_struct.get('directory_depth', {}).get('average_depth', 0)}. "
f"Params: {url_struct.get('parameter_usage', {}).get('percentage_with_params', 0)}%. ")
robots = adv_insights.get('robots_txt', {})
if robots and robots.get('success'):
text_content += (f"Robots.txt: {robots.get('total_directives', 0)} directives, "
f"Compliance: {robots.get('compliance_score', 0)}/100. "
f"Issues: {len(robots.get('issues', []))}. ")
budget = adv_insights.get('crawl_budget', {})
if budget and budget.get('success'):
text_content += (f"Crawl Budget: {budget.get('pages_crawled', 0)} crawled of {budget.get('sitemap_total_urls', 0)} URLs. "
f"Waste: {budget.get('waste_percentage', 0)}%. "
f"Score: {budget.get('optimization_score', 0)}. ")
# Add Technical SEO overview
tech_audit = dashboard_data.get('technical_seo_audit', {})
if tech_audit:

View File

@@ -143,16 +143,18 @@ class WixService:
access_token: Valid access token
Returns:
Site information
Site information (or {_no_site: True} if no site exists)
"""
token_str = normalize_token_string(access_token)
if not token_str:
raise ValueError("Invalid access token format for create_blog_post")
return {"_no_site": True, "error": "Invalid access token format"}
meta = extract_meta_from_token(token_str)
meta_site_id = meta.get("metaSiteId")
try:
return self.auth_service.get_site_info(token_str)
return self.auth_service.get_site_info(token_str, meta_site_id=meta_site_id)
except requests.RequestException as e:
logger.error(f"Failed to get site info: {e}")
raise
logger.warning(f"Failed to get site info: {e}")
return {"_no_site": True, "error": str(e)}
def get_current_member(self, access_token: str) -> Dict[str, Any]:
"""

View File

@@ -0,0 +1,387 @@
"""
YouTube Creator Task Manager
Hybrid DB-backed + in-memory task manager for YouTube video operations.
Writes task state to PostgreSQL so renders/combines/publishes survive
server restarts. Falls back to in-memory dict when DB is unavailable.
API surface matches Story Writer's TaskManager for drop-in compatibility.
"""
import uuid
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from loguru import logger
from sqlalchemy.orm import Session
from models.youtube_task_models import YouTubeVideoTask, YouTubeTaskType, YouTubeTaskStatus
from services.database import get_session_for_user, get_engine_for_user
from models.subscription_models import Base as SubscriptionBase
class YouTubeTaskManager:
"""Hybrid persistent + in-memory task manager for YouTube Creator."""
def __init__(self):
self.task_storage: Dict[str, Dict[str, Any]] = {}
self._ensure_tables()
def _ensure_tables(self):
"""Ensure youtube_video_tasks table exists for all initialised users."""
try:
from services.database import _user_engines
for user_id, engine in list(_user_engines.items()):
try:
SubscriptionBase.metadata.create_all(bind=engine, checkfirst=True)
except Exception:
pass
except Exception:
pass
def _get_db(self, user_id: str) -> Optional[Session]:
"""Get a DB session for the given user. Returns None on failure."""
if not user_id:
return None
try:
session = get_session_for_user(user_id)
if session:
engine = get_engine_for_user(user_id)
SubscriptionBase.metadata.create_all(bind=engine, checkfirst=True)
return session
except Exception as e:
logger.warning(f"[YouTubeTaskManager] DB unavailable for user {user_id}: {e}")
return None
def _map_task_type(self, task_type_str: str) -> YouTubeTaskType:
"""Map a string task type to the enum."""
mapping = {
"youtube_video_render": YouTubeTaskType.RENDER,
"youtube_scene_video_render": YouTubeTaskType.SCENE_RENDER,
"youtube_video_combine": YouTubeTaskType.COMBINE,
"youtube_combine_video": YouTubeTaskType.COMBINE,
"youtube_publish": YouTubeTaskType.PUBLISH,
"youtube_image_generation": YouTubeTaskType.IMAGE_GENERATION,
"youtube_audio_generation": YouTubeTaskType.AUDIO_GENERATION,
}
return mapping.get(task_type_str, YouTubeTaskType.RENDER)
def _map_status_to_enum(self, status: str) -> YouTubeTaskStatus:
"""Map a frontend status string to the DB enum."""
mapping = {
"pending": YouTubeTaskStatus.PENDING,
"processing": YouTubeTaskStatus.PROCESSING,
"running": YouTubeTaskStatus.PROCESSING,
"completed": YouTubeTaskStatus.COMPLETED,
"failed": YouTubeTaskStatus.FAILED,
}
return mapping.get(status, YouTubeTaskStatus.PENDING)
def _map_status_from_enum(self, status: YouTubeTaskStatus) -> str:
"""Map DB enum to frontend status string."""
mapping = {
YouTubeTaskStatus.PENDING: "pending",
YouTubeTaskStatus.PROCESSING: "processing",
YouTubeTaskStatus.COMPLETED: "completed",
YouTubeTaskStatus.FAILED: "failed",
}
return mapping.get(status, "pending")
def create_task(
self,
task_type: str = "youtube_video_render",
metadata: Optional[Dict[str, Any]] = None,
user_id: Optional[str] = None,
) -> str:
"""Create a new task. Persists to DB if user_id provided; always writes to in-memory."""
task_id = str(uuid.uuid4())
task_metadata = metadata or {}
now = datetime.now(timezone.utc)
# Always write to in-memory for fast lookups
self.task_storage[task_id] = {
"status": "pending",
"created_at": now,
"updated_at": now,
"result": None,
"error": None,
"progress_messages": [],
"task_type": task_type,
"progress": 0.0,
"metadata": task_metadata,
}
# Persist to DB
effective_user_id = user_id or task_metadata.get("owner_user_id")
if effective_user_id:
db = self._get_db(effective_user_id)
if db:
try:
db_task = YouTubeVideoTask(
task_id=task_id,
user_id=effective_user_id,
task_type=self._map_task_type(task_type),
status=YouTubeTaskStatus.PENDING,
progress=0.0,
request_data=task_metadata if task_metadata else None,
created_at=now,
updated_at=now,
)
db.add(db_task)
db.commit()
logger.debug(f"[YouTubeTaskManager] Persisted task {task_id} to DB for user {effective_user_id}")
except Exception as e:
logger.warning(f"[YouTubeTaskManager] Failed to persist task {task_id} to DB: {e}")
db.rollback()
finally:
db.close()
logger.info(f"[YouTubeTaskManager] Created task: {task_id} (type: {task_type})")
return task_id
def get_task_status(self, task_id: str, requester_user_id: Optional[str] = None) -> Optional[Dict[str, Any]]:
"""Get task status. Checks in-memory first, then DB."""
# Check in-memory first (fast path)
if task_id in self.task_storage:
task = self.task_storage[task_id]
metadata = task.get("metadata", {}) or {}
owner_user_id = metadata.get("owner_user_id")
if requester_user_id is not None and owner_user_id is not None and requester_user_id != owner_user_id:
logger.warning(f"[YouTubeTaskManager] Task access denied for task {task_id}")
return None
response = {
"task_id": task_id,
"status": task["status"],
"progress": task.get("progress", 0.0),
"message": task.get("progress_messages", [])[-1] if task.get("progress_messages") else None,
"created_at": task["created_at"].isoformat() if task.get("created_at") else None,
"updated_at": task.get("updated_at", task.get("created_at")).isoformat() if task.get("updated_at") or task.get("created_at") else None,
}
if task["status"] == "completed" and task.get("result"):
response["result"] = task["result"]
if task["status"] == "failed" and task.get("error"):
response["error"] = task["error"]
if task.get("error_status") is not None:
response["error_status"] = task["error_status"]
if task.get("error_data") is not None:
response["error_data"] = task["error_data"]
return response
# Fall back to DB
if requester_user_id:
db = self._get_db(requester_user_id)
if db:
try:
db_task = db.query(YouTubeVideoTask).filter(YouTubeVideoTask.task_id == task_id).first()
if db_task:
status_val = self._map_status_from_enum(db_task.status)
response = {
"task_id": db_task.task_id,
"status": status_val,
"progress": db_task.progress or 0.0,
"message": db_task.message,
"created_at": db_task.created_at.isoformat() if db_task.created_at else None,
"updated_at": db_task.updated_at.isoformat() if db_task.updated_at else None,
}
if db_task.result:
response["result"] = db_task.result if isinstance(db_task.result, dict) else db_task.result
if db_task.error:
response["error"] = db_task.error
if isinstance(db_task.result, dict):
if db_task.result.get("error_status") is not None:
response["error_status"] = db_task.result["error_status"]
if db_task.result.get("error_data") is not None:
response["error_data"] = db_task.result["error_data"]
return response
except Exception as e:
logger.warning(f"[YouTubeTaskManager] DB lookup failed for task {task_id}: {e}")
finally:
db.close()
return None
def update_task_status(
self,
task_id: str,
status: str,
progress: Optional[float] = None,
message: Optional[str] = None,
result: Optional[Dict[str, Any]] = None,
error: Optional[str] = None,
error_status: Optional[int] = None,
error_data: Optional[Dict[str, Any]] = None,
):
"""Update task status. Writes to both in-memory and DB."""
now = datetime.now(timezone.utc)
# Update in-memory
if task_id in self.task_storage:
task = self.task_storage[task_id]
task["status"] = status
task["updated_at"] = now
if progress is not None:
task["progress"] = progress
if message:
if "progress_messages" not in task:
task["progress_messages"] = []
task["progress_messages"].append(message)
logger.info(f"[YouTubeTaskManager] Task {task_id}: {message} (progress: {progress}%)")
if result is not None:
task["result"] = result
if error is not None:
task["error"] = error
logger.error(f"[YouTubeTaskManager] Task {task_id} error: {error}")
if error_status is not None:
task["error_status"] = error_status
if error_data is not None:
task["error_data"] = error_data
# Try DB update
metadata = task.get("metadata", {}) or {}
user_id = metadata.get("owner_user_id")
self._update_db_task(task_id, user_id, status, progress, message, result, error, now)
else:
logger.warning(f"[YouTubeTaskManager] Cannot update non-existent task: {task_id}")
def _update_db_task(
self,
task_id: str,
user_id: Optional[str],
status: str,
progress: Optional[float],
message: Optional[str],
result: Optional[Dict[str, Any]],
error: Optional[str],
now: datetime,
):
"""Update task in DB."""
if not user_id:
return
db = self._get_db(user_id)
if not db:
return
try:
db_task = db.query(YouTubeVideoTask).filter(YouTubeVideoTask.task_id == task_id).first()
if db_task:
db_task.status = self._map_status_to_enum(status)
db_task.updated_at = now
if progress is not None:
db_task.progress = progress
if message:
db_task.message = message[:500] if message else None
if result:
# Merge error fields into result if present
existing_result = db_task.result if isinstance(db_task.result, dict) else {}
existing_result.update(result)
db_task.result = existing_result
if error:
db_task.error = error
if status in ("completed", "failed"):
db_task.completed_at = now
db.commit()
logger.debug(f"[YouTubeTaskManager] Persisted status update for task {task_id}")
else:
logger.debug(f"[YouTubeTaskManager] Task {task_id} not found in DB for update")
except Exception as e:
logger.warning(f"[YouTubeTaskManager] Failed to update DB task {task_id}: {e}")
db.rollback()
finally:
db.close()
def recover_stale_tasks(self, user_id: str):
"""Mark in-flight tasks that were interrupted by server restart as failed.
Called on startup for each user to handle tasks that were 'processing'
when the server went down.
"""
db = self._get_db(user_id)
if not db:
return 0
count = 0
try:
stale_tasks = db.query(YouTubeVideoTask).filter(
YouTubeVideoTask.user_id == user_id,
YouTubeVideoTask.status.in_([
YouTubeTaskStatus.PENDING,
YouTubeTaskStatus.PROCESSING,
]),
).all()
for task in stale_tasks:
task.status = YouTubeTaskStatus.FAILED
task.error = "Task interrupted by server restart"
task.message = "Marked as failed on server restart"
task.completed_at = datetime.now(timezone.utc)
task.updated_at = datetime.now(timezone.utc)
count += 1
logger.info(f"[YouTubeTaskManager] Recovered stale task {task.task_id} for user {user_id}")
if count > 0:
db.commit()
logger.info(f"[YouTubeTaskManager] Recovered {count} stale tasks for user {user_id}")
except Exception as e:
logger.warning(f"[YouTubeTaskManager] Failed to recover stale tasks: {e}")
db.rollback()
finally:
db.close()
return count
def cleanup_old_tasks(self):
"""Remove in-memory tasks older than 1 hour. DB cleanup is handled by vacuum."""
now = datetime.now(timezone.utc)
cutoff = now.timestamp() - 3600 # 1 hour
tasks_to_remove = []
for task_id, task_data in self.task_storage.items():
created_at = task_data.get("created_at")
if created_at:
ts = created_at.timestamp() if hasattr(created_at, 'timestamp') else 0
if ts < cutoff:
tasks_to_remove.append(task_id)
for task_id in tasks_to_remove:
del self.task_storage[task_id]
logger.debug(f"[YouTubeTaskManager] Cleaned up old in-memory task: {task_id}")
def cleanup_old_db_tasks(self, days: int = 7, user_id: Optional[str] = None):
"""Delete completed/failed DB tasks older than N days."""
if not user_id:
return 0
db = self._get_db(user_id)
if not db:
return 0
count = 0
try:
from datetime import timedelta
cutoff = datetime.now(timezone.utc) - timedelta(days=days)
old_tasks = db.query(YouTubeVideoTask).filter(
YouTubeVideoTask.user_id == user_id,
YouTubeVideoTask.status.in_([YouTubeTaskStatus.COMPLETED, YouTubeTaskStatus.FAILED]),
YouTubeVideoTask.created_at < cutoff,
).all()
for task in old_tasks:
db.delete(task)
count += 1
if count > 0:
db.commit()
logger.info(f"[YouTubeTaskManager] Cleaned up {count} old DB tasks for user {user_id}")
except Exception as e:
logger.warning(f"[YouTubeTaskManager] Failed to cleanup old DB tasks: {e}")
db.rollback()
finally:
db.close()
return count
# Global singleton instance, created at import time. Constructing it runs
# YouTubeTaskManager.__init__, which calls _ensure_tables().
task_manager = YouTubeTaskManager()