chore: bulk commit of local changes across blog writer, SEO dashboard, scheduler, docs-site, and frontend
This commit is contained in:
@@ -40,8 +40,10 @@ class GroundingContextEngine:
|
||||
}
|
||||
|
||||
# Temporal relevance patterns
|
||||
cy = str(datetime.now().year)
|
||||
ny = str(datetime.now().year + 1)
|
||||
self.temporal_patterns = {
|
||||
'recent': ['2024', '2025', 'latest', 'new', 'recent', 'current', 'updated'],
|
||||
'recent': [cy, ny, 'latest', 'new', 'recent', 'current', 'updated'],
|
||||
'trending': ['trend', 'emerging', 'growing', 'increasing', 'rising'],
|
||||
'evergreen': ['fundamental', 'basic', 'principles', 'foundation', 'core']
|
||||
}
|
||||
|
||||
@@ -137,6 +137,15 @@ class KeywordCurator:
|
||||
lines.append(f"### Competitive advantage signal (must weave into narrative): {content_gap[0]}")
|
||||
lines.append(" → This is your primary differentiation hook. Surface it prominently in the unique value section.")
|
||||
|
||||
lines.append("")
|
||||
lines.append("### SUGGESTED SECTION → KEYWORD MAPPING")
|
||||
lines.append("Map each outline section's keyword focus according to its narrative role:")
|
||||
lines.append("- Hook / Introduction → lead with primary and trending keywords for timeliness & relevance")
|
||||
lines.append("- Problem / Pain Point → anchor on secondary and long-tail keywords (informational intent)")
|
||||
lines.append("- Solution / How-To → weave in primary and secondary keywords for solution-oriented search")
|
||||
lines.append("- Comparison / Analysis → embed semantic keywords to prevent topical drift into tangents")
|
||||
lines.append("- Case Studies / Evidence → surface content gap keywords as differentiation proof points")
|
||||
lines.append("- Future / Trends → leverage trending and content gap keywords for forward-looking authority")
|
||||
lines.append("")
|
||||
lines.append("GUIDELINE: Treat these as the primary keyword anchors. You may include closely related")
|
||||
lines.append("intent-matching variations where natural, but avoid inserting every raw research keyword.")
|
||||
@@ -176,7 +185,11 @@ class KeywordCurator:
|
||||
slot_key: Optional[str] = None,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Pick up to N items from a keyword list.
|
||||
Pick up to N items from a keyword list with diversity sampling.
|
||||
|
||||
When the raw list is significantly larger than the limit, selects
|
||||
evenly-spaced entries to capture semantic diversity rather than
|
||||
just the first N entries.
|
||||
|
||||
Args:
|
||||
data: The raw keyword_analysis dict.
|
||||
@@ -184,11 +197,24 @@ class KeywordCurator:
|
||||
slot_key: The internal slot name for looking up the limit.
|
||||
Falls back to source_key if not provided.
|
||||
Returns:
|
||||
Sliced list of at most N strings.
|
||||
List of at most N strings with diversity sampling.
|
||||
"""
|
||||
limit_key = slot_key or source_key
|
||||
limit = self.SLOTS.get(limit_key, 5)
|
||||
raw: Any = data.get(source_key, [])
|
||||
if not isinstance(raw, list):
|
||||
return []
|
||||
return raw[:limit]
|
||||
if len(raw) <= limit:
|
||||
return raw
|
||||
if len(raw) <= limit * 2:
|
||||
return raw[:limit]
|
||||
indices = set()
|
||||
if limit >= 2:
|
||||
indices.add(0)
|
||||
indices.add(len(raw) - 1)
|
||||
step = (len(raw) - 1) / max(limit - 1, 1)
|
||||
for i in range(1, limit - 1):
|
||||
indices.add(int(round(i * step)))
|
||||
else:
|
||||
indices.add(0)
|
||||
return [raw[i] for i in sorted(indices) if i < len(raw)][:limit]
|
||||
|
||||
@@ -124,7 +124,8 @@ class OutlineGenerator:
|
||||
content_angle_titles = self.title_generator.extract_content_angle_titles(research)
|
||||
|
||||
# Combine AI-generated titles with content angles (full primary keywords for title variety)
|
||||
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
|
||||
research_topic = getattr(request, 'topic', '') or ''
|
||||
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords, research_topic)
|
||||
|
||||
logger.info(f"Generated optimized outline with {len(balanced_sections)} sections and {len(title_options)} title options")
|
||||
|
||||
@@ -224,7 +225,8 @@ class OutlineGenerator:
|
||||
content_angle_titles = self.title_generator.extract_content_angle_titles(research)
|
||||
|
||||
# Combine AI-generated titles with content angles (full primary keywords for title variety)
|
||||
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
|
||||
research_topic = getattr(request, 'topic', '') or ''
|
||||
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords, research_topic)
|
||||
|
||||
await task_manager.update_progress(task_id, "✅ Outline generation and optimization completed successfully!")
|
||||
|
||||
|
||||
@@ -36,12 +36,56 @@ class PromptBuilder:
|
||||
competitor_text = ', '.join(research.competitor_analysis.get('top_competitors', [])) if research and research.competitor_analysis else "Not available"
|
||||
opportunity_text = ', '.join(research.competitor_analysis.get('opportunities', [])) if research and research.competitor_analysis else "Not available"
|
||||
advantages_text = ', '.join(research.competitor_analysis.get('competitive_advantages', [])) if research and research.competitor_analysis else "Not available"
|
||||
competitor_headings_text = ', '.join(research.competitor_analysis.get('competitor_headings', [])[:3]) if research and research.competitor_analysis and research.competitor_analysis.get('competitor_headings') else ""
|
||||
|
||||
# Extract additional UI-mapped context fields
|
||||
analysis_insights_text = (research.keyword_analysis.get('analysis_insights', '') or '') if research and research.keyword_analysis else ''
|
||||
market_positioning_text = (research.competitor_analysis.get('market_positioning', '') or '') if research and research.competitor_analysis else ''
|
||||
difficulty_score = research.keyword_analysis.get('difficulty', None) if research and research.keyword_analysis else None
|
||||
|
||||
# Extract top 3 authoritative source excerpts as factual data points
|
||||
source_excerpts_text = ""
|
||||
if sources:
|
||||
sorted_sources = sorted(
|
||||
[s for s in sources if (s.excerpt or s.summary)],
|
||||
key=lambda s: s.credibility_score or 0.8, reverse=True
|
||||
)[:3]
|
||||
excerpts = []
|
||||
for i, src in enumerate(sorted_sources, 1):
|
||||
excerpt = src.excerpt or src.summary or ""
|
||||
if len(excerpt) > 300:
|
||||
excerpt = excerpt[:297] + "..."
|
||||
excerpts.append(f" {i}. \"{src.title}\" — {excerpt}")
|
||||
if excerpts:
|
||||
source_excerpts_text = "FACTUAL DATA POINTS FROM RESEARCH:\n" + "\n".join(excerpts)
|
||||
|
||||
# Extract recency: newest source publication date
|
||||
newest_date_str = ""
|
||||
if sources:
|
||||
valid_dates = [s.published_at for s in sources if s.published_at]
|
||||
if valid_dates:
|
||||
try:
|
||||
parsed = [d for d in valid_dates if d[:4].isdigit()]
|
||||
if parsed:
|
||||
sorted_dates = sorted(parsed, reverse=True)
|
||||
newest_date_str = f"Most Recent Source: {sorted_dates[0]}"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Extract top grounding evidence snippets as verified data points
|
||||
grounding_evidence_text = ""
|
||||
if research and research.grounding_metadata and research.grounding_metadata.grounding_supports:
|
||||
supports = research.grounding_metadata.grounding_supports
|
||||
top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:3]
|
||||
if top_supports:
|
||||
evidence_parts = []
|
||||
for i, s in enumerate(top_supports, 1):
|
||||
text = s.segment_text[:250]
|
||||
if len(s.segment_text) > 250:
|
||||
text += "..."
|
||||
evidence_parts.append(f" {i}. {text}")
|
||||
grounding_evidence_text = "VERIFIED EVIDENCE (high-confidence snippets):\n" + "\n".join(evidence_parts)
|
||||
|
||||
# Build selected angle prominence section
|
||||
if selected_content_angle and selected_content_angle.strip():
|
||||
selected_angle_section = f"""
|
||||
@@ -106,8 +150,14 @@ Top Competitors: {competitor_text}
|
||||
Market Opportunities: {opportunity_text}
|
||||
Competitive Advantages: {advantages_text}
|
||||
{f"Market Positioning: {market_positioning_text}" if market_positioning_text else ""}
|
||||
{f"Competitor Headings (AVOID duplicating): {competitor_headings_text}" if competitor_headings_text else ""}
|
||||
|
||||
RESEARCH SOURCES: {len(sources)} authoritative sources available
|
||||
{newest_date_str}
|
||||
|
||||
{source_excerpts_text}
|
||||
|
||||
{grounding_evidence_text}
|
||||
|
||||
{f"CUSTOM INSTRUCTIONS: {custom_instructions}" if custom_instructions else ""}
|
||||
|
||||
|
||||
@@ -54,58 +54,58 @@ class TitleGenerator:
|
||||
Returns:
|
||||
Formatted title string
|
||||
"""
|
||||
if not angle or len(angle.strip()) < 10: # Too short to be a good title
|
||||
if not angle or len(angle.strip()) < 10:
|
||||
return ""
|
||||
|
||||
# Clean up the angle
|
||||
cleaned_angle = angle.strip()
|
||||
|
||||
# Capitalize first letter of each sentence and proper nouns
|
||||
sentences = cleaned_angle.split('. ')
|
||||
formatted_sentences = []
|
||||
for sentence in sentences:
|
||||
if sentence.strip():
|
||||
# Use title case for better formatting
|
||||
formatted_sentence = sentence.strip().title()
|
||||
formatted_sentences.append(formatted_sentence)
|
||||
|
||||
formatted_title = '. '.join(formatted_sentences)
|
||||
|
||||
# Ensure it ends with proper punctuation
|
||||
if not formatted_title.endswith(('.', '!', '?')):
|
||||
formatted_title += '.'
|
||||
# Use sentence case: capitalize first letter, rest as-is
|
||||
if cleaned_angle:
|
||||
cleaned_angle = cleaned_angle[0].upper() + cleaned_angle[1:]
|
||||
|
||||
# Limit length to reasonable blog title size
|
||||
if len(formatted_title) > 200:
|
||||
formatted_title = formatted_title[:197] + "..."
|
||||
if len(cleaned_angle) > 120:
|
||||
cleaned_angle = cleaned_angle[:117] + "..."
|
||||
|
||||
return formatted_title
|
||||
return cleaned_angle
|
||||
|
||||
def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str]) -> List[str]:
|
||||
def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str], research_topic: str = "") -> List[str]:
|
||||
"""
|
||||
Combine AI-generated titles with content angle titles, ensuring variety and quality.
|
||||
|
||||
AI titles (proper SEO titles generated by LLM) take priority.
|
||||
Content angle titles (long-format descriptions) are used as fallback.
|
||||
The research topic is the last resort when nothing else exists.
|
||||
|
||||
Args:
|
||||
ai_titles: AI-generated title options
|
||||
content_angle_titles: Titles derived from content angles
|
||||
ai_titles: AI-generated title options (proper blog titles, 50-65 chars)
|
||||
content_angle_titles: Titles derived from content angles (longer, descriptive)
|
||||
primary_keywords: Primary keywords for fallback generation
|
||||
research_topic: Original user research topic as ultimate fallback
|
||||
|
||||
Returns:
|
||||
Combined list of title options (max 6 total)
|
||||
"""
|
||||
all_titles = []
|
||||
|
||||
# Add content angle titles first (these are research-based and valuable)
|
||||
for title in content_angle_titles[:3]: # Limit to top 3 content angles
|
||||
if title and title not in all_titles:
|
||||
all_titles.append(title)
|
||||
|
||||
# Add AI-generated titles
|
||||
# 1. AI-generated titles first (proper SEO titles from LLM)
|
||||
for title in ai_titles:
|
||||
if title and title not in all_titles:
|
||||
all_titles.append(title)
|
||||
|
||||
# Note: Removed fallback titles as requested - only use research and AI-generated titles
|
||||
# 2. Content angle titles as fallback (research-based, but verbose)
|
||||
for title in content_angle_titles[:3]:
|
||||
if title and title not in all_titles:
|
||||
all_titles.append(title)
|
||||
|
||||
# 3. Research topic as last resort when nothing was generated
|
||||
if not all_titles and research_topic:
|
||||
all_titles.append(research_topic)
|
||||
|
||||
# 4. Primary keyword fallback as absolute last resort
|
||||
if not all_titles and primary_keywords:
|
||||
kw = primary_keywords[0]
|
||||
all_titles.append(kw)
|
||||
|
||||
# Limit to 6 titles maximum for UI usability
|
||||
final_titles = all_titles[:6]
|
||||
@@ -115,9 +115,10 @@ class TitleGenerator:
|
||||
|
||||
def generate_fallback_titles(self, primary_keywords: List[str]) -> List[str]:
    """
    Generate generic fallback titles when AI generation fails.

    Args:
        primary_keywords: Candidate keywords. The first non-blank entry is
            used; "Topic" is substituted when the list is empty, None, or
            contains only blank entries.

    Returns:
        Exactly three title strings built around the chosen keyword. The
        last one carries the current calendar year.
    """
    from datetime import datetime

    # Skip empty / whitespace-only entries so a blank first keyword cannot
    # produce a dangling title such as "The Complete Guide to ".
    primary_keyword = next(
        (
            kw.strip()
            for kw in (primary_keywords or [])
            if isinstance(kw, str) and kw.strip()
        ),
        "Topic",
    )

    # Computed at call time so the title never goes stale the way a
    # hard-coded year does.
    current_year = datetime.now().year

    # Every entry ends with a comma: two adjacent f-strings without one would
    # silently concatenate into a single malformed title.
    return [
        f"The Complete Guide to {primary_keyword}",
        f"{primary_keyword}: Everything You Need to Know",
        f"How to Master {primary_keyword} in {current_year}",
    ]
|
||||
|
||||
@@ -432,7 +432,7 @@ class ResearchDataFilter:
|
||||
'how to', 'guide', 'tutorial', 'steps', 'process', 'method',
|
||||
'best practices', 'tips', 'strategies', 'techniques', 'approach',
|
||||
'comparison', 'vs', 'versus', 'difference', 'pros and cons',
|
||||
'trends', 'future', '2024', '2025', 'emerging', 'new'
|
||||
'trends', 'future', str(datetime.now().year), str(datetime.now().year + 1), 'emerging', 'new'
|
||||
]
|
||||
|
||||
for indicator in actionable_indicators:
|
||||
|
||||
@@ -720,7 +720,7 @@ class ResearchService:
|
||||
url=src.get("url", ""),
|
||||
excerpt=src.get("content", "")[:500] if src.get("content") else f"Source from {src.get('title', 'web')}",
|
||||
credibility_score=float(src.get("credibility_score", 0.8)),
|
||||
published_at=str(src.get("publication_date", "2024-01-01")),
|
||||
published_at=str(src.get("publication_date", f"{datetime.now().year}-01-01")),
|
||||
index=src.get("index"),
|
||||
source_type=src.get("type", "web")
|
||||
)
|
||||
|
||||
@@ -6,6 +6,7 @@ Different strategies for executing research based on depth and focus.
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import BlogResearchRequest, ResearchMode, ResearchConfig
|
||||
@@ -87,7 +88,7 @@ Provide analysis in this EXACT format:
|
||||
- For each: Quote/claim, source URL, published date, metric/context.
|
||||
|
||||
REQUIREMENTS:
|
||||
- Every claim MUST include a source URL (authoritative, recent: 2024-2025 preferred).
|
||||
- Every claim MUST include a source URL (authoritative, recent: {datetime.now().year}-{datetime.now().year + 1} preferred).
|
||||
- Use concrete numbers, dates, outcomes; avoid generic advice.
|
||||
- Keep bullets tight and scannable for spoken narration."""
|
||||
return prompt.strip()
|
||||
@@ -116,7 +117,7 @@ Research Topic: "{topic}"{date_filter}{source_filter}
|
||||
|
||||
Provide COMPLETE analysis in this EXACT format:
|
||||
|
||||
## WHAT'S CHANGED (2024-2025)
|
||||
## WHAT'S CHANGED ({datetime.now().year}-{datetime.now().year + 1})
|
||||
[5-7 concise trend bullets with numbers + source URLs]
|
||||
|
||||
## PROOF & NUMBERS
|
||||
@@ -151,7 +152,7 @@ Primary (3), Secondary (8-10), Long-tail (5-7) with intent hints.
|
||||
VERIFICATION REQUIREMENTS:
|
||||
- Minimum 2 authoritative sources per major claim.
|
||||
- Prefer industry reports > research papers > news > blogs.
|
||||
- 2024-2025 data strongly preferred.
|
||||
- {datetime.now().year}-{datetime.now().year + 1} data strongly preferred.
|
||||
- All numbers must include timeframe and methodology.
|
||||
- Every bullet must be concise for spoken narration and actionable for {target_audience}."""
|
||||
return prompt.strip()
|
||||
@@ -213,7 +214,7 @@ REQUIREMENTS:
|
||||
- Cite all claims with authoritative source URLs
|
||||
- Include specific numbers, dates, examples
|
||||
- Focus on actionable insights for {target_audience}
|
||||
- Use 2024-2025 data when available"""
|
||||
- Use {datetime.now().year}-{datetime.now().year + 1} data when available"""
|
||||
return prompt.strip()
|
||||
|
||||
|
||||
|
||||
@@ -36,6 +36,8 @@ from models.podcast_models import PodcastProject
|
||||
from models.research_models import ResearchProject
|
||||
# Video Studio models
|
||||
from models.video_models import VideoGenerationTask
|
||||
# YouTube Creator task models
|
||||
from models.youtube_task_models import YouTubeVideoTask
|
||||
# Bing Analytics models
|
||||
from models.bing_analytics_models import Base as BingAnalyticsBase
|
||||
|
||||
|
||||
@@ -47,6 +47,10 @@ class GSCBrainstormService:
|
||||
if not site_url:
|
||||
sites = self.gsc_service.get_site_list(user_id)
|
||||
if not sites:
|
||||
logger.info(f"No GSC sites found for user {user_id} — falling back to AI-only brainstorm")
|
||||
fallback = self._generate_ai_only_brainstorm(user_id, keywords, None, None, None)
|
||||
if fallback:
|
||||
return fallback
|
||||
return {
|
||||
"error": "No GSC sites found. Make sure your site is verified in Google Search Console.",
|
||||
"content_opportunities": [],
|
||||
@@ -70,6 +74,10 @@ class GSCBrainstormService:
|
||||
)
|
||||
|
||||
if "error" in analytics:
|
||||
logger.info(f"GSC analytics error for user {user_id}: {analytics.get('error')} — falling back to AI-only brainstorm")
|
||||
fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
|
||||
if fallback:
|
||||
return fallback
|
||||
return {
|
||||
"error": analytics.get("error", "Failed to fetch GSC data"),
|
||||
"content_opportunities": [],
|
||||
@@ -88,6 +96,10 @@ class GSCBrainstormService:
|
||||
pages_data = self._parse_page_rows(page_rows)
|
||||
|
||||
if not keywords_data:
|
||||
logger.info(f"No GSC keyword data for user {user_id} — falling back to AI-only brainstorm")
|
||||
fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
|
||||
if fallback:
|
||||
return fallback
|
||||
return {
|
||||
"error": "No keyword data available for the selected period. This usually means your site is new to GSC or hasn't received search traffic yet.",
|
||||
"content_opportunities": [],
|
||||
@@ -110,6 +122,10 @@ class GSCBrainstormService:
|
||||
logger.info(f"After topic filter: {len(keywords_data)} keywords, {len(pages_data)} pages")
|
||||
|
||||
if not keywords_data:
|
||||
logger.info(f"No GSC keywords matched topic '{keywords}' for user {user_id} — falling back to AI-only brainstorm")
|
||||
fallback = self._generate_ai_only_brainstorm(user_id, keywords, site_url, start_date, end_date)
|
||||
if fallback:
|
||||
return fallback
|
||||
return {
|
||||
"error": "No GSC keywords matched your topic. Try a broader research topic or check your GSC data.",
|
||||
"content_opportunities": [],
|
||||
@@ -155,6 +171,128 @@ class GSCBrainstormService:
|
||||
"summary": summary,
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# AI-only fallback (when GSC has no data)
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
def _generate_ai_only_brainstorm(
    self,
    user_id: str,
    keywords: str,
    site_url: Optional[str],
    start_date: Optional[str],
    end_date: Optional[str],
) -> Optional[Dict[str, Any]]:
    """
    Build a brainstorm result from the LLM alone, without any GSC data.

    A prompt is built around ``keywords``, sent through ``llm_text_gen``
    and the reply is handed to ``self._parse_ai_response``.

    Args:
        user_id: Forwarded to ``llm_text_gen`` and used in the warning log.
        keywords: The topic the user wants to write about.
        site_url: Echoed into the summary ("" when None).
        start_date: Echoed into the summary date range ("" when None).
        end_date: Echoed into the summary date range ("" when None).

    Returns:
        A brainstorm-shaped dict whose GSC-specific lists are empty and
        whose metrics are zero, with the parsed reply under
        ``ai_recommendations``. None when the LLM returns nothing, the
        reply does not parse to a truthy value, or any exception is raised
        (the exception is logged as a warning).
    """
    system_prompt = (
        "You are an expert content strategist specializing in SEO and blog topic generation. "
        "You help new websites identify high-potential content topics even without search console data. "
        "You always respond with valid JSON matching the requested format exactly."
    )

    try:
        prompt = f"""You are an expert content strategist helping a blog writer brainstorm topic ideas.

The user is interested in writing about: "{keywords}"

Since they are a new or early-stage website, there is no Google Search Console data available yet.
Generate compelling blog post ideas they can write RIGHT NOW to start building traffic.

For each suggestion include:
1. A specific, compelling blog post TITLE (not a vague topic)
2. The primary keyword it should target
3. Why this topic will perform well (search demand, competition level, timing)
4. The recommended content format (how-to, listicle, comparison, pillar page, etc.)
5. Estimated difficulty level (Easy / Medium / Hard)

Return your response in this EXACT JSON format (no markdown, no code fences):
{{
"immediate_opportunities": [
{{
"title": "Specific Blog Post Title",
"keyword": "primary target keyword",
"reason": "Why this will perform well",
"format": "How-To Guide | Listicle | Comparison | Pillar Page | etc.",
"estimated_impact": "Beginner-friendly traffic opportunity"
}}
],
"content_strategy": [
{{
"title": "Pillar Content Title",
"keyword": "target keyword",
"reason": "Strategic importance for building topical authority",
"format": "Pillar Page | Ultimate Guide | Resource",
"estimated_impact": "Foundation for long-term organic growth"
}}
],
"long_term_strategy": [
{{
"title": "Authority Building Title",
"keyword": "target keyword",
"reason": "Establishes expertise and captures high-intent traffic over time",
"format": "Research-Backed Analysis | Expert Roundup | Original Study",
"estimated_impact": "Compound traffic growth over 6-12 months"
}}
]
}}

IMPORTANT:
- Provide 3-5 items in each category
- All suggestions MUST relate to the user's interest in "{keywords}"
- Titles should be specific, compelling, and SEO-aware
- Prioritize topics with clear search intent and realistic ranking potential for a new site
- Include a mix of easy wins (long-tail, low competition) and strategic pillar content
- For estimated_impact, describe the opportunity type (not click numbers since we lack data)"""

        llm_reply = llm_text_gen(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            flow_type="gsc_brainstorm_fallback",
        )
        # Only attempt to parse when the LLM actually returned something.
        recommendations = self._parse_ai_response(llm_reply) if llm_reply else None
    except Exception as e:
        # Best-effort fallback: a failure here is logged and reported as None.
        logger.warning(f"AI-only brainstorm fallback failed for user {user_id}: {e}")
        return None

    if not recommendations:
        return None

    # No GSC data exists on this path, so every metric is reported as zero.
    zero_distribution = {
        "positions_1_3": 0,
        "positions_4_10": 0,
        "positions_11_20": 0,
        "positions_21_plus": 0,
    }
    summary = {
        "site_url": site_url or "",
        "date_range": {
            "start": start_date or "",
            "end": end_date or "",
        },
        "total_keywords_analyzed": 0,
        "total_impressions": 0,
        "total_clicks": 0,
        "avg_ctr": 0,
        "avg_position": 0,
        "ctr_vs_benchmark": 0,
        "health_score": 0,
        "keyword_distribution": zero_distribution,
        "top_keywords": [],
        "top_pages": [],
        "note": "AI-generated suggestions based on your topic. No GSC data was available — these are strategic recommendations, not data-driven insights."
    }

    return {
        "content_opportunities": [],
        "keyword_gaps": [],
        "quick_wins": [],
        "page_opportunities": [],
        "ai_recommendations": recommendations,
        "summary": summary,
    }
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Data parsing helpers
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
@@ -188,7 +188,6 @@ class GSCService:
|
||||
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
# Check if table exists first to avoid error on fresh DB
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='gsc_credentials'")
|
||||
if not cursor.fetchone():
|
||||
return None
|
||||
@@ -204,7 +203,6 @@ class GSCService:
|
||||
|
||||
credentials_data = json.loads(result[0])
|
||||
|
||||
# Check for required fields, but allow connection without refresh token
|
||||
required_fields = ['token_uri', 'client_id', 'client_secret']
|
||||
missing_fields = [field for field in required_fields if not credentials_data.get(field)]
|
||||
|
||||
@@ -214,7 +212,6 @@ class GSCService:
|
||||
|
||||
credentials = Credentials.from_authorized_user_info(credentials_data, self.scopes)
|
||||
|
||||
# Refresh token if needed and possible
|
||||
if credentials.expired:
|
||||
if credentials.refresh_token:
|
||||
try:
|
||||
@@ -222,9 +219,11 @@ class GSCService:
|
||||
self.save_user_credentials(user_id, credentials)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to refresh GSC token for user {user_id}: {e}")
|
||||
self.clear_incomplete_credentials(user_id)
|
||||
return None
|
||||
else:
|
||||
logger.warning(f"GSC token expired for user {user_id} but no refresh token available - user needs to re-authorize")
|
||||
self.clear_incomplete_credentials(user_id)
|
||||
return None
|
||||
|
||||
return credentials
|
||||
@@ -288,7 +287,6 @@ class GSCService:
|
||||
try:
|
||||
logger.info(f"Handling GSC OAuth callback with state: {state[:20]}...")
|
||||
|
||||
# Extract user_id from state
|
||||
if ':' not in state:
|
||||
logger.error(f"Invalid GSC state format: {state}")
|
||||
return False
|
||||
@@ -300,17 +298,19 @@ class GSCService:
|
||||
logger.error(f"User database not found for user {user_id}")
|
||||
return False
|
||||
|
||||
# Verify state in user's DB (but don't delete yet — delete after successful token exchange)
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT user_id FROM gsc_oauth_states WHERE state = ?', (state,))
|
||||
result = cursor.fetchone()
|
||||
|
||||
if not result:
|
||||
logger.error(f"Invalid or expired GSC OAuth state for user {user_id}")
|
||||
return False
|
||||
|
||||
# Exchange code for credentials
|
||||
# Verify state in user's DB (best effort — if missing, attempt code exchange anyway)
|
||||
state_valid = False
|
||||
try:
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT user_id FROM gsc_oauth_states WHERE state = ?', (state,))
|
||||
state_valid = cursor.fetchone() is not None
|
||||
except Exception as state_err:
|
||||
logger.warning(f"State verification query failed, proceeding anyway: {state_err}")
|
||||
|
||||
if not state_valid:
|
||||
logger.warning(f"GSC OAuth state not found in DB for user {user_id} — will attempt code exchange without state verification")
|
||||
|
||||
if not self.client_config:
|
||||
logger.error("Cannot handle callback: Client configuration not loaded")
|
||||
return False
|
||||
@@ -324,21 +324,30 @@ class GSCService:
|
||||
|
||||
flow.fetch_token(code=authorization_code)
|
||||
credentials = flow.credentials
|
||||
|
||||
if not credentials or not credentials.token:
|
||||
logger.error(f"Token exchange returned empty credentials for user {user_id}")
|
||||
return False
|
||||
|
||||
# State consumed successfully — clean up
|
||||
try:
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
|
||||
conn.commit()
|
||||
except Exception as cleanup_err:
|
||||
logger.warning(f"Failed to clean up OAuth state: {cleanup_err}")
|
||||
# Clean up state if it was valid
|
||||
if state_valid:
|
||||
try:
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
|
||||
conn.commit()
|
||||
except Exception as cleanup_err:
|
||||
logger.warning(f"Failed to clean up OAuth state: {cleanup_err}")
|
||||
|
||||
# Save credentials
|
||||
return self.save_user_credentials(user_id, credentials)
|
||||
result = self.save_user_credentials(user_id, credentials)
|
||||
if result:
|
||||
logger.info(f"GSC OAuth callback succeeded for user {user_id} (state_valid={state_valid})")
|
||||
else:
|
||||
logger.error(f"GSC OAuth callback: token exchange succeeded but failed to save credentials for user {user_id}")
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error handling GSC OAuth callback: {e}")
|
||||
logger.error(f"Error handling GSC OAuth callback for user {user_id if 'user_id' in dir() else 'unknown'}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
@@ -726,6 +735,8 @@ class GSCService:
|
||||
with sqlite3.connect(db_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('DELETE FROM gsc_credentials WHERE user_id = ?', (user_id,))
|
||||
cursor.execute('DELETE FROM gsc_data_cache WHERE user_id = ?', (user_id,))
|
||||
cursor.execute('DELETE FROM gsc_oauth_states WHERE user_id = ?', (user_id,))
|
||||
conn.commit()
|
||||
|
||||
logger.info(f"Cleared incomplete GSC credentials for user: {user_id}")
|
||||
|
||||
@@ -66,12 +66,19 @@ class WixAuthService:
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def get_site_info(self, access_token: str) -> Dict[str, Any]:
|
||||
def get_site_info(self, access_token: str, meta_site_id: Optional[str] = None) -> Dict[str, Any]:
|
||||
headers = {
|
||||
'Authorization': f'Bearer {access_token}',
|
||||
'Content-Type': 'application/json'
|
||||
'Content-Type': 'application/json',
|
||||
}
|
||||
if self.client_id:
|
||||
headers['wix-client-id'] = self.client_id
|
||||
if meta_site_id:
|
||||
headers['wix-site-id'] = meta_site_id
|
||||
response = requests.get(f"{self.base_url}/sites/v1/site", headers=headers)
|
||||
if response.status_code == 404:
|
||||
logger.warning("Wix site info not found (404) — user may not have a published site or token lacks sites scope")
|
||||
return {"_no_site": True, "error": "No Wix site found for this account"}
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
@@ -295,39 +295,39 @@ def create_blog_post(
|
||||
wix_logger.log_token_info(token_length, has_blog_scope, meta_site_id)
|
||||
|
||||
# Convert markdown to Ricos
|
||||
ricos_content = convert_content_to_ricos(content, None)
|
||||
# PRIMARY: Use Wix Ricos Documents API for best formatting support (tables, complex markdown, etc.)
|
||||
# FALLBACK: Use custom parser if Wix API fails
|
||||
ricos_content = None
|
||||
try:
|
||||
logger.info("Converting markdown via Wix Ricos Documents API...")
|
||||
ricos_content = convert_via_wix_api(content, access_token, base_url)
|
||||
logger.info(f"Wix API conversion succeeded: {len(ricos_content.get('nodes', []))} nodes")
|
||||
except Exception as e:
|
||||
logger.warning(f"Wix API conversion failed, falling back to custom parser: {e}")
|
||||
|
||||
if not ricos_content or not isinstance(ricos_content, dict) or 'nodes' not in ricos_content:
|
||||
logger.info("Using custom markdown parser for Ricos conversion")
|
||||
ricos_content = convert_content_to_ricos(content, None)
|
||||
|
||||
nodes_count = len(ricos_content.get('nodes', []))
|
||||
wix_logger.log_ricos_conversion(nodes_count)
|
||||
|
||||
# Validate Ricos content structure
|
||||
# Per Wix Blog API documentation: richContent should ONLY contain 'nodes'
|
||||
# The example in docs shows: { nodes: [...] } - no type, id, metadata, or documentStyle
|
||||
if not isinstance(ricos_content, dict):
|
||||
logger.error(f"❌ richContent is not a dict: {type(ricos_content)}")
|
||||
logger.error(f"richContent is not a dict: {type(ricos_content)}")
|
||||
raise ValueError("richContent must be a dictionary object")
|
||||
|
||||
if 'nodes' not in ricos_content or not isinstance(ricos_content['nodes'], list):
|
||||
logger.error(f"❌ richContent.nodes is missing or not a list: {ricos_content.get('nodes', 'MISSING')}")
|
||||
logger.error(f"richContent.nodes is missing or not a list: {ricos_content.get('nodes', 'MISSING')}")
|
||||
raise ValueError("richContent must contain a 'nodes' array")
|
||||
|
||||
# Remove type and id fields (not expected by Blog API)
|
||||
# NOTE: metadata is optional - Wix UPDATE endpoint example shows it, but CREATE example doesn't
|
||||
# We'll keep it minimal (nodes only) for CREATE to match the recipe example
|
||||
fields_to_remove = ['type', 'id']
|
||||
for field in fields_to_remove:
|
||||
# Remove top-level fields not expected by Blog API CREATE endpoint
|
||||
# (Wix API converter may include type, id, metadata, documentStyle — strip them)
|
||||
for field in ['type', 'id', 'metadata', 'documentStyle']:
|
||||
if field in ricos_content:
|
||||
logger.debug(f"Removing '{field}' field from richContent (Blog API doesn't expect this)")
|
||||
logger.debug(f"Removing '{field}' from richContent for Blog API compatibility")
|
||||
del ricos_content[field]
|
||||
|
||||
# Remove metadata and documentStyle - Blog API CREATE endpoint example shows only 'nodes'
|
||||
# (UPDATE endpoint shows metadata, but we're using CREATE)
|
||||
if 'metadata' in ricos_content:
|
||||
logger.debug("Removing 'metadata' from richContent (CREATE endpoint expects only 'nodes')")
|
||||
del ricos_content['metadata']
|
||||
if 'documentStyle' in ricos_content:
|
||||
logger.debug("Removing 'documentStyle' from richContent (CREATE endpoint expects only 'nodes')")
|
||||
del ricos_content['documentStyle']
|
||||
|
||||
# Ensure we only have 'nodes' in richContent for CREATE endpoint
|
||||
ricos_content = {'nodes': ricos_content['nodes']}
|
||||
|
||||
|
||||
@@ -708,7 +708,48 @@ class SIFIntegrationService:
|
||||
themes = adv_insights.get('augmented_themes', [])
|
||||
if themes:
|
||||
text_content += f"Augmented Themes: {', '.join(themes[:5])}. "
|
||||
|
||||
|
||||
freshness = adv_insights.get('freshness', {})
|
||||
if freshness:
|
||||
text_content += (f"Content Freshness Score: {freshness.get('freshness_score', 'N/A')}. "
|
||||
f"Publishing Velocity: {freshness.get('publishing_velocity', 0)}/week. "
|
||||
f"Trend: {freshness.get('publishing_trend', 'unknown')}. "
|
||||
f"Last 30d: {freshness.get('publishing_recency', {}).get('last_30d', 0)} pages. ")
|
||||
|
||||
link_health = adv_insights.get('link_health', {})
|
||||
if link_health and 'error' not in link_health:
|
||||
text_content += (f"Internal Links: {link_health.get('internal_link_count', 0)}. "
|
||||
f"External Links: {link_health.get('external_link_count', 0)}. "
|
||||
f"Nofollow: {link_health.get('nofollow_link_count', 0)}. "
|
||||
f"Avg Links/Page: {link_health.get('avg_links_per_page', 0)}. ")
|
||||
|
||||
redirects = adv_insights.get('redirect_audit', {})
|
||||
if redirects and 'error' not in redirects:
|
||||
text_content += (f"Redirects: {redirects.get('total_redirects', 0)} total, "
|
||||
f"{redirects.get('multi_hop_chains', 0)} multi-hop. ")
|
||||
|
||||
image_seo = adv_insights.get('image_seo', {})
|
||||
if image_seo and 'error' not in image_seo:
|
||||
text_content += (f"Images: {image_seo.get('total_images', 0)} total, "
|
||||
f"Alt Coverage: {image_seo.get('alt_coverage_percentage', 0)}%. ")
|
||||
|
||||
url_struct = adv_insights.get('url_structure', {})
|
||||
if url_struct:
|
||||
text_content += (f"URL Structure: {url_struct.get('total_urls_analyzed', 0)} URLs, "
|
||||
f"Avg Depth: {url_struct.get('directory_depth', {}).get('average_depth', 0)}. "
|
||||
f"Params: {url_struct.get('parameter_usage', {}).get('percentage_with_params', 0)}%. ")
|
||||
|
||||
robots = adv_insights.get('robots_txt', {})
|
||||
if robots and robots.get('success'):
|
||||
text_content += (f"Robots.txt: {robots.get('total_directives', 0)} directives, "
|
||||
f"Compliance: {robots.get('compliance_score', 0)}/100. "
|
||||
f"Issues: {len(robots.get('issues', []))}. ")
|
||||
|
||||
budget = adv_insights.get('crawl_budget', {})
|
||||
if budget and budget.get('success'):
|
||||
text_content += (f"Crawl Budget: {budget.get('pages_crawled', 0)} crawled of {budget.get('sitemap_total_urls', 0)} URLs. "
|
||||
f"Waste: {budget.get('waste_percentage', 0)}%. "
|
||||
f"Score: {budget.get('optimization_score', 0)}. ")
|
||||
# Add Technical SEO overview
|
||||
tech_audit = dashboard_data.get('technical_seo_audit', {})
|
||||
if tech_audit:
|
||||
|
||||
@@ -370,6 +370,136 @@ class FailureDetectionService:
|
||||
"last_failure": task.last_failure.isoformat() if task.last_failure else None
|
||||
})
|
||||
|
||||
# Check onboarding full website analysis tasks
|
||||
from models.website_analysis_monitoring_models import OnboardingFullWebsiteAnalysisTask
|
||||
onboarding_tasks = self.db.query(OnboardingFullWebsiteAnalysisTask).filter(
|
||||
OnboardingFullWebsiteAnalysisTask.status == "needs_intervention"
|
||||
)
|
||||
if user_id:
|
||||
onboarding_tasks = onboarding_tasks.filter(OnboardingFullWebsiteAnalysisTask.user_id == user_id)
|
||||
|
||||
for task in onboarding_tasks.all():
|
||||
pattern = self.analyze_task_failures(task.id, "onboarding_full_website_analysis", task.user_id)
|
||||
tasks_needing_intervention.append({
|
||||
"task_id": task.id,
|
||||
"task_type": "onboarding_full_website_analysis",
|
||||
"user_id": task.user_id,
|
||||
"website_url": task.website_url,
|
||||
"failure_pattern": {
|
||||
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
|
||||
"recent_failures": pattern.recent_failures if pattern else 0,
|
||||
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
|
||||
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
|
||||
"error_patterns": pattern.error_patterns if pattern else [],
|
||||
},
|
||||
"failure_reason": task.failure_reason,
|
||||
"last_failure": task.last_failure.isoformat() if task.last_failure else None
|
||||
})
|
||||
|
||||
# Check deep competitor analysis tasks
|
||||
from models.website_analysis_monitoring_models import DeepCompetitorAnalysisTask
|
||||
competitor_tasks = self.db.query(DeepCompetitorAnalysisTask).filter(
|
||||
DeepCompetitorAnalysisTask.status == "needs_intervention"
|
||||
)
|
||||
if user_id:
|
||||
competitor_tasks = competitor_tasks.filter(DeepCompetitorAnalysisTask.user_id == user_id)
|
||||
|
||||
for task in competitor_tasks.all():
|
||||
pattern = self.analyze_task_failures(task.id, "deep_competitor_analysis", task.user_id)
|
||||
tasks_needing_intervention.append({
|
||||
"task_id": task.id,
|
||||
"task_type": "deep_competitor_analysis",
|
||||
"user_id": task.user_id,
|
||||
"website_url": task.website_url,
|
||||
"failure_pattern": {
|
||||
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
|
||||
"recent_failures": pattern.recent_failures if pattern else 0,
|
||||
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
|
||||
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
|
||||
"error_patterns": pattern.error_patterns if pattern else [],
|
||||
},
|
||||
"failure_reason": task.failure_reason,
|
||||
"last_failure": task.last_failure.isoformat() if task.last_failure else None
|
||||
})
|
||||
|
||||
# Check SIF indexing tasks
|
||||
from models.website_analysis_monitoring_models import SIFIndexingTask
|
||||
sif_tasks = self.db.query(SIFIndexingTask).filter(
|
||||
SIFIndexingTask.status == "needs_intervention"
|
||||
)
|
||||
if user_id:
|
||||
sif_tasks = sif_tasks.filter(SIFIndexingTask.user_id == user_id)
|
||||
|
||||
for task in sif_tasks.all():
|
||||
pattern = self.analyze_task_failures(task.id, "sif_indexing", task.user_id)
|
||||
tasks_needing_intervention.append({
|
||||
"task_id": task.id,
|
||||
"task_type": "sif_indexing",
|
||||
"user_id": task.user_id,
|
||||
"website_url": task.website_url,
|
||||
"failure_pattern": {
|
||||
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
|
||||
"recent_failures": pattern.recent_failures if pattern else 0,
|
||||
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
|
||||
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
|
||||
"error_patterns": pattern.error_patterns if pattern else [],
|
||||
},
|
||||
"failure_reason": task.failure_reason,
|
||||
"last_failure": task.last_failure.isoformat() if task.last_failure else None
|
||||
})
|
||||
|
||||
# Check market trends tasks
|
||||
from models.website_analysis_monitoring_models import MarketTrendsTask
|
||||
trends_tasks = self.db.query(MarketTrendsTask).filter(
|
||||
MarketTrendsTask.status == "needs_intervention"
|
||||
)
|
||||
if user_id:
|
||||
trends_tasks = trends_tasks.filter(MarketTrendsTask.user_id == user_id)
|
||||
|
||||
for task in trends_tasks.all():
|
||||
pattern = self.analyze_task_failures(task.id, "market_trends", task.user_id)
|
||||
tasks_needing_intervention.append({
|
||||
"task_id": task.id,
|
||||
"task_type": "market_trends",
|
||||
"user_id": task.user_id,
|
||||
"website_url": task.website_url,
|
||||
"failure_pattern": {
|
||||
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
|
||||
"recent_failures": pattern.recent_failures if pattern else 0,
|
||||
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
|
||||
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
|
||||
"error_patterns": pattern.error_patterns if pattern else [],
|
||||
},
|
||||
"failure_reason": task.failure_reason,
|
||||
"last_failure": task.last_failure.isoformat() if task.last_failure else None
|
||||
})
|
||||
|
||||
# Check advertools tasks (paused tasks may also need attention)
|
||||
from models.website_analysis_monitoring_models import AdvertoolsTask
|
||||
advertools_tasks = self.db.query(AdvertoolsTask).filter(
|
||||
AdvertoolsTask.status.in_(["needs_intervention", "failed"])
|
||||
)
|
||||
if user_id:
|
||||
advertools_tasks = advertools_tasks.filter(AdvertoolsTask.user_id == user_id)
|
||||
|
||||
for task in advertools_tasks.all():
|
||||
pattern = self.analyze_task_failures(task.id, "advertools", task.user_id)
|
||||
tasks_needing_intervention.append({
|
||||
"task_id": task.id,
|
||||
"task_type": "advertools",
|
||||
"user_id": task.user_id,
|
||||
"website_url": task.website_url,
|
||||
"failure_pattern": {
|
||||
"consecutive_failures": pattern.consecutive_failures if pattern else task.consecutive_failures,
|
||||
"recent_failures": pattern.recent_failures if pattern else 0,
|
||||
"failure_reason": pattern.failure_reason.value if pattern else "unknown",
|
||||
"last_failure_time": pattern.last_failure_time.isoformat() if pattern and pattern.last_failure_time else None,
|
||||
"error_patterns": pattern.error_patterns if pattern else [],
|
||||
},
|
||||
"failure_reason": task.failure_reason,
|
||||
"last_failure": task.last_failure.isoformat() if task.last_failure else None
|
||||
})
|
||||
|
||||
return tasks_needing_intervention
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import text
|
||||
@@ -63,27 +64,66 @@ class AdvertoolsExecutor:
|
||||
|
||||
result = {}
|
||||
if task_type == 'content_audit':
|
||||
# Phase 1: Audit content themes using sample URLs from sitemap
|
||||
# First, get the sitemap to find recent URLs
|
||||
# Phase 1: Get sitemap analysis (freshness, URL structure, pillars)
|
||||
sitemap_result = await self.advertools_service.analyze_sitemap(effective_url)
|
||||
|
||||
audit_urls = []
|
||||
url_structure = {}
|
||||
freshness = {}
|
||||
if sitemap_result.get('success'):
|
||||
# Use the sample URLs returned by the service
|
||||
audit_urls = sitemap_result.get('metrics', {}).get('audit_sample_urls', [])
|
||||
metrics = sitemap_result.get('metrics', {})
|
||||
audit_urls = metrics.get('audit_sample_urls', [])
|
||||
url_structure = metrics.get('url_structure', {})
|
||||
freshness = {
|
||||
"freshness_score": metrics.get('freshness_score'),
|
||||
"publishing_velocity": metrics.get('publishing_velocity'),
|
||||
"stale_content_percentage": metrics.get('stale_content_percentage'),
|
||||
"publishing_recency": metrics.get('publishing_recency'),
|
||||
"publishing_trend": metrics.get('publishing_trend'),
|
||||
}
|
||||
|
||||
if not audit_urls:
|
||||
# Fallback to homepage if sitemap fails or empty
|
||||
audit_urls = [website_url]
|
||||
|
||||
# Run the audit on the sample
|
||||
result = await self.advertools_service.audit_content(audit_urls)
|
||||
# Phase 2: Theme analysis via content audit
|
||||
audit_result = await self.advertools_service.audit_content(audit_urls)
|
||||
|
||||
# Phase 3: Site structure analysis (links, redirects, image SEO)
|
||||
site_domain = urlparse(website_url).netloc or website_url
|
||||
structure_result = await self.advertools_service.analyze_site_structure(
|
||||
audit_urls, site_domain=site_domain
|
||||
)
|
||||
|
||||
# Phase 4: Robots.txt compliance analysis
|
||||
robots_result = await self.advertools_service.analyze_robots_txt(website_url)
|
||||
|
||||
# Phase 5: Crawl budget analysis
|
||||
budget_result = await self.advertools_service.analyze_crawl_budget(
|
||||
effective_url, site_domain
|
||||
)
|
||||
|
||||
# Merge results
|
||||
result = {
|
||||
"success": audit_result.get('success', False) or structure_result.get('success', False),
|
||||
"themes": audit_result.get('themes', []),
|
||||
"page_count": audit_result.get('page_count', 0),
|
||||
"avg_word_count": audit_result.get('avg_word_count', 0),
|
||||
"link_health": structure_result.get('link_health', {}),
|
||||
"redirect_audit": structure_result.get('redirect_audit', {}),
|
||||
"image_seo": structure_result.get('image_seo', {}),
|
||||
"page_status": structure_result.get('page_status', {}),
|
||||
"url_structure": url_structure,
|
||||
"freshness": freshness,
|
||||
"robots_txt": robots_result,
|
||||
"crawl_budget": budget_result,
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
if result.get('success'):
|
||||
await self._update_persona_augmentation(user_id, website_url, result, db)
|
||||
|
||||
elif task_type == 'site_health':
|
||||
# Phase 1: Check site health (freshness, velocity)
|
||||
# Site health: freshness, velocity, URL structure
|
||||
result = await self.advertools_service.analyze_sitemap(effective_url)
|
||||
|
||||
if result.get('success'):
|
||||
@@ -157,7 +197,8 @@ class AdvertoolsExecutor:
|
||||
|
||||
async def _update_persona_augmentation(self, user_id: str, website_url: str, audit_result: Dict[str, Any], db: Session):
|
||||
"""
|
||||
Updates the user's Brand Persona with discovered themes from the content audit.
|
||||
Updates the user's Brand Persona with discovered themes, site structure,
|
||||
link health, and redirect data from the content audit.
|
||||
"""
|
||||
try:
|
||||
session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
|
||||
@@ -170,18 +211,40 @@ class AdvertoolsExecutor:
|
||||
self.logger.warning(f"No website analysis found for user {user_id}")
|
||||
return
|
||||
|
||||
# Update brand_analysis with augmented themes
|
||||
current_brand = analysis.brand_analysis or {}
|
||||
|
||||
# Add or update the 'augmented_themes' field
|
||||
# Core themes
|
||||
current_brand['augmented_themes'] = audit_result.get('themes', [])
|
||||
|
||||
# Link health
|
||||
current_brand['link_health'] = audit_result.get('link_health', {})
|
||||
|
||||
# Redirect audit
|
||||
current_brand['redirect_audit'] = audit_result.get('redirect_audit', {})
|
||||
|
||||
# Image SEO
|
||||
current_brand['image_seo'] = audit_result.get('image_seo', {})
|
||||
|
||||
# Page status distribution
|
||||
current_brand['page_status'] = audit_result.get('page_status', {})
|
||||
|
||||
# URL structure analysis
|
||||
current_brand['url_structure'] = audit_result.get('url_structure', {})
|
||||
|
||||
# Freshness
|
||||
current_brand['freshness'] = audit_result.get('freshness', {})
|
||||
|
||||
# Robots.txt compliance
|
||||
current_brand['robots_txt'] = audit_result.get('robots_txt', {})
|
||||
|
||||
# Crawl budget analysis
|
||||
current_brand['crawl_budget'] = audit_result.get('crawl_budget', {})
|
||||
|
||||
current_brand['last_advertools_audit'] = datetime.utcnow().isoformat()
|
||||
|
||||
# Force SQLAlchemy to detect change in JSON field
|
||||
from sqlalchemy.orm.attributes import flag_modified
|
||||
flag_modified(analysis, "brand_analysis")
|
||||
|
||||
# Also update content_strategy_insights if relevant
|
||||
if 'avg_word_count' in audit_result:
|
||||
current_strategy = analysis.content_strategy_insights or {}
|
||||
current_strategy['avg_content_length'] = audit_result['avg_word_count']
|
||||
@@ -196,7 +259,8 @@ class AdvertoolsExecutor:
|
||||
|
||||
async def _update_site_health_metrics(self, user_id: str, website_url: str, health_result: Dict[str, Any], db: Session):
|
||||
"""
|
||||
Updates the WebsiteAnalysis with site health metrics (velocity, freshness).
|
||||
Updates the WebsiteAnalysis with site health metrics (velocity, freshness,
|
||||
URL structure analysis, freshness score).
|
||||
"""
|
||||
try:
|
||||
session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
|
||||
@@ -207,7 +271,6 @@ class AdvertoolsExecutor:
|
||||
if not analysis:
|
||||
return
|
||||
|
||||
# Update seo_audit with health metrics
|
||||
current_seo = analysis.seo_audit or {}
|
||||
metrics = health_result.get('metrics', {})
|
||||
|
||||
@@ -216,7 +279,11 @@ class AdvertoolsExecutor:
|
||||
"publishing_velocity": metrics.get('publishing_velocity'),
|
||||
"stale_content_count": metrics.get('stale_content_count'),
|
||||
"stale_content_percentage": metrics.get('stale_content_percentage'),
|
||||
"top_pillars": metrics.get('top_pillars')
|
||||
"freshness_score": metrics.get('freshness_score'),
|
||||
"publishing_recency": metrics.get('publishing_recency'),
|
||||
"publishing_trend": metrics.get('publishing_trend'),
|
||||
"top_pillars": metrics.get('top_pillars'),
|
||||
"url_structure": metrics.get('url_structure', {})
|
||||
}
|
||||
current_seo['last_advertools_health_check'] = datetime.utcnow().isoformat()
|
||||
|
||||
|
||||
@@ -1,12 +1,18 @@
|
||||
import advertools as adv
|
||||
import pandas as pd
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Optional
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from urllib.parse import urlparse
|
||||
from collections import Counter
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import socket
|
||||
import re
|
||||
|
||||
class AdvertoolsService:
|
||||
"""
|
||||
@@ -19,51 +25,58 @@ class AdvertoolsService:
|
||||
|
||||
async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.
|
||||
Analyzes a website's sitemap to extract metrics on publishing velocity, freshness,
|
||||
URL structure patterns, and topic distribution.
|
||||
"""
|
||||
try:
|
||||
self.logger.info(f"Analyzing sitemap: {sitemap_url}")
|
||||
|
||||
# advertools sitemap_to_df is blocking, run in executor
|
||||
loop = asyncio.get_event_loop()
|
||||
df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
|
||||
|
||||
if df is None or df.empty:
|
||||
return {"success": False, "error": "Sitemap is empty or could not be parsed."}
|
||||
|
||||
# Convert lastmod to datetime
|
||||
if 'lastmod' in df.columns:
|
||||
df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
|
||||
|
||||
total_urls = len(df)
|
||||
|
||||
# Handle potential empty datetime columns
|
||||
if 'lastmod' in df.columns and not df['lastmod'].isna().all():
|
||||
now = datetime.now(df['lastmod'].dt.tz)
|
||||
thirty_days_ago = now - timedelta(days=30)
|
||||
recent_urls = df[df['lastmod'] > thirty_days_ago]
|
||||
six_months_ago = now - timedelta(days=180)
|
||||
stale_urls = df[df['lastmod'] < six_months_ago]
|
||||
|
||||
publishing_velocity = len(recent_urls) / 4.0 # URLs per week
|
||||
stale_count = len(stale_urls)
|
||||
else:
|
||||
publishing_velocity = 0
|
||||
stale_count = 0
|
||||
# --- Content Freshness Scoring ---
|
||||
freshness = self._compute_freshness(df)
|
||||
|
||||
# Enhanced Content Pillars (Top folder patterns - 3 levels deep)
|
||||
def extract_hierarchy(url: str):
|
||||
try:
|
||||
parts = urlparse(url).path.strip('/').split('/')
|
||||
if not parts or not parts[0]: return "home"
|
||||
return "/".join(parts[:2]) # Capture top 2 segments
|
||||
except:
|
||||
return "other"
|
||||
# --- URL Structure Analysis ---
|
||||
url_structure = {}
|
||||
if 'loc' in df.columns:
|
||||
url_structure = await self._analyze_url_structure(df['loc'].tolist())
|
||||
|
||||
# --- Content Pillars via url_to_df ---
|
||||
pillars = {}
|
||||
url_df = None
|
||||
try:
|
||||
url_df = adv.url_to_df(df['loc'])
|
||||
if url_df is not None and not url_df.empty:
|
||||
dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
|
||||
if dir_cols:
|
||||
pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
|
||||
for col in dir_cols[1:3]:
|
||||
mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan')
|
||||
pillar_series = pillar_series + "/" + url_df[col].where(mask, "")
|
||||
pillars = pillar_series.value_counts().head(15).to_dict()
|
||||
except Exception:
|
||||
fallback_pillars = {}
|
||||
if 'loc' in df.columns:
|
||||
def extract_hierarchy(url: str):
|
||||
try:
|
||||
parts = urlparse(url).path.strip('/').split('/')
|
||||
if not parts or not parts[0]: return "home"
|
||||
return "/".join(parts[:2])
|
||||
except:
|
||||
return "other"
|
||||
fallback_pillars = df['loc'].apply(extract_hierarchy).value_counts().head(15).to_dict()
|
||||
pillars = fallback_pillars
|
||||
|
||||
df['pillar'] = df['loc'].apply(extract_hierarchy)
|
||||
pillars = df['pillar'].value_counts().head(15).to_dict()
|
||||
|
||||
# Return a sample of URLs for auditing (top 15 most recent if available)
|
||||
# Sample URLs for auditing (top 15 most recent)
|
||||
audit_urls = []
|
||||
if 'lastmod' in df.columns and not df['lastmod'].isna().all():
|
||||
audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
|
||||
@@ -74,10 +87,14 @@ class AdvertoolsService:
|
||||
"success": True,
|
||||
"metrics": {
|
||||
"total_urls": total_urls,
|
||||
"publishing_velocity": round(publishing_velocity, 2),
|
||||
"stale_content_count": stale_count,
|
||||
"stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
|
||||
"publishing_velocity": freshness.get("publishing_velocity"),
|
||||
"stale_content_count": freshness.get("stale_count"),
|
||||
"stale_content_percentage": freshness.get("stale_percentage"),
|
||||
"freshness_score": freshness.get("freshness_score"),
|
||||
"publishing_recency": freshness.get("publishing_recency"),
|
||||
"publishing_trend": freshness.get("publishing_trend"),
|
||||
"top_pillars": pillars,
|
||||
"url_structure": url_structure,
|
||||
"audit_sample_urls": audit_urls
|
||||
},
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
@@ -86,6 +103,146 @@ class AdvertoolsService:
|
||||
self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
def _compute_freshness(self, df: pd.DataFrame) -> Dict[str, Any]:
    """Compute content freshness, publishing velocity, and staleness metrics.

    Args:
        df: Sitemap dataframe. Only the ``lastmod`` column is read. Rows whose
            ``lastmod`` is missing fall outside every time window but still
            count towards the URL total used for percentages.

    Returns:
        A dict with the keys:

        * ``publishing_velocity`` - URLs modified per week over the last 90 days.
        * ``stale_count`` / ``stale_percentage`` - URLs whose ``lastmod`` is
          more than 180 days old.
        * ``freshness_score`` - 0-100 blend of non-stale share (50 points),
          share modified in the last 30 days (30 points) and velocity
          (20 points, saturating at 10 URLs/week).
        * ``publishing_recency`` - counts for the last 24h / 7d / 30d / 90d.
        * ``publishing_trend`` - ``"increasing"``, ``"decreasing"`` or
          ``"stable"`` (last 30 days vs. the 30 days before, with a 10%
          tolerance).

        When there is no ``lastmod`` column or it holds no usable date, every
        metric is zero/empty and the trend is ``"unknown"``.
    """
    result: Dict[str, Any] = {
        "publishing_velocity": 0,
        "stale_count": 0,
        "stale_percentage": 0,
        "freshness_score": 0,
        "publishing_recency": {},
        "publishing_trend": "unknown"
    }

    if 'lastmod' not in df.columns:
        return result

    lastmod = df['lastmod']
    # The window arithmetic below needs real datetimes. Coerce raw values
    # (e.g. strings) instead of failing on the ``.dt`` accessor; columns that
    # are already datetimes are left untouched.
    if not pd.api.types.is_datetime64_any_dtype(lastmod):
        lastmod = pd.to_datetime(lastmod, errors='coerce', utc=True)

    lastmod = lastmod.dropna()
    if lastmod.empty:
        return result

    # Use the column's own timezone so aware/naive values are never mixed.
    now = datetime.now(lastmod.dt.tz)

    def modified_within(days: int) -> int:
        """Count URLs whose lastmod is newer than ``days`` days ago."""
        return int((lastmod > now - timedelta(days=days)).sum())

    # len(df) is at least 1 here because at least one lastmod value survived.
    total_urls = len(df)
    last_30d = modified_within(30)
    last_60d = modified_within(60)
    last_90d = modified_within(90)

    stale_count = int((lastmod < now - timedelta(days=180)).sum())
    stale_percentage = round((stale_count / total_urls) * 100, 2)

    # Publishing velocity: URLs per week over last 90 days (~13 weeks)
    publishing_velocity = round(last_90d / 13.0, 2) if last_90d else 0

    # Freshness score (0-100): weighted combination of metrics
    non_stale_ratio = 1.0 - (stale_percentage / 100.0)
    recency_ratio = last_30d / total_urls
    velocity_score = min(publishing_velocity / 10.0, 1.0)
    freshness_score = round((non_stale_ratio * 50 + recency_ratio * 30 + velocity_score * 20), 1)

    publishing_recency = {
        "last_24h": modified_within(1),
        "last_7d": modified_within(7),
        "last_30d": last_30d,
        "last_90d": last_90d,
    }

    # Publishing trend: compare recent 30d vs prior 30d (days 31-60)
    prior_count = last_60d - last_30d
    if last_30d > prior_count * 1.1:
        publishing_trend = "increasing"
    elif last_30d < prior_count * 0.9:
        publishing_trend = "decreasing"
    else:
        publishing_trend = "stable"

    return {
        "publishing_velocity": publishing_velocity,
        "stale_count": stale_count,
        "stale_percentage": stale_percentage,
        "freshness_score": freshness_score,
        "publishing_recency": publishing_recency,
        "publishing_trend": publishing_trend
    }
|
||||
|
||||
async def _analyze_url_structure(self, urls: List[str]) -> Dict[str, Any]:
    """Analyze URL patterns for parameter bloat, directory depth, and path patterns.

    Args:
        urls: URLs to split into components with ``adv.url_to_df``.

    Returns:
        A dict with ``total_urls_analyzed``, ``parameter_usage`` (count and
        percentage of URLs carrying a query string plus the ten most common
        parameter names), ``directory_depth`` (average, max and distribution),
        ``protocols`` (count per scheme) and ``subdomains`` (most frequent
        host and number of distinct hosts). Returns ``{}`` when nothing could
        be parsed or when any step fails; failures are logged as warnings,
        never raised.
    """
    from collections import Counter

    try:
        # url_to_df is a synchronous call; run it in the default executor.
        loop = asyncio.get_running_loop()
        url_df = await loop.run_in_executor(None, lambda: adv.url_to_df(urls))

        if url_df is None or url_df.empty:
            return {}

        # A non-empty frame has at least one row, so dividing by total is safe.
        total = len(url_df)

        # Query param analysis (guarded like 'scheme'/'netloc' below, so a
        # missing column does not throw away the whole analysis).
        if 'query' in url_df.columns:
            has_query = url_df['query'].notna() & (url_df['query'] != '')
        else:
            has_query = pd.Series(False, index=url_df.index)
        param_count = int(has_query.sum())
        param_percentage = round((param_count / total) * 100, 2)

        # Parameter names are tallied once per distinct query string.
        param_frequency = {}
        if param_count > 0:
            param_names = []
            for query in url_df.loc[has_query, 'query'].unique():
                for pair in query.split('&'):
                    # Skip empty segments produced by '&&' or a trailing '&'.
                    if pair:
                        param_names.append(pair.partition('=')[0])
            param_frequency = dict(Counter(param_names).most_common(10))

        # Directory depth analysis: depth is the number of leading dir_N
        # columns that hold a value.
        dir_cols = [c for c in url_df.columns if c.startswith('dir_')]

        def count_depth(row):
            for i, col in enumerate(dir_cols):
                val = row[col]
                if pd.isna(val) or str(val) == 'nan' or str(val).strip() == '':
                    return i
            return len(dir_cols)

        depths = url_df.apply(count_depth, axis=1)
        avg_depth = round(float(depths.mean()), 1) if not depths.empty else 0
        max_depth = int(depths.max()) if not depths.empty else 0
        depth_distribution = {
            str(depth): int(count)
            for depth, count in depths.value_counts().sort_index().head(10).items()
        }

        # Protocol consistency
        schemes = url_df['scheme'].value_counts().to_dict() if 'scheme' in url_df.columns else {}

        # Subdomain analysis. value_counts() yields one entry per distinct
        # host, so its length is the number of distinct hosts; calling
        # nunique() on it would count distinct *frequencies* instead.
        primary_domain = ""
        unique_subdomains = 0
        if 'netloc' in url_df.columns:
            netloc_counts = url_df['netloc'].value_counts()
            unique_subdomains = len(netloc_counts)
            if not netloc_counts.empty:
                # value_counts() sorts by frequency, most common first.
                primary_domain = netloc_counts.index[0]

        return {
            "total_urls_analyzed": total,
            "parameter_usage": {
                "urls_with_params": param_count,
                "percentage_with_params": param_percentage,
                "top_parameters": param_frequency
            },
            "directory_depth": {
                "average_depth": avg_depth,
                "max_depth": max_depth,
                "distribution": depth_distribution
            },
            "protocols": {str(k): int(v) for k, v in schemes.items()},
            "subdomains": {
                "primary": primary_domain,
                "unique_count": unique_subdomains
            }
        }
    except Exception as e:
        # Best-effort analysis: callers treat an empty dict as "no data".
        self.logger.warning(f"URL structure analysis failed: {e}")
        return {}
|
||||
|
||||
async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Performs a shallow crawl and theme analysis using word frequency.
|
||||
@@ -153,6 +310,512 @@ class AdvertoolsService:
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
|
||||
|
||||
async def analyze_site_structure(self, url_list: List[str], site_domain: Optional[str] = None) -> Dict[str, Any]:
    """
    Crawl the given pages (following links) and summarise the site's structure.

    The crawl output is passed to advertools' crawlytics helpers to build the
    optional ``link_health``, ``redirect_audit`` and ``image_seo`` sections,
    followed by page-level counts (status codes, missing titles and missing
    meta descriptions). A failure inside one section is logged and does not
    abort the remaining sections.

    Args:
        url_list: Seed URLs handed to the crawler.
        site_domain: When truthy, restricts the crawl to this domain and is
            passed to crawlytics as the internal-URL pattern.

    Returns:
        ``{"success": True, "page_count": ..., ...}`` on success, otherwise
        ``{"success": False, "error": ...}``.
    """
    temp_file = None
    try:
        self.logger.info(f"Analyzing site structure for {len(url_list)} URLs, domain={site_domain}")

        # Reserve a unique output path; the crawler writes to it once the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as handle:
            temp_file = handle.name

        event_loop = asyncio.get_event_loop()

        def _crawl():
            # Blocking crawl, executed on the default executor.
            return adv.crawl(
                url_list=url_list,
                output_file=temp_file,
                follow_links=True,
                allowed_domains=[site_domain] if site_domain else None,
                custom_settings={
                    'LOG_LEVEL': 'WARNING',
                    'CLOSESPIDER_PAGECOUNT': 50,
                    'DOWNLOAD_TIMEOUT': 30,
                    'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
                    'DEPTH_LIMIT': 3,
                },
            )

        await event_loop.run_in_executor(None, _crawl)

        crawl_has_output = os.path.exists(temp_file) and os.path.getsize(temp_file) != 0
        if not crawl_has_output:
            return {"success": False, "error": "Site structure crawl produced no output."}

        crawl_df = pd.read_json(temp_file, lines=True)
        result = {"success": True, "page_count": len(crawl_df)}

        # --- Link Health via crawlytics ---
        try:
            link_df = adv.crawlytics.links(
                crawl_df, internal_url_regex=site_domain if site_domain else None
            )
            if link_df is None or link_df.empty:
                result["link_health"] = {"error": "No links found in crawl data"}
            else:
                link_columns = link_df.columns
                total_links = len(link_df)
                internal_links = int(link_df['internal'].sum()) if 'internal' in link_columns else 0
                nofollow_links = int(link_df['nofollow'].sum()) if 'nofollow' in link_columns else 0
                external_links = total_links - internal_links

                # Rows are grouped on the first index level to count links per page.
                per_page = link_df.groupby(level=0).size()
                avg_links_per_page = round(per_page.mean(), 1) if not per_page.empty else 0

                # Word frequencies over the anchor text of internal links only.
                anchor_words = []
                if 'text' in link_columns and 'internal' in link_columns:
                    for anchor in link_df[link_df['internal'] == True]['text'].dropna():
                        if not isinstance(anchor, str) or not anchor.strip():
                            continue
                        anchor_words.extend(
                            word.strip() for word in anchor.split() if len(word.strip()) > 2
                        )
                from collections import Counter
                top_anchor_words = dict(Counter(anchor_words).most_common(15)) if anchor_words else {}

                result["link_health"] = {
                    "total_links_found": total_links,
                    "internal_link_count": internal_links,
                    "external_link_count": external_links,
                    "internal_link_percentage": round((internal_links / total_links) * 100, 1) if total_links > 0 else 0,
                    "nofollow_link_count": nofollow_links,
                    "avg_links_per_page": avg_links_per_page,
                    "top_anchor_words": top_anchor_words,
                }
        except Exception as e:
            self.logger.warning(f"Link analysis failed: {e}")
            result["link_health"] = {"error": str(e)}

        # --- Redirect Chain Audit via crawlytics ---
        try:
            redirect_df = adv.crawlytics.redirects(crawl_df)
            if redirect_df is None or redirect_df.empty:
                result["redirect_audit"] = {"total_redirects": 0, "note": "No redirects detected"}
            else:
                has_redirect_times = 'redirect_times' in redirect_df.columns
                total_redirects = len(redirect_df)
                distinct_times = redirect_df['redirect_times'].nunique() if has_redirect_times else 0
                if 'status' in redirect_df.columns:
                    status_counts = redirect_df['status'].value_counts().to_dict()
                else:
                    status_counts = {}
                if has_redirect_times:
                    multi_hop = redirect_df[redirect_df['redirect_times'] > 1]
                else:
                    multi_hop = pd.DataFrame()

                result["redirect_audit"] = {
                    "total_redirects": int(total_redirects),
                    "unique_chains": int(distinct_times),
                    "status_distribution": {str(code): int(count) for code, count in status_counts.items()},
                    "multi_hop_chains": int(len(multi_hop)),
                    "affected_pages": multi_hop.index.unique().tolist() if not multi_hop.empty else [],
                }
        except Exception as e:
            self.logger.warning(f"Redirect analysis failed: {e}")
            result["redirect_audit"] = {"error": str(e)}

        # --- Image SEO overview via crawlytics ---
        try:
            img_df = adv.crawlytics.images(crawl_df)
            if img_df is not None and not img_df.empty:
                total_images = len(img_df)
                missing_alt = int(img_df['img_alt'].isna().sum()) if 'img_alt' in img_df.columns else 0
                if total_images > 0:
                    alt_coverage = round(((total_images - missing_alt) / total_images) * 100, 1)
                else:
                    alt_coverage = 0
                result["image_seo"] = {
                    "total_images": total_images,
                    "missing_alt_count": missing_alt,
                    "alt_coverage_percentage": alt_coverage,
                }
        except Exception as e:
            self.logger.warning(f"Image analysis failed: {e}")

        # --- Page-level metrics ---
        if 'status' in crawl_df.columns:
            page_status_counts = crawl_df['status'].value_counts().to_dict()
            result["page_status"] = {str(code): int(count) for code, count in page_status_counts.items()}
        for column, result_key in (("title", "missing_titles"), ("meta_desc", "missing_descriptions")):
            if column in crawl_df.columns:
                result[result_key] = int(crawl_df[column].isna().sum())

        result["timestamp"] = datetime.utcnow().isoformat()
        return result

    except Exception as e:
        self.logger.error(f"Failed to analyze site structure: {str(e)}")
        return {"success": False, "error": str(e)}
    finally:
        # The temp file was created with delete=False, so remove it explicitly.
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
|
||||
|
||||
async def analyze_robots_txt(self, website_url: str) -> Dict[str, Any]:
    """
    Fetch and analyze robots.txt for compliance issues.

    Checks directives, sitemap declaration, crawl-delay, and common problems.
    The file is first loaded through ``adv.robotstxt_to_df``; if that fails or
    yields a frame without recognisable directive/value columns, the manual
    parser ``_parse_robots_txt_manual`` is used instead.

    Two frame layouts are understood:
      * rows of ``directive`` / ``content`` where ``User-agent`` is itself a
        row (assumed to be the layout advertools returns -- verify against
        the installed version);
      * rows of ``rule`` / ``value`` with a per-row ``user_agent`` column
        (the layout produced by the manual fallback parser).

    Args:
        website_url: Any URL on the site; only its scheme and host are used.

    Returns:
        A dict with ``success``, ``url``, ``accessible``, directive statistics,
        ``disallow_rules``, ``sitemap_urls``, ``issues`` and a
        ``compliance_score`` (100 minus 30/15/5 per critical/warning/info
        issue, floored at 0). On an unexpected error:
        ``{"success": False, "error": ..., "url": ...}``.
    """
    robots_url = website_url  # reported in the error payload if URL parsing itself fails
    try:
        self.logger.info(f"Analyzing robots.txt for {website_url}")
        parsed = urlparse(website_url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        robots_url = f"{base_url}/robots.txt"
        result = {
            "success": True,
            "url": robots_url,
            "accessible": True,
            "total_directives": 0,
            "user_agents_found": [],
            "has_sitemap_directive": False,
            "sitemap_urls": [],
            "has_crawl_delay": False,
            "disallow_rules": [],
            "issues": [],
            "compliance_score": 100,
        }

        rule_candidates = ("rule", "directive")
        value_candidates = ("value", "directive_value", "content")

        def _usable(df) -> bool:
            # A frame is analysable only if it has both a directive and a value column.
            if df is None or df.empty:
                return False
            return (any(c in df.columns for c in rule_candidates)
                    and any(c in df.columns for c in value_candidates))

        loop = asyncio.get_event_loop()
        try:
            robots_df = await loop.run_in_executor(
                None, lambda: adv.robotstxt_to_df(robots_url)
            )
            if not _usable(robots_df):
                raise ValueError("Empty or unrecognised result from robotstxt_to_df")
        except Exception as adv_err:
            self.logger.warning(f"adv.robotstxt_to_df failed, using manual fallback: {adv_err}")
            robots_df = await loop.run_in_executor(
                None, lambda: self._parse_robots_txt_manual(robots_url)
            )

        if not _usable(robots_df):
            result["success"] = False
            result["error"] = "Could not fetch or parse robots.txt"
            result["accessible"] = False
            return result

        rule_col = next(c for c in rule_candidates if c in robots_df.columns)
        value_col = next(c for c in value_candidates if c in robots_df.columns)
        has_ua_col = 'user_agent' in robots_df.columns

        result["total_directives"] = len(robots_df)

        agents_seen: List[str] = []
        sitemap_urls: List[str] = []
        wildcard_blocked = False
        # Agents of the group currently being read. Consecutive User-agent rows
        # share the rules that follow them; rules before any User-agent row are
        # attributed to "*", matching the manual parser's default.
        group = ["*"]
        group_open = False

        for _, row in robots_df.iterrows():
            rule = str(row[rule_col]).strip().lower()
            raw_value = row[value_col]
            # Missing values must not turn into the literal string "nan".
            value = "" if pd.isna(raw_value) else str(raw_value).strip()

            if rule == "user-agent":
                if not group_open:
                    group, group_open = [], True
                agent = value or "*"
                group.append(agent)
                if agent not in agents_seen:
                    agents_seen.append(agent)
                continue
            group_open = False

            if has_ua_col and not pd.isna(row["user_agent"]):
                agents = [str(row["user_agent"])]
            else:
                agents = group

            if rule == "disallow":
                if not value:
                    continue  # "Disallow:" with no path blocks nothing
                for agent in agents:
                    if agent not in agents_seen:
                        agents_seen.append(agent)
                    result["disallow_rules"].append({"user_agent": agent, "path": value})
                # Only a wildcard-agent "Disallow: /" blocks every crawler.
                if value == "/" and "*" in agents:
                    wildcard_blocked = True
            elif rule == "sitemap":
                result["has_sitemap_directive"] = True
                if value and value not in sitemap_urls:
                    sitemap_urls.append(value)
            elif rule == "crawl-delay":
                result["has_crawl_delay"] = True

        result["user_agents_found"] = agents_seen
        result["sitemap_urls"] = sitemap_urls

        if wildcard_blocked:
            result["issues"].append({
                "severity": "critical", "code": "DISALLOW_ALL",
                "detail": "robots.txt disallows all user agents from all paths (Disallow: /)"
            })
        if not result["has_sitemap_directive"]:
            result["issues"].append({
                "severity": "warning", "code": "NO_SITEMAP",
                "detail": "No Sitemap directive found — search engines may miss pages"
            })
        if not result["has_crawl_delay"]:
            result["issues"].append({
                "severity": "info", "code": "NO_CRAWL_DELAY",
                "detail": "No Crawl-delay directive set — not critical for most sites"
            })

        penalties = {"critical": 30, "warning": 15, "info": 5}
        for issue in result["issues"]:
            result["compliance_score"] -= penalties.get(issue["severity"], 0)
        result["compliance_score"] = max(result["compliance_score"], 0)

        return result

    except Exception as e:
        self.logger.error(f"Robots.txt analysis failed: {e}")
        return {"success": False, "error": str(e), "url": robots_url}
|
||||
|
||||
def _parse_robots_txt_manual(self, url: str) -> pd.DataFrame:
|
||||
"""Fallback: manually fetch and parse robots.txt."""
|
||||
records = []
|
||||
try:
|
||||
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
content = resp.read().decode("utf-8", errors="replace")
|
||||
current_ua = "*"
|
||||
for line in content.splitlines():
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
if line.lower().startswith("user-agent"):
|
||||
parts = line.split(":", 1)
|
||||
current_ua = parts[1].strip() if len(parts) > 1 else "*"
|
||||
continue
|
||||
if ":" in line:
|
||||
directive, _, value = line.partition(":")
|
||||
records.append({
|
||||
"user_agent": current_ua,
|
||||
"rule": directive.strip(),
|
||||
"value": value.strip(),
|
||||
})
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Manual robots.txt fetch failed: {e}")
|
||||
if not records:
|
||||
return pd.DataFrame()
|
||||
return pd.DataFrame(records)
|
||||
|
||||
async def analyze_crawl_budget(self, sitemap_url: str, site_domain: str) -> Dict[str, Any]:
    """
    Analyze crawl budget by comparing sitemap inventory against actual crawl results.

    Estimates budget utilization, waste from redirects/errors, and optimization score.

    Args:
        sitemap_url: Sitemap whose URL count is the inventory baseline.
        site_domain: Domain (or full URL) of the site; a bounded crawl of up
            to 30 pages at depth <= 2 starts from its home page.

    Returns:
        On success a dict with ``sitemap_total_urls``, ``pages_crawled``,
        ``crawl_coverage_percentage``, ``status_distribution``,
        ``wasted_crawl_requests`` (responses outside 2xx),
        ``waste_percentage``, ``depth_distribution``,
        ``urls_with_parameters`` and ``optimization_score``; otherwise
        ``{"success": False, "error": ...}``.
    """
    temp_file = None
    try:
        self.logger.info(f"Analyzing crawl budget for {site_domain}")
        loop = asyncio.get_event_loop()

        sitemap_df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
        sitemap_total = len(sitemap_df) if sitemap_df is not None and not sitemap_df.empty else 0

        start_url = site_domain if site_domain.startswith("http") else f"https://{site_domain}"
        # allowed_domains takes bare host names; strip scheme/path if a URL was passed.
        crawl_domain = urlparse(start_url).netloc or site_domain

        # Reserve a unique output path; the crawler writes to it once the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name

        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=[start_url],
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[crawl_domain],
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 30,
                'DOWNLOAD_TIMEOUT': 15,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
                'DEPTH_LIMIT': 2,
            }
        ))

        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Crawl produced no output"}

        crawl_df = pd.read_json(temp_file, lines=True)
        crawled_count = len(crawl_df)

        status_dist = {}
        wasted = 0
        if 'status' in crawl_df.columns:
            # Coerce to integers first: a column containing missing values is
            # read as float, and keys like "200.0" cannot be parsed by int().
            codes = pd.to_numeric(crawl_df['status'], errors='coerce').dropna().astype(int)
            status_dist = {str(code): int(count) for code, count in codes.value_counts().items()}
            # Anything outside 2xx counts as a wasted request.
            wasted = int(((codes < 200) | (codes >= 300)).sum())

        budget_usage_ratio = round(crawled_count / max(sitemap_total, 1), 3)
        waste_ratio = round(wasted / max(crawled_count, 1), 3)

        depth_dist = {}
        if 'depth' in crawl_df.columns:
            raw = crawl_df['depth'].value_counts().sort_index().to_dict()
            depth_dist = {str(k): int(v) for k, v in raw.items()}

        param_count = 0
        url_col = 'url' if 'url' in crawl_df.columns else 'response_url' if 'response_url' in crawl_df.columns else None
        if url_col:
            # regex=False: a bare '?' is not a valid regular expression and
            # would raise re.error under the default regex matching.
            param_count = int(crawl_df[url_col].astype(str).str.contains('?', regex=False).sum())

        optimization_score = max(0, round(100 - (waste_ratio * 100) - (budget_usage_ratio * 20), 1))

        return {
            "success": True,
            "sitemap_total_urls": sitemap_total,
            "pages_crawled": crawled_count,
            "crawl_coverage_percentage": round(budget_usage_ratio * 100, 1),
            "status_distribution": status_dist,
            "wasted_crawl_requests": int(wasted),
            "waste_percentage": round(waste_ratio * 100, 1),
            "depth_distribution": depth_dist,
            "urls_with_parameters": int(param_count),
            "optimization_score": optimization_score,
        }

    except Exception as e:
        self.logger.error(f"Crawl budget analysis failed: {e}")
        return {"success": False, "error": str(e)}
    finally:
        # The temp file was created with delete=False, so remove it explicitly.
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
|
||||
|
||||
async def sitemap_compare(self, sitemap_a: str, sitemap_b: str) -> Dict[str, Any]:
    """
    Compare two sitemaps for competitive content gap analysis.

    Reports URL counts, the top directory "pillars" of each sitemap, which
    pillars are shared or unique, an overlap score (shared pillars as a
    percentage of all pillars seen) and ``lastmod``-based freshness stats.

    Args:
        sitemap_a: URL of the first sitemap.
        sitemap_b: URL of the second sitemap.

    Returns:
        The comparison dict with ``success`` True (pillar and freshness
        sections stay empty when either sitemap has no rows), or
        ``{"success": False, "error": ...}`` on failure.
    """
    try:
        self.logger.info(f"Comparing sitemaps: {sitemap_a} vs {sitemap_b}")
        event_loop = asyncio.get_event_loop()

        df_a = await event_loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_a))
        df_b = await event_loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_b))

        total_a = 0 if df_a is None or df_a.empty else len(df_a)
        total_b = 0 if df_b is None or df_b.empty else len(df_b)
        result = {
            "success": True,
            "sitemap_a": {"url": sitemap_a, "total_urls": total_a},
            "sitemap_b": {"url": sitemap_b, "total_urls": total_b},
            "url_count_diff": total_a - total_b,
            "ratio": round(total_a / max(total_b, 1), 2),
            "pillars_a": {},
            "pillars_b": {},
            "shared_pillars": [],
            "unique_to_a": [],
            "unique_to_b": [],
            "freshness_comparison": {},
            "overlap_score": 0,
        }

        # Nothing to compare unless both sitemaps have rows.
        if not (total_a and total_b):
            return result

        def extract_pillars(df: pd.DataFrame, label: str) -> Tuple[dict, list]:
            # Preferred path: join up to the first three dir_* columns into a pillar key.
            counts = {}
            if 'loc' in df.columns:
                try:
                    url_df = adv.url_to_df(df['loc'])
                    if url_df is not None and not url_df.empty:
                        dir_cols = [name for name in url_df.columns if name.startswith('dir_')]
                        if dir_cols:
                            joined = url_df[dir_cols[0]].fillna("home").astype(str)
                            for deeper in dir_cols[1:3]:
                                present = url_df[deeper].notna() & (url_df[deeper].astype(str) != 'nan')
                                joined = joined + "/" + url_df[deeper].where(present, "")
                            counts = joined.value_counts().head(20).to_dict()
                except Exception:
                    pass

            # Fallback: count the first path segment of every URL.
            if not counts:
                tally = {}
                for loc in df['loc'].dropna():
                    segments = urlparse(loc).path.strip('/').split('/')
                    first = segments[0] if segments and segments[0] else "home"
                    tally[first] = tally.get(first, 0) + 1
                ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
                counts = dict(ranked[:20])

            names = list(counts.keys()) if counts else []
            return counts, names

        def compute_freshness_stats(df: pd.DataFrame) -> dict:
            # Count parseable lastmod values and those newer than 30 days.
            stats = {"has_lastmod": False, "recent_30d": 0, "total_with_dates": 0}
            if 'lastmod' not in df.columns:
                return stats
            lm = pd.to_datetime(df['lastmod'], errors='coerce', utc=True).dropna()
            if lm.empty:
                return stats
            stats["has_lastmod"] = True
            stats["total_with_dates"] = int(len(lm))
            cutoff = datetime.now(lm.dt.tz) - timedelta(days=30)
            stats["recent_30d"] = int((lm > cutoff).sum())
            return stats

        pillars_a, keys_a = extract_pillars(df_a, "a")
        pillars_b, keys_b = extract_pillars(df_b, "b")
        result["pillars_a"] = pillars_a
        result["pillars_b"] = pillars_b

        names_a, names_b = set(keys_a), set(keys_b)
        common = names_a & names_b
        result["shared_pillars"] = sorted(common)
        result["unique_to_a"] = sorted(names_a - names_b)
        result["unique_to_b"] = sorted(names_b - names_a)

        union_size = max(len(names_a | names_b), 1)
        result["overlap_score"] = round((len(common) / union_size) * 100, 1)

        result["freshness_comparison"] = {
            "a": compute_freshness_stats(df_a),
            "b": compute_freshness_stats(df_b),
        }

        return result

    except Exception as e:
        self.logger.error(f"Sitemap comparison failed: {e}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
async def compare_crawl_results(self, result_a: Dict[str, Any], result_b: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compare two crawl analysis result dicts to surface changes over time.

    Every change is computed as ``result_b`` minus ``result_a``; zero changes
    are omitted. Only numeric fields of ``link_health`` and ``redirect_audit``
    are compared, and their keys are prefixed with ``link_`` / ``redirect_``.

    Args:
        result_a: The earlier analysis result.
        result_b: The later analysis result.

    Returns:
        A dict with ``page_count_change``, ``status_distribution_changes``,
        ``link_health_changes``, ``redirect_changes`` and the (always empty)
        ``new_issues`` / ``resolved_issues`` lists, or
        ``{"success": False, "error": ...}`` on failure.
    """
    try:
        def _numeric_deltas(before: dict, after: dict, prefix: str) -> dict:
            # Difference of every key that is numeric on both sides; a missing key counts as 0.
            deltas = {}
            for key in set(before.keys()) | set(after.keys()):
                old_value = before.get(key, 0)
                new_value = after.get(key, 0)
                if not (isinstance(old_value, (int, float)) and isinstance(new_value, (int, float))):
                    continue
                delta = round(new_value - old_value, 2)
                if delta != 0:
                    deltas[f"{prefix}_{key}"] = delta
            return deltas

        page_count_change = result_b.get("page_count", 0) - result_a.get("page_count", 0)

        status_before = result_a.get("page_status", {})
        status_after = result_b.get("page_status", {})
        status_changes = {}
        for code in sorted(set(status_before.keys()) | set(status_after.keys())):
            delta = status_after.get(code, 0) - status_before.get(code, 0)
            if delta != 0:
                status_changes[code] = delta

        link_changes = _numeric_deltas(
            result_a.get("link_health", {}), result_b.get("link_health", {}), "link"
        )
        redirect_changes = _numeric_deltas(
            result_a.get("redirect_audit", {}), result_b.get("redirect_audit", {}), "redirect"
        )

        return {
            "success": True,
            "page_count_change": page_count_change,
            "status_distribution_changes": status_changes,
            "link_health_changes": link_changes,
            "redirect_changes": redirect_changes,
            "new_issues": [],
            "resolved_issues": [],
        }

    except Exception as e:
        self.logger.error(f"Crawl comparison failed: {e}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyzes linking patterns and social media presence using unique temporary files.
|
||||
|
||||
@@ -454,14 +454,12 @@ class SEODashboardService:
|
||||
def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
|
||||
"""Fetch Advertools-based insights from WebsiteAnalysis and AdvertoolsTasks."""
|
||||
try:
|
||||
# 1. Get augmented persona themes from WebsiteAnalysis
|
||||
session = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
|
||||
if not session:
|
||||
return {}
|
||||
|
||||
analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
|
||||
|
||||
# 2. Get latest tasks status
|
||||
tasks = self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all()
|
||||
|
||||
audit_status = "pending"
|
||||
@@ -479,6 +477,14 @@ class SEODashboardService:
|
||||
|
||||
return {
|
||||
"augmented_themes": brand_analysis.get('augmented_themes', []),
|
||||
"link_health": brand_analysis.get('link_health', {}),
|
||||
"redirect_audit": brand_analysis.get('redirect_audit', {}),
|
||||
"image_seo": brand_analysis.get('image_seo', {}),
|
||||
"page_status": brand_analysis.get('page_status', {}),
|
||||
"url_structure": brand_analysis.get('url_structure', {}),
|
||||
"freshness": brand_analysis.get('freshness', {}),
|
||||
"robots_txt": brand_analysis.get('robots_txt', {}),
|
||||
"crawl_budget": brand_analysis.get('crawl_budget', {}),
|
||||
"last_audit": brand_analysis.get('last_advertools_audit'),
|
||||
"site_health": seo_audit.get('site_health', {}),
|
||||
"last_health_check": seo_audit.get('last_advertools_health_check'),
|
||||
|
||||
@@ -378,7 +378,48 @@ class SIFIntegrationService:
|
||||
themes = adv_insights.get('augmented_themes', [])
|
||||
if themes:
|
||||
text_content += f"Augmented Themes: {', '.join(themes[:5])}. "
|
||||
|
||||
|
||||
freshness = adv_insights.get('freshness', {})
|
||||
if freshness:
|
||||
text_content += (f"Content Freshness Score: {freshness.get('freshness_score', 'N/A')}. "
|
||||
f"Publishing Velocity: {freshness.get('publishing_velocity', 0)}/week. "
|
||||
f"Trend: {freshness.get('publishing_trend', 'unknown')}. "
|
||||
f"Last 30d: {freshness.get('publishing_recency', {}).get('last_30d', 0)} pages. ")
|
||||
|
||||
link_health = adv_insights.get('link_health', {})
|
||||
if link_health and 'error' not in link_health:
|
||||
text_content += (f"Internal Links: {link_health.get('internal_link_count', 0)}. "
|
||||
f"External Links: {link_health.get('external_link_count', 0)}. "
|
||||
f"Nofollow: {link_health.get('nofollow_link_count', 0)}. "
|
||||
f"Avg Links/Page: {link_health.get('avg_links_per_page', 0)}. ")
|
||||
|
||||
redirects = adv_insights.get('redirect_audit', {})
|
||||
if redirects and 'error' not in redirects:
|
||||
text_content += (f"Redirects: {redirects.get('total_redirects', 0)} total, "
|
||||
f"{redirects.get('multi_hop_chains', 0)} multi-hop. ")
|
||||
|
||||
image_seo = adv_insights.get('image_seo', {})
|
||||
if image_seo and 'error' not in image_seo:
|
||||
text_content += (f"Images: {image_seo.get('total_images', 0)} total, "
|
||||
f"Alt Coverage: {image_seo.get('alt_coverage_percentage', 0)}%. ")
|
||||
|
||||
url_struct = adv_insights.get('url_structure', {})
|
||||
if url_struct:
|
||||
text_content += (f"URL Structure: {url_struct.get('total_urls_analyzed', 0)} URLs, "
|
||||
f"Avg Depth: {url_struct.get('directory_depth', {}).get('average_depth', 0)}. "
|
||||
f"Params: {url_struct.get('parameter_usage', {}).get('percentage_with_params', 0)}%. ")
|
||||
|
||||
robots = adv_insights.get('robots_txt', {})
|
||||
if robots and robots.get('success'):
|
||||
text_content += (f"Robots.txt: {robots.get('total_directives', 0)} directives, "
|
||||
f"Compliance: {robots.get('compliance_score', 0)}/100. "
|
||||
f"Issues: {len(robots.get('issues', []))}. ")
|
||||
|
||||
budget = adv_insights.get('crawl_budget', {})
|
||||
if budget and budget.get('success'):
|
||||
text_content += (f"Crawl Budget: {budget.get('pages_crawled', 0)} crawled of {budget.get('sitemap_total_urls', 0)} URLs. "
|
||||
f"Waste: {budget.get('waste_percentage', 0)}%. "
|
||||
f"Score: {budget.get('optimization_score', 0)}. ")
|
||||
# Add Technical SEO overview
|
||||
tech_audit = dashboard_data.get('technical_seo_audit', {})
|
||||
if tech_audit:
|
||||
|
||||
@@ -143,16 +143,18 @@ class WixService:
|
||||
access_token: Valid access token
|
||||
|
||||
Returns:
|
||||
Site information
|
||||
Site information (or {_no_site: True} if no site exists)
|
||||
"""
|
||||
token_str = normalize_token_string(access_token)
|
||||
if not token_str:
|
||||
raise ValueError("Invalid access token format for create_blog_post")
|
||||
return {"_no_site": True, "error": "Invalid access token format"}
|
||||
meta = extract_meta_from_token(token_str)
|
||||
meta_site_id = meta.get("metaSiteId")
|
||||
try:
|
||||
return self.auth_service.get_site_info(token_str)
|
||||
return self.auth_service.get_site_info(token_str, meta_site_id=meta_site_id)
|
||||
except requests.RequestException as e:
|
||||
logger.error(f"Failed to get site info: {e}")
|
||||
raise
|
||||
logger.warning(f"Failed to get site info: {e}")
|
||||
return {"_no_site": True, "error": str(e)}
|
||||
|
||||
def get_current_member(self, access_token: str) -> Dict[str, Any]:
|
||||
"""
|
||||
|
||||
387
backend/services/youtube/youtube_task_manager.py
Normal file
387
backend/services/youtube/youtube_task_manager.py
Normal file
@@ -0,0 +1,387 @@
|
||||
"""
|
||||
YouTube Creator Task Manager
|
||||
|
||||
Hybrid DB-backed + in-memory task manager for YouTube video operations.
|
||||
Writes task state to PostgreSQL so renders/combines/publishes survive
|
||||
server restarts. Falls back to in-memory dict when DB is unavailable.
|
||||
|
||||
API surface matches Story Writer's TaskManager for drop-in compatibility.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models.youtube_task_models import YouTubeVideoTask, YouTubeTaskType, YouTubeTaskStatus
|
||||
from services.database import get_session_for_user, get_engine_for_user
|
||||
from models.subscription_models import Base as SubscriptionBase
|
||||
|
||||
|
||||
class YouTubeTaskManager:
|
||||
"""Hybrid persistent + in-memory task manager for YouTube Creator."""
|
||||
|
||||
def __init__(self) -> None:
    """Create the in-memory task store, then run the table bootstrap."""
    # task_id -> task state dict; consulted first on lookups before the DB.
    self.task_storage: Dict[str, Dict[str, Any]] = {}
    self._ensure_tables()
|
||||
|
||||
def _ensure_tables(self):
    """Ensure youtube_video_tasks table exists for all initialised users.

    Runs ``SubscriptionBase.metadata.create_all(checkfirst=True)`` against
    every engine currently held in ``services.database._user_engines``.
    This is best effort: failures are logged and never raised, so
    constructing the manager still succeeds when the DB is unavailable.
    """
    try:
        # Imported lazily so a missing registry only disables the bootstrap.
        from services.database import _user_engines
        engines = list(_user_engines.items())
    except Exception as e:
        logger.debug(f"[YouTubeTaskManager] Table bootstrap skipped, engine registry unavailable: {e}")
        return

    for user_id, engine in engines:
        try:
            SubscriptionBase.metadata.create_all(bind=engine, checkfirst=True)
        except Exception as e:
            # One user's broken engine must not stop the others.
            logger.warning(f"[YouTubeTaskManager] Could not ensure tables for user {user_id}: {e}")
|
||||
|
||||
def _get_db(self, user_id: str) -> Optional[Session]:
    """Get a DB session for the given user. Returns None on failure.

    Before the session is handed out, the tables registered on
    ``SubscriptionBase`` are created on the user's engine if missing.

    Args:
        user_id: Owner of the database; a falsy value yields ``None``.

    Returns:
        An open session that the caller must close, or ``None`` when no
        session could be obtained or table setup failed.
    """
    if not user_id:
        return None
    session = None
    try:
        session = get_session_for_user(user_id)
        if session:
            engine = get_engine_for_user(user_id)
            SubscriptionBase.metadata.create_all(bind=engine, checkfirst=True)
            return session
    except Exception as e:
        logger.warning(f"[YouTubeTaskManager] DB unavailable for user {user_id}: {e}")
        # The session may already be open when table setup fails; release it
        # here because the caller only ever sees None.
        if session is not None:
            try:
                session.close()
            except Exception as close_err:
                logger.debug(f"[YouTubeTaskManager] Failed to close session for user {user_id}: {close_err}")
    return None
|
||||
|
||||
def _map_task_type(self, task_type_str: str) -> YouTubeTaskType:
    """Translate a task-type string into a YouTubeTaskType member.

    Both combine spellings map to ``COMBINE``; an unrecognised string falls
    back to ``RENDER``.
    """
    known_types = {
        "youtube_video_render": YouTubeTaskType.RENDER,
        "youtube_scene_video_render": YouTubeTaskType.SCENE_RENDER,
        "youtube_video_combine": YouTubeTaskType.COMBINE,
        "youtube_combine_video": YouTubeTaskType.COMBINE,
        "youtube_publish": YouTubeTaskType.PUBLISH,
        "youtube_image_generation": YouTubeTaskType.IMAGE_GENERATION,
        "youtube_audio_generation": YouTubeTaskType.AUDIO_GENERATION,
    }
    if task_type_str in known_types:
        return known_types[task_type_str]
    return YouTubeTaskType.RENDER
|
||||
|
||||
def _map_status_to_enum(self, status: str) -> YouTubeTaskStatus:
    """Translate a frontend status string into the DB enum.

    ``"running"`` is treated as ``PROCESSING``; an unrecognised string falls
    back to ``PENDING``.
    """
    by_name = {
        "pending": YouTubeTaskStatus.PENDING,
        "processing": YouTubeTaskStatus.PROCESSING,
        "running": YouTubeTaskStatus.PROCESSING,
        "completed": YouTubeTaskStatus.COMPLETED,
        "failed": YouTubeTaskStatus.FAILED,
    }
    try:
        return by_name[status]
    except KeyError:
        return YouTubeTaskStatus.PENDING
|
||||
|
||||
def _map_status_from_enum(self, status: YouTubeTaskStatus) -> str:
    """Translate a DB status enum into the frontend status string.

    A value that is not one of the four known members yields ``"pending"``.
    """
    labels = {
        YouTubeTaskStatus.PENDING: "pending",
        YouTubeTaskStatus.PROCESSING: "processing",
        YouTubeTaskStatus.COMPLETED: "completed",
        YouTubeTaskStatus.FAILED: "failed",
    }
    if status in labels:
        return labels[status]
    return "pending"
|
||||
|
||||
def create_task(
    self,
    task_type: str = "youtube_video_render",
    metadata: Optional[Dict[str, Any]] = None,
    user_id: Optional[str] = None,
) -> str:
    """Create a new task. Persists to DB if user_id provided; always writes to in-memory.

    Args:
        task_type: Task-type string; mapped to the DB enum via ``_map_task_type``.
        metadata: Optional request metadata stored with the task. It is
            copied, so the caller's dict is never modified.
        user_id: Owner of the task. When omitted, ``metadata["owner_user_id"]``
            is used if present.

    Returns:
        The generated task id (a UUID4 string). A DB failure is logged and
        does not prevent the in-memory task from being created.
    """
    task_id = str(uuid.uuid4())
    # Shallow copy: the owner may be recorded below without touching the caller's dict.
    task_metadata = dict(metadata) if metadata else {}
    now = datetime.now(timezone.utc)

    effective_user_id = user_id or task_metadata.get("owner_user_id")
    # The in-memory status lookup authorises requests against
    # metadata["owner_user_id"], so record the owner even when it arrived
    # only through the user_id argument.
    if effective_user_id and not task_metadata.get("owner_user_id"):
        task_metadata["owner_user_id"] = effective_user_id

    # Always write to in-memory for fast lookups
    self.task_storage[task_id] = {
        "status": "pending",
        "created_at": now,
        "updated_at": now,
        "result": None,
        "error": None,
        "progress_messages": [],
        "task_type": task_type,
        "progress": 0.0,
        "metadata": task_metadata,
    }

    # Persist to DB
    if effective_user_id:
        db = self._get_db(effective_user_id)
        if db:
            try:
                db_task = YouTubeVideoTask(
                    task_id=task_id,
                    user_id=effective_user_id,
                    task_type=self._map_task_type(task_type),
                    status=YouTubeTaskStatus.PENDING,
                    progress=0.0,
                    request_data=task_metadata if task_metadata else None,
                    created_at=now,
                    updated_at=now,
                )
                db.add(db_task)
                db.commit()
                logger.debug(f"[YouTubeTaskManager] Persisted task {task_id} to DB for user {effective_user_id}")
            except Exception as e:
                logger.warning(f"[YouTubeTaskManager] Failed to persist task {task_id} to DB: {e}")
                db.rollback()
            finally:
                db.close()

    logger.info(f"[YouTubeTaskManager] Created task: {task_id} (type: {task_type})")
    return task_id
|
||||
|
||||
def get_task_status(self, task_id: str, requester_user_id: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Return a status dict for ``task_id``, or ``None`` if unavailable.

    Lookup order:
      1. ``self.task_storage`` (in-memory). Access is denied (``None``) when
         both ``requester_user_id`` and the task's
         ``metadata["owner_user_id"]`` are set and differ.
      2. The DB session from ``self._get_db(requester_user_id)``; only
         attempted when ``requester_user_id`` is truthy. The row is looked up
         by ``task_id`` alone.

    Args:
        task_id: ID of the task to look up.
        requester_user_id: ID of the user asking; used for the ownership
            check and to select the DB session.

    Returns:
        A dict with ``task_id``, ``status``, ``progress``, ``message``,
        ``created_at`` and ``updated_at`` (ISO strings or ``None``), plus
        ``result`` / ``error`` / ``error_status`` / ``error_data`` when
        present. ``None`` when the task is not found, access is denied, or
        the DB lookup fails.
    """
    # Check in-memory first (fast path)
    if task_id in self.task_storage:
        task = self.task_storage[task_id]
        metadata = task.get("metadata", {}) or {}
        owner_user_id = metadata.get("owner_user_id")

        if requester_user_id is not None and owner_user_id is not None and requester_user_id != owner_user_id:
            logger.warning(f"[YouTubeTaskManager] Task access denied for task {task_id}")
            return None

        messages = task.get("progress_messages") or []
        created_at = task.get("created_at")
        # Fall back to created_at when updated_at is missing *or* None; the
        # previous expression called .isoformat() on None in the latter case.
        updated_at = task.get("updated_at") or created_at

        response = {
            "task_id": task_id,
            "status": task["status"],
            "progress": task.get("progress", 0.0),
            "message": messages[-1] if messages else None,
            "created_at": created_at.isoformat() if created_at else None,
            "updated_at": updated_at.isoformat() if updated_at else None,
        }
        # Result is only exposed for completed tasks, error only for failed ones.
        if task["status"] == "completed" and task.get("result"):
            response["result"] = task["result"]
        if task["status"] == "failed" and task.get("error"):
            response["error"] = task["error"]
        if task.get("error_status") is not None:
            response["error_status"] = task["error_status"]
        if task.get("error_data") is not None:
            response["error_data"] = task["error_data"]
        return response

    # Fall back to DB
    if requester_user_id:
        db = self._get_db(requester_user_id)
        if db:
            try:
                db_task = db.query(YouTubeVideoTask).filter(YouTubeVideoTask.task_id == task_id).first()
                if db_task:
                    response = {
                        "task_id": db_task.task_id,
                        "status": self._map_status_from_enum(db_task.status),
                        "progress": db_task.progress or 0.0,
                        "message": db_task.message,
                        "created_at": db_task.created_at.isoformat() if db_task.created_at else None,
                        "updated_at": db_task.updated_at.isoformat() if db_task.updated_at else None,
                    }
                    if db_task.result:
                        response["result"] = db_task.result
                    if db_task.error:
                        response["error"] = db_task.error
                    # In the DB, error_status / error_data are read from inside the result dict.
                    if isinstance(db_task.result, dict):
                        if db_task.result.get("error_status") is not None:
                            response["error_status"] = db_task.result["error_status"]
                        if db_task.result.get("error_data") is not None:
                            response["error_data"] = db_task.result["error_data"]
                    return response
            except Exception as e:
                logger.warning(f"[YouTubeTaskManager] DB lookup failed for task {task_id}: {e}")
            finally:
                db.close()

    return None
|
||||
|
||||
def update_task_status(
    self,
    task_id: str,
    status: str,
    progress: Optional[float] = None,
    message: Optional[str] = None,
    result: Optional[Dict[str, Any]] = None,
    error: Optional[str] = None,
    error_status: Optional[int] = None,
    error_data: Optional[Dict[str, Any]] = None,
):
    """Update a task in memory and mirror the change to the DB.

    Only tasks present in ``self.task_storage`` can be updated; otherwise a
    warning is logged and nothing else happens. The DB write is delegated to
    ``self._update_db_task`` using ``metadata["owner_user_id"]`` as the user.

    Args:
        task_id: ID of the task to update.
        status: New status string (always applied).
        progress: New progress value; ignored when ``None``.
        message: Progress message appended to ``progress_messages`` if truthy.
        result: Result payload; ignored when ``None``.
        error: Error text; ignored when ``None``.
        error_status: Optional numeric error code; ignored when ``None``.
        error_data: Optional structured error details; ignored when ``None``.
    """
    now = datetime.now(timezone.utc)

    if task_id not in self.task_storage:
        logger.warning(f"[YouTubeTaskManager] Cannot update non-existent task: {task_id}")
        return

    # Update in-memory
    task = self.task_storage[task_id]
    task["status"] = status
    task["updated_at"] = now
    if progress is not None:
        task["progress"] = progress
    if message:
        task.setdefault("progress_messages", []).append(message)
        logger.info(f"[YouTubeTaskManager] Task {task_id}: {message} (progress: {progress}%)")
    if result is not None:
        task["result"] = result
    if error is not None:
        task["error"] = error
        logger.error(f"[YouTubeTaskManager] Task {task_id} error: {error}")
    if error_status is not None:
        task["error_status"] = error_status
    if error_data is not None:
        task["error_data"] = error_data

    # The DB read path in get_task_status() looks for error_status/error_data
    # inside the stored result dict, so fold them into the payload that is
    # persisted. Previously they were kept in memory only and lost from the DB.
    db_result = result
    if error_status is not None or error_data is not None:
        db_result = dict(result) if result else {}
        if error_status is not None:
            db_result["error_status"] = error_status
        if error_data is not None:
            db_result["error_data"] = error_data

    # Try DB update
    metadata = task.get("metadata", {}) or {}
    user_id = metadata.get("owner_user_id")
    self._update_db_task(task_id, user_id, status, progress, message, db_result, error, now)
|
||||
|
||||
def _update_db_task(
    self,
    task_id: str,
    user_id: Optional[str],
    status: str,
    progress: Optional[float],
    message: Optional[str],
    result: Optional[Dict[str, Any]],
    error: Optional[str],
    now: datetime,
):
    """Apply a status update to the task's DB row (best effort).

    Does nothing when ``user_id`` is falsy or ``self._get_db`` yields no
    session. Failures are logged and rolled back, never raised.

    Args:
        task_id: ID of the row to update.
        user_id: Owner used to obtain the DB session.
        status: New status string, converted via ``self._map_status_to_enum``.
        progress: New progress; skipped when ``None``.
        message: Latest message; stored truncated to 500 characters if truthy.
        result: Result payload merged over any existing dict result if truthy.
        error: Error text; stored if truthy.
        now: Timestamp written to ``updated_at`` (and ``completed_at`` for
            terminal statuses).
    """
    if not user_id:
        return

    db = self._get_db(user_id)
    if not db:
        return

    try:
        db_task = db.query(YouTubeVideoTask).filter(YouTubeVideoTask.task_id == task_id).first()
        if db_task:
            db_task.status = self._map_status_to_enum(status)
            db_task.updated_at = now
            if progress is not None:
                db_task.progress = progress
            if message:
                db_task.message = message[:500]
            if result:
                previous = db_task.result if isinstance(db_task.result, dict) else {}
                # Assign a NEW dict rather than mutating the loaded one in
                # place: re-assigning the same (mutated) object is not seen
                # as a change by a plain JSON column, so the merge could be
                # silently dropped on commit.
                db_task.result = {**previous, **result}
            if error:
                db_task.error = error
            if status in ("completed", "failed"):
                db_task.completed_at = now
            db.commit()
            logger.debug(f"[YouTubeTaskManager] Persisted status update for task {task_id}")
        else:
            logger.debug(f"[YouTubeTaskManager] Task {task_id} not found in DB for update")
    except Exception as e:
        logger.warning(f"[YouTubeTaskManager] Failed to update DB task {task_id}: {e}")
        db.rollback()
    finally:
        db.close()
|
||||
|
||||
def recover_stale_tasks(self, user_id: str):
    """Fail the given user's DB tasks that are still pending or processing.

    Intended for startup: rows left in an in-flight state by a server that
    went down are marked FAILED with an explanatory error and message.

    Args:
        user_id: Owner whose tasks are examined; also selects the DB session.

    Returns:
        Number of tasks marked as failed (0 when no session is available).
    """
    session = self._get_db(user_id)
    if not session:
        return 0

    in_flight_states = [
        YouTubeTaskStatus.PENDING,
        YouTubeTaskStatus.PROCESSING,
    ]
    count = 0
    try:
        stale_query = session.query(YouTubeVideoTask).filter(
            YouTubeVideoTask.user_id == user_id,
            YouTubeVideoTask.status.in_(in_flight_states),
        )
        for task in stale_query.all():
            task.status = YouTubeTaskStatus.FAILED
            task.error = "Task interrupted by server restart"
            task.message = "Marked as failed on server restart"
            task.completed_at = datetime.now(timezone.utc)
            task.updated_at = datetime.now(timezone.utc)
            count += 1
            logger.info(f"[YouTubeTaskManager] Recovered stale task {task.task_id} for user {user_id}")

        # Nothing to flush when no row was touched.
        if count > 0:
            session.commit()
            logger.info(f"[YouTubeTaskManager] Recovered {count} stale tasks for user {user_id}")
    except Exception as e:
        logger.warning(f"[YouTubeTaskManager] Failed to recover stale tasks: {e}")
        session.rollback()
    finally:
        session.close()

    return count
|
||||
|
||||
def cleanup_old_tasks(self):
    """Drop in-memory tasks created more than one hour ago.

    Only ``self.task_storage`` is touched; DB rows are left alone. Entries
    with a falsy ``created_at`` are kept. A truthy ``created_at`` that has no
    ``timestamp`` method is treated as epoch 0 and therefore removed.
    """
    threshold = datetime.now(timezone.utc).timestamp() - 3600  # 1 hour

    def _epoch(stamp):
        return stamp.timestamp() if hasattr(stamp, 'timestamp') else 0

    # Collect first so the dict is not mutated while being iterated.
    expired_ids = [
        tid
        for tid, entry in self.task_storage.items()
        if entry.get("created_at") and _epoch(entry.get("created_at")) < threshold
    ]

    for task_id in expired_ids:
        del self.task_storage[task_id]
        logger.debug(f"[YouTubeTaskManager] Cleaned up old in-memory task: {task_id}")
|
||||
|
||||
def cleanup_old_db_tasks(self, days: int = 7, user_id: Optional[str] = None):
    """Delete a user's completed/failed DB tasks created over ``days`` ago.

    Args:
        days: Age threshold in days, compared against ``created_at``.
        user_id: Owner whose rows are purged; also selects the DB session.
            Nothing happens when it is falsy.

    Returns:
        Number of rows deleted (0 when there is no user or no session, and
        possibly a partial count if an error interrupts the loop).
    """
    session = self._get_db(user_id) if user_id else None
    if not session:
        return 0

    terminal_states = [YouTubeTaskStatus.COMPLETED, YouTubeTaskStatus.FAILED]
    count = 0
    try:
        from datetime import timedelta

        oldest_kept = datetime.now(timezone.utc) - timedelta(days=days)
        expired_rows = session.query(YouTubeVideoTask).filter(
            YouTubeVideoTask.user_id == user_id,
            YouTubeVideoTask.status.in_(terminal_states),
            YouTubeVideoTask.created_at < oldest_kept,
        ).all()

        for row in expired_rows:
            session.delete(row)
            count += 1

        # Commit only when at least one row was scheduled for deletion.
        if count > 0:
            session.commit()
            logger.info(f"[YouTubeTaskManager] Cleaned up {count} old DB tasks for user {user_id}")
    except Exception as e:
        logger.warning(f"[YouTubeTaskManager] Failed to cleanup old DB tasks: {e}")
        session.rollback()
    finally:
        session.close()

    return count
|
||||
|
||||
|
||||
# Global singleton instance: the one YouTubeTaskManager built by this assignment.
# NOTE(review): presumably created at import time and reused by callers instead of
# constructing their own manager — confirm against the importing modules.
|
||||
task_manager = YouTubeTaskManager()
|
||||
Reference in New Issue
Block a user