Implement evidence-driven semantic gap detection

This commit is contained in:
ي
2026-03-01 21:58:44 +05:30
parent f8f7ddeb2a
commit 77088bfc53
4 changed files with 412 additions and 77 deletions

View File

@@ -7,6 +7,8 @@ Each agent leverages TxtaiIntelligenceService for semantic operations.
import traceback import traceback
import json import json
import asyncio import asyncio
import re
from collections import Counter
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from datetime import datetime from datetime import datetime
from loguru import logger from loguru import logger
@@ -177,23 +179,193 @@ class StrategyArchitectAgent(SIFBaseAgent):
self._log_agent_operation("Finding semantic content gaps", competitor_count=len(competitor_indices)) self._log_agent_operation("Finding semantic content gaps", competitor_count=len(competitor_indices))
try: try:
# STUB: Implement cross-index comparison documents = await self._fetch_index_documents()
# This would involve: if not documents:
# 1. Getting user content topics/themes logger.info(f"[{self.__class__.__name__}] No indexed documents available for gap detection")
# 2. Getting competitor content topics/themes return []
# 3. Finding topics competitors cover but user doesn't
competitor_docs, user_docs = [], []
logger.info(f"[{self.__class__.__name__}] Found semantic gaps analysis stub") allowed_competitor_ids = set(str(idx) for idx in competitor_indices) if competitor_indices else None
return [ for doc in documents:
{"topic": "Topic A", "priority": "high", "reason": "Competitor coverage gap"}, metadata = doc.get("metadata", {})
{"topic": "Topic B", "priority": "medium", "reason": "Emerging trend"} doc_type = str(metadata.get("type", "")).lower()
] if "competitor" in doc_type:
if allowed_competitor_ids and str(doc.get("id")) not in allowed_competitor_ids:
continue
competitor_docs.append(doc)
elif "user" in doc_type:
user_docs.append(doc)
if not competitor_docs or not user_docs:
logger.info(
f"[{self.__class__.__name__}] Insufficient split for gap analysis: "
f"user_docs={len(user_docs)}, competitor_docs={len(competitor_docs)}"
)
return []
competitor_topics = self._extract_topic_density(competitor_docs)
user_topics = self._extract_topic_density(user_docs)
gaps = []
for topic, competitor_density in competitor_topics.items():
user_density = user_topics.get(topic, 0.0)
density_gap = competitor_density - user_density
if density_gap <= 0.08:
continue
confidence = max(
0.0,
min(1.0, 0.35 + (density_gap * 1.5) + (competitor_density * 0.4))
)
priority = "high" if confidence >= 0.75 else "medium" if confidence >= 0.5 else "low"
gaps.append({
"topic": topic,
"priority": priority,
"reason": (
f"Competitors mention '{topic}' substantially more often "
f"(density {competitor_density:.2f} vs {user_density:.2f})."
),
"confidence": round(confidence, 3),
"topic_density": {
"competitor": round(competitor_density, 4),
"user": round(user_density, 4),
"gap": round(density_gap, 4)
},
"evidence": {
"competitor_sample_titles": self._sample_titles_for_topic(competitor_docs, topic),
"user_sample_titles": self._sample_titles_for_topic(user_docs, topic),
"competitor_doc_count": len(competitor_docs),
"user_doc_count": len(user_docs)
}
})
gaps.sort(
key=lambda item: (
item.get("confidence", 0),
item.get("topic_density", {}).get("gap", 0)
),
reverse=True
)
return gaps[:12]
except Exception as e: except Exception as e:
logger.error(f"[{self.__class__.__name__}] Failed to find semantic gaps: {e}") logger.error(f"[{self.__class__.__name__}] Failed to find semantic gaps: {e}")
logger.error(f"[{self.__class__.__name__}] Full traceback: {traceback.format_exc()}") logger.error(f"[{self.__class__.__name__}] Full traceback: {traceback.format_exc()}")
return [] return []
async def _fetch_index_documents(self) -> List[Dict[str, Any]]:
"""Fetch indexed documents and normalize metadata from txtai result objects."""
if not self.intelligence.is_initialized() or not self.intelligence.embeddings:
return []
embeddings = self.intelligence.embeddings
limit = 0
if hasattr(embeddings, "count"):
try:
limit = int(embeddings.count())
except Exception:
limit = 0
documents = []
candidate_queries = []
if limit > 0:
candidate_queries.extend([
f"select id, text, object from txtai limit {limit}",
f"select id, text, tags from txtai limit {limit}"
])
candidate_queries.extend(["marketing", "content", "seo", "strategy", "social media"])
seen_ids = set()
for query in candidate_queries:
try:
query_limit = limit if query.startswith("select") and limit > 0 else max(10, limit or 50)
rows = embeddings.search(query, limit=query_limit)
except Exception:
continue
for row in rows or []:
doc_id = str(row.get("id", ""))
dedupe_key = doc_id or str(hash(f"{row.get('text','')}::{row.get('score',0)}"))
if dedupe_key in seen_ids:
continue
seen_ids.add(dedupe_key)
documents.append({
"id": doc_id,
"text": row.get("text", "") or "",
"metadata": self._normalize_metadata(row)
})
if limit > 0 and len(documents) >= limit:
break
return documents
def _normalize_metadata(self, row: Dict[str, Any]) -> Dict[str, Any]:
"""Normalize metadata payloads from txtai search rows."""
for key in ("object", "tags", "metadata", "meta"):
payload = row.get(key)
if isinstance(payload, dict):
return payload
if isinstance(payload, str):
try:
parsed = json.loads(payload)
if isinstance(parsed, dict):
return parsed
except Exception:
continue
return {}
def _extract_topic_density(self, documents: List[Dict[str, Any]]) -> Dict[str, float]:
"""Extract topic density from document metadata and titles."""
topic_counter: Counter = Counter()
for doc in documents:
metadata = doc.get("metadata", {})
candidates = []
for key in ("topics", "topic", "keywords", "keyword", "tags", "category"):
value = metadata.get(key)
if isinstance(value, list):
candidates.extend([str(v) for v in value if v])
elif isinstance(value, str) and value.strip():
candidates.extend(re.split(r"[,|/]", value))
title = metadata.get("title") or doc.get("text", "")[:120]
if title:
title_tokens = re.findall(r"[a-zA-Z][a-zA-Z\-]{3,}", str(title).lower())
candidates.extend(title_tokens)
normalized = {
item.strip().lower() for item in candidates
if item and len(item.strip()) >= 4 and not item.strip().isdigit()
}
for topic in normalized:
topic_counter[topic] += 1
total_docs = max(1, len(documents))
return {
topic: count / total_docs
for topic, count in topic_counter.items()
if count >= 2
}
def _sample_titles_for_topic(self, documents: List[Dict[str, Any]], topic: str, limit: int = 3) -> List[str]:
"""Return sample titles for a topic."""
samples = []
topic_lower = topic.lower()
for doc in documents:
metadata = doc.get("metadata", {})
title = metadata.get("title") or doc.get("text", "")[:100]
if not title:
continue
haystack = f"{title} {json.dumps(metadata, default=str)}".lower()
if topic_lower in haystack:
samples.append(str(title))
if len(samples) >= limit:
break
return samples
class ContentGuardianAgent(SIFBaseAgent): class ContentGuardianAgent(SIFBaseAgent):
"""Agent for preventing cannibalization and ensuring content originality.""" """Agent for preventing cannibalization and ensuring content originality."""
@@ -2436,4 +2608,3 @@ class SocialAmplificationAgent(BaseALwrityAgent):
"status": "scheduled", "status": "scheduled",
"timestamp": datetime.utcnow().isoformat() "timestamp": datetime.utcnow().isoformat()
} }

View File

@@ -7,6 +7,8 @@ Each agent leverages TxtaiIntelligenceService for semantic operations.
import traceback import traceback
import json import json
import asyncio import asyncio
import re
from collections import Counter
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from datetime import datetime from datetime import datetime
from loguru import logger from loguru import logger
@@ -212,23 +214,193 @@ class StrategyArchitectAgent(SIFBaseAgent):
self._log_agent_operation("Finding semantic content gaps", competitor_count=len(competitor_indices)) self._log_agent_operation("Finding semantic content gaps", competitor_count=len(competitor_indices))
try: try:
# STUB: Implement cross-index comparison documents = await self._fetch_index_documents()
# This would involve: if not documents:
# 1. Getting user content topics/themes logger.info(f"[{self.__class__.__name__}] No indexed documents available for gap detection")
# 2. Getting competitor content topics/themes return []
# 3. Finding topics competitors cover but user doesn't
competitor_docs, user_docs = [], []
logger.info(f"[{self.__class__.__name__}] Found semantic gaps analysis stub") allowed_competitor_ids = set(str(idx) for idx in competitor_indices) if competitor_indices else None
return [ for doc in documents:
{"topic": "Topic A", "priority": "high", "reason": "Competitor coverage gap"}, metadata = doc.get("metadata", {})
{"topic": "Topic B", "priority": "medium", "reason": "Emerging trend"} doc_type = str(metadata.get("type", "")).lower()
] if "competitor" in doc_type:
if allowed_competitor_ids and str(doc.get("id")) not in allowed_competitor_ids:
continue
competitor_docs.append(doc)
elif "user" in doc_type:
user_docs.append(doc)
if not competitor_docs or not user_docs:
logger.info(
f"[{self.__class__.__name__}] Insufficient split for gap analysis: "
f"user_docs={len(user_docs)}, competitor_docs={len(competitor_docs)}"
)
return []
competitor_topics = self._extract_topic_density(competitor_docs)
user_topics = self._extract_topic_density(user_docs)
gaps = []
for topic, competitor_density in competitor_topics.items():
user_density = user_topics.get(topic, 0.0)
density_gap = competitor_density - user_density
if density_gap <= 0.08:
continue
confidence = max(
0.0,
min(1.0, 0.35 + (density_gap * 1.5) + (competitor_density * 0.4))
)
priority = "high" if confidence >= 0.75 else "medium" if confidence >= 0.5 else "low"
gaps.append({
"topic": topic,
"priority": priority,
"reason": (
f"Competitors mention '{topic}' substantially more often "
f"(density {competitor_density:.2f} vs {user_density:.2f})."
),
"confidence": round(confidence, 3),
"topic_density": {
"competitor": round(competitor_density, 4),
"user": round(user_density, 4),
"gap": round(density_gap, 4)
},
"evidence": {
"competitor_sample_titles": self._sample_titles_for_topic(competitor_docs, topic),
"user_sample_titles": self._sample_titles_for_topic(user_docs, topic),
"competitor_doc_count": len(competitor_docs),
"user_doc_count": len(user_docs)
}
})
gaps.sort(
key=lambda item: (
item.get("confidence", 0),
item.get("topic_density", {}).get("gap", 0)
),
reverse=True
)
return gaps[:12]
except Exception as e: except Exception as e:
logger.error(f"[{self.__class__.__name__}] Failed to find semantic gaps: {e}") logger.error(f"[{self.__class__.__name__}] Failed to find semantic gaps: {e}")
logger.error(f"[{self.__class__.__name__}] Full traceback: {traceback.format_exc()}") logger.error(f"[{self.__class__.__name__}] Full traceback: {traceback.format_exc()}")
return [] return []
async def _fetch_index_documents(self) -> List[Dict[str, Any]]:
"""Fetch indexed documents and normalize metadata from txtai result objects."""
if not self.intelligence.is_initialized() or not self.intelligence.embeddings:
return []
embeddings = self.intelligence.embeddings
limit = 0
if hasattr(embeddings, "count"):
try:
limit = int(embeddings.count())
except Exception:
limit = 0
documents = []
candidate_queries = []
if limit > 0:
candidate_queries.extend([
f"select id, text, object from txtai limit {limit}",
f"select id, text, tags from txtai limit {limit}"
])
candidate_queries.extend(["marketing", "content", "seo", "strategy", "social media"])
seen_ids = set()
for query in candidate_queries:
try:
query_limit = limit if query.startswith("select") and limit > 0 else max(10, limit or 50)
rows = embeddings.search(query, limit=query_limit)
except Exception:
continue
for row in rows or []:
doc_id = str(row.get("id", ""))
dedupe_key = doc_id or str(hash(f"{row.get('text','')}::{row.get('score',0)}"))
if dedupe_key in seen_ids:
continue
seen_ids.add(dedupe_key)
documents.append({
"id": doc_id,
"text": row.get("text", "") or "",
"metadata": self._normalize_metadata(row)
})
if limit > 0 and len(documents) >= limit:
break
return documents
def _normalize_metadata(self, row: Dict[str, Any]) -> Dict[str, Any]:
"""Normalize metadata payloads from txtai search rows."""
for key in ("object", "tags", "metadata", "meta"):
payload = row.get(key)
if isinstance(payload, dict):
return payload
if isinstance(payload, str):
try:
parsed = json.loads(payload)
if isinstance(parsed, dict):
return parsed
except Exception:
continue
return {}
def _extract_topic_density(self, documents: List[Dict[str, Any]]) -> Dict[str, float]:
"""Extract topic density from document metadata and titles."""
topic_counter: Counter = Counter()
for doc in documents:
metadata = doc.get("metadata", {})
candidates = []
for key in ("topics", "topic", "keywords", "keyword", "tags", "category"):
value = metadata.get(key)
if isinstance(value, list):
candidates.extend([str(v) for v in value if v])
elif isinstance(value, str) and value.strip():
candidates.extend(re.split(r"[,|/]", value))
title = metadata.get("title") or doc.get("text", "")[:120]
if title:
title_tokens = re.findall(r"[a-zA-Z][a-zA-Z\-]{3,}", str(title).lower())
candidates.extend(title_tokens)
normalized = {
item.strip().lower() for item in candidates
if item and len(item.strip()) >= 4 and not item.strip().isdigit()
}
for topic in normalized:
topic_counter[topic] += 1
total_docs = max(1, len(documents))
return {
topic: count / total_docs
for topic, count in topic_counter.items()
if count >= 2
}
def _sample_titles_for_topic(self, documents: List[Dict[str, Any]], topic: str, limit: int = 3) -> List[str]:
"""Return sample titles for a topic."""
samples = []
topic_lower = topic.lower()
for doc in documents:
metadata = doc.get("metadata", {})
title = metadata.get("title") or doc.get("text", "")[:100]
if not title:
continue
haystack = f"{title} {json.dumps(metadata, default=str)}".lower()
if topic_lower in haystack:
samples.append(str(title))
if len(samples) >= limit:
break
return samples
class ContentGuardianAgent(SIFBaseAgent): class ContentGuardianAgent(SIFBaseAgent):
"""Agent for preventing cannibalization and ensuring content originality.""" """Agent for preventing cannibalization and ensuring content originality."""

View File

@@ -974,60 +974,36 @@ class SIFIntegrationService:
return pillars return pillars
async def _identify_semantic_gaps(self, website_data: Dict[str, Any]) -> List[Dict[str, Any]]: async def _identify_semantic_gaps(self, website_data: Dict[str, Any]) -> List[Dict[str, Any]]:
""" """Identify semantic gaps using StrategyArchitectAgent evidence-driven analysis."""
Identify semantic gaps in user content by comparing against competitor topics or industry standards.
Uses txtai semantic search to check coverage of key topics.
"""
gaps = []
try: try:
# 1. Determine target topics to check if not self.strategy_agent:
# In a real scenario, these come from competitor analysis or keyword research. from .sif_agents import StrategyArchitectAgent
# Here we extract potential topics from competitor data or use defaults. self.strategy_agent = StrategyArchitectAgent(self.intelligence_service, user_id=self.user_id)
competitors = website_data.get('competitors', [])
target_topics = []
# Placeholder: Extract topics from competitor names/descriptions if available
# For now, we'll use a mix of generic marketing topics and any provided tags
target_topics = [
"content strategy", "SEO optimization", "social media marketing",
"email campaigns", "brand storytelling", "customer retention",
"voice search", "video marketing", "influencer partnerships"
]
# Add specific topics from input if available
if 'target_keywords' in website_data:
target_topics.extend(website_data['target_keywords'])
# 2. Check coverage for each topic in the user's index competitor_ids = website_data.get("competitor_indices", []) or []
for topic in target_topics: gaps = await self.strategy_agent.find_semantic_gaps(competitor_indices=competitor_ids)
# Search the user's index
results = await self.intelligence_service.search(topic, limit=1) normalized_gaps = []
for gap in gaps:
# Check relevance density = gap.get("topic_density", {})
max_score = results[0]['score'] if results else 0.0 normalized_gaps.append({
"topic": gap.get("topic"),
# If relevance is low, it's a gap "priority": gap.get("priority", "medium"),
GAP_THRESHOLD = 0.45 "reason": gap.get("reason", "Competitor coverage gap"),
if max_score < GAP_THRESHOLD: "confidence": gap.get("confidence", 0.0),
gaps.append({ "current_coverage_score": density.get("user", 0.0),
"topic": topic, "competitor_coverage_score": density.get("competitor", 0.0),
"current_coverage_score": float(max_score), "gap_severity": gap.get("priority", "medium"),
"gap_severity": "high" if max_score < 0.2 else "medium", "suggested_action": f"Create dedicated content for '{gap.get('topic', 'this topic')}'",
"reason": "Low semantic relevance in current content index", "topic_density": density,
"suggested_action": f"Create dedicated content for '{topic}'" "evidence": gap.get("evidence", {})
}) })
# Sort by severity (lower score = higher severity) return normalized_gaps
gaps.sort(key=lambda x: x['current_coverage_score'])
return gaps[:5] # Return top 5 gaps
except Exception as e: except Exception as e:
logger.error(f"Error identifying semantic gaps: {e}") logger.error(f"Error identifying semantic gaps: {e}")
# Fallback to sample data if index search fails completely return []
return [
{"topic": "error_fallback", "reason": str(e), "current_coverage_score": 0.0}
]
async def _analyze_competitor_semantics(self, website_data: Dict[str, Any]) -> Dict[str, Any]: async def _analyze_competitor_semantics(self, website_data: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze competitor semantic positioning.""" """Analyze competitor semantic positioning."""

View File

@@ -52,9 +52,9 @@ class SIFOnboardingIntegration:
self.harvester = SemanticHarvesterService() self.harvester = SemanticHarvesterService()
# Initialize agents # Initialize agents
self.strategy_agent = StrategyArchitectAgent(self.intelligence) self.strategy_agent = StrategyArchitectAgent(self.intelligence, user_id)
self.guardian_agent = ContentGuardianAgent(self.intelligence) self.guardian_agent = ContentGuardianAgent(self.intelligence, user_id)
self.link_agent = LinkGraphAgent(self.intelligence) self.link_agent = LinkGraphAgent(self.intelligence, user_id)
logger.info(f"[SIFOnboarding] Initialized for user {user_id}") logger.info(f"[SIFOnboarding] Initialized for user {user_id}")
@@ -254,7 +254,23 @@ class SIFOnboardingIntegration:
"priority": "high", "priority": "high",
"title": "Fill Content Gaps", "title": "Fill Content Gaps",
"description": f"Competitors are covering {len(semantic_gaps)} topics you haven't addressed.", "description": f"Competitors are covering {len(semantic_gaps)} topics you haven't addressed.",
"action_items": [f"Create content about: {gap.get('topic', 'Unknown topic')}" for gap in semantic_gaps[:5]] "action_items": [
(
f"Create content about: {gap.get('topic', 'Unknown topic')} "
f"({gap.get('priority', 'medium')} priority) - {gap.get('reason', 'Coverage gap identified')}"
)
for gap in semantic_gaps[:5]
],
"evidence": [
{
"topic": gap.get("topic"),
"priority": gap.get("priority"),
"confidence": gap.get("confidence"),
"topic_density": gap.get("topic_density"),
"competitor_sample_titles": gap.get("evidence", {}).get("competitor_sample_titles", [])
}
for gap in semantic_gaps[:5]
]
}) })
# Theme-based recommendations # Theme-based recommendations
@@ -448,4 +464,4 @@ async def discover_competitors(request: CompetitorDiscoveryRequest, user=Depends
"content_analysis": enhanced_results["content_analysis"], "content_analysis": enhanced_results["content_analysis"],
"strategic_recommendations": enhanced_results["semantic_insights"]["strategic_recommendations"] "strategic_recommendations": enhanced_results["semantic_insights"]["strategic_recommendations"]
} }
""" """