Added enhanced linguistic analyzer and persona quality improver

2025-09-14 09:53:27 +05:30
parent c63148e1ce
commit 1460ce3cb6
35 changed files with 4446 additions and 118 deletions
--- a/backend/services/blog_writer/content/source_url_manager.py
+++ b/backend/services/blog_writer/content/source_url_manager.py
@@ -0,0 +1,73 @@
+"""
+SourceURLManager - selects the most relevant source URLs for a section.
+
+Low-effort heuristic using keywords and titles; safe defaults if no research.
+"""
+
+from typing import List, Dict, Any
+
+
+class SourceURLManager:
+    """Pick the research source URLs that best match a section's keywords."""
+
+    def pick_relevant_urls(self, section: Any, research: Any, limit: int = 5) -> List[str]:
+        """Return up to ``limit`` unique source URLs, best match first.
+
+        Each source scores one point per section keyword found in its
+        lowercased title, plus a small bonus for ``https://`` URLs. Sources
+        with equal scores keep the order in which they were listed.
+
+        Args:
+            section: Object with an optional ``keywords`` iterable of strings.
+            research: Object with an optional ``sources`` iterable whose items
+                are dicts or objects exposing ``url`` (or ``uri``) and ``title``.
+            limit: Maximum number of URLs to return; values <= 0 yield ``[]``.
+
+        Returns:
+            De-duplicated URLs ordered by descending score; an empty list when
+            there is nothing to rank.
+        """
+        sources = getattr(research, 'sources', None) if research else None
+        if not sources or limit <= 0:
+            return []
+
+        keywords = {
+            kw.lower()
+            for kw in (getattr(section, 'keywords', None) or [])
+            if isinstance(kw, str) and kw
+        }
+
+        scored = []
+        for source in sources:
+            url = self._read_field(source, 'url') or self._read_field(source, 'uri')
+            if not url or not isinstance(url, str):
+                continue
+            title = self._read_field(source, 'title')
+            title_l = title.lower() if isinstance(title, str) else ''
+            # Simple overlap score: one point per keyword present in the title.
+            score = float(sum(1 for kw in keywords if kw in title_l))
+            # Prefer https lightly.
+            if url.startswith('https://'):
+                score += 0.2
+            scored.append((score, url))
+
+        # list.sort is stable, so equally scored sources keep their input order.
+        scored.sort(key=lambda pair: pair[0], reverse=True)
+
+        picked: List[str] = []
+        seen = set()
+        for _, url in scored:
+            if url in seen:
+                continue
+            seen.add(url)
+            picked.append(url)
+            if len(picked) >= limit:
+                break
+        return picked
+
+    @staticmethod
+    def _read_field(source: Any, name: str) -> Any:
+        """Read ``name`` from a dict-style or attribute-style source, else None."""
+        if isinstance(source, dict):
+            return source.get(name)
+        return getattr(source, name, None)