Base code
backend/services/component_logic/__init__.py (Normal file, 19 lines added)
@@ -0,0 +1,19 @@
"""Component Logic Services for ALwrity Backend.

This module contains business logic extracted from legacy Streamlit components
and converted to reusable FastAPI services.
"""

from .ai_research_logic import AIResearchLogic
from .personalization_logic import PersonalizationLogic
from .research_utilities import ResearchUtilities
from .style_detection_logic import StyleDetectionLogic
from .web_crawler_logic import WebCrawlerLogic

__all__ = [
    "AIResearchLogic",
    "PersonalizationLogic",
    "ResearchUtilities",
    "StyleDetectionLogic",
    "WebCrawlerLogic"
]
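For orientation, a minimal usage sketch of the package exports above. The import path assumes the backend/ directory is on PYTHONPATH, which is an assumption for illustration and not something this commit configures.

# Illustrative only: instantiate the services re-exported by this package.
from services.component_logic import AIResearchLogic, PersonalizationLogic, WebCrawlerLogic

research_logic = AIResearchLogic()
personalization_logic = PersonalizationLogic()
crawler = WebCrawlerLogic()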
backend/services/component_logic/ai_research_logic.py (Normal file, 268 lines added)
@@ -0,0 +1,268 @@
|
||||
"""AI Research Logic Service for ALwrity Backend.
|
||||
|
||||
This service handles business logic for AI research configuration and user information
|
||||
validation, extracted from the legacy Streamlit component.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
class AIResearchLogic:
|
||||
"""Business logic for AI research configuration and user information."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the AI Research Logic service."""
|
||||
self.valid_roles = ["Content Creator", "Marketing Manager", "Business Owner", "Other"]
|
||||
self.valid_research_depths = ["Basic", "Standard", "Deep", "Comprehensive"]
|
||||
self.valid_content_types = ["Blog Posts", "Social Media", "Technical Articles", "News", "Academic Papers"]
|
||||
|
||||
def validate_user_info(self, user_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Validate user information for AI research configuration.
|
||||
|
||||
Args:
|
||||
user_data: Dictionary containing user information
|
||||
|
||||
Returns:
|
||||
Dict containing validation results
|
||||
"""
|
||||
try:
|
||||
logger.info("Validating user information for AI research")
|
||||
|
||||
errors = []
|
||||
validated_data = {}
|
||||
|
||||
# Validate full name
|
||||
full_name = user_data.get('full_name', '').strip()
|
||||
if not full_name or len(full_name) < 2:
|
||||
errors.append("Full name must be at least 2 characters long")
|
||||
else:
|
||||
validated_data['full_name'] = full_name
|
||||
|
||||
# Validate email
|
||||
email = user_data.get('email', '').strip().lower()
|
||||
email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
|
||||
if not email_pattern.match(email):
|
||||
errors.append("Invalid email format")
|
||||
else:
|
||||
validated_data['email'] = email
|
||||
|
||||
# Validate company
|
||||
company = user_data.get('company', '').strip()
|
||||
if not company:
|
||||
errors.append("Company name is required")
|
||||
else:
|
||||
validated_data['company'] = company
|
||||
|
||||
# Validate role
|
||||
role = user_data.get('role', '')
|
||||
if role not in self.valid_roles:
|
||||
errors.append(f"Role must be one of: {', '.join(self.valid_roles)}")
|
||||
else:
|
||||
validated_data['role'] = role
|
||||
|
||||
# Determine validation result
|
||||
is_valid = len(errors) == 0
|
||||
|
||||
if is_valid:
|
||||
logger.info("User information validation successful")
|
||||
validated_data['validated_at'] = datetime.now().isoformat()
|
||||
else:
|
||||
logger.warning(f"User information validation failed: {errors}")
|
||||
|
||||
return {
|
||||
'valid': is_valid,
|
||||
'user_info': validated_data if is_valid else None,
|
||||
'errors': errors
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating user information: {str(e)}")
|
||||
return {
|
||||
'valid': False,
|
||||
'user_info': None,
|
||||
'errors': [f"Validation error: {str(e)}"]
|
||||
}
|
||||
|
||||
def configure_research_preferences(self, preferences: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Configure research preferences for AI research.
|
||||
|
||||
Args:
|
||||
preferences: Dictionary containing research preferences
|
||||
|
||||
Returns:
|
||||
Dict containing configuration results
|
||||
"""
|
||||
try:
|
||||
logger.info("Configuring research preferences")
|
||||
|
||||
errors = []
|
||||
configured_preferences = {}
|
||||
|
||||
# Validate research depth
|
||||
research_depth = preferences.get('research_depth', '')
|
||||
if research_depth not in self.valid_research_depths:
|
||||
errors.append(f"Research depth must be one of: {', '.join(self.valid_research_depths)}")
|
||||
else:
|
||||
configured_preferences['research_depth'] = research_depth
|
||||
|
||||
# Validate content types
|
||||
content_types = preferences.get('content_types', [])
|
||||
if not content_types:
|
||||
errors.append("At least one content type must be selected")
|
||||
else:
|
||||
invalid_types = [ct for ct in content_types if ct not in self.valid_content_types]
|
||||
if invalid_types:
|
||||
errors.append(f"Invalid content types: {', '.join(invalid_types)}")
|
||||
else:
|
||||
configured_preferences['content_types'] = content_types
|
||||
|
||||
# Validate auto research setting
|
||||
auto_research = preferences.get('auto_research', False)
|
||||
if not isinstance(auto_research, bool):
|
||||
errors.append("Auto research must be a boolean value")
|
||||
else:
|
||||
configured_preferences['auto_research'] = auto_research
|
||||
|
||||
# Determine configuration result
|
||||
is_valid = len(errors) == 0
|
||||
|
||||
if is_valid:
|
||||
logger.info("Research preferences configuration successful")
|
||||
configured_preferences['configured_at'] = datetime.now().isoformat()
|
||||
else:
|
||||
logger.warning(f"Research preferences configuration failed: {errors}")
|
||||
|
||||
return {
|
||||
'valid': is_valid,
|
||||
'preferences': configured_preferences if is_valid else None,
|
||||
'errors': errors
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error configuring research preferences: {str(e)}")
|
||||
return {
|
||||
'valid': False,
|
||||
'preferences': None,
|
||||
'errors': [f"Configuration error: {str(e)}"]
|
||||
}
|
||||
|
||||
def process_research_request(self, topic: str, preferences: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Process a research request with configured preferences.
|
||||
|
||||
Args:
|
||||
topic: The research topic
|
||||
preferences: Configured research preferences
|
||||
|
||||
Returns:
|
||||
Dict containing research processing results
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Processing research request for topic: {topic}")
|
||||
|
||||
# Validate topic
|
||||
if not topic or len(topic.strip()) < 3:
|
||||
return {
|
||||
'success': False,
|
||||
'topic': topic,
|
||||
'error': 'Topic must be at least 3 characters long'
|
||||
}
|
||||
|
||||
# Validate preferences
|
||||
if not preferences:
|
||||
return {
|
||||
'success': False,
|
||||
'topic': topic,
|
||||
'error': 'Research preferences are required'
|
||||
}
|
||||
|
||||
# Process research based on preferences
|
||||
research_depth = preferences.get('research_depth', 'Standard')
|
||||
content_types = preferences.get('content_types', [])
|
||||
auto_research = preferences.get('auto_research', False)
|
||||
|
||||
# Simulate research processing (in real implementation, this would call AI services)
|
||||
research_results = {
|
||||
'topic': topic,
|
||||
'research_depth': research_depth,
|
||||
'content_types': content_types,
|
||||
'auto_research': auto_research,
|
||||
'processed_at': datetime.now().isoformat(),
|
||||
'status': 'processed'
|
||||
}
|
||||
|
||||
logger.info(f"Research request processed successfully for topic: {topic}")
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'topic': topic,
|
||||
'results': research_results
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing research request: {str(e)}")
|
||||
return {
|
||||
'success': False,
|
||||
'topic': topic,
|
||||
'error': f"Processing error: {str(e)}"
|
||||
}
|
||||
|
||||
def get_research_configuration_options(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get available configuration options for research.
|
||||
|
||||
Returns:
|
||||
Dict containing all available options
|
||||
"""
|
||||
return {
|
||||
'roles': self.valid_roles,
|
||||
'research_depths': self.valid_research_depths,
|
||||
'content_types': self.valid_content_types,
|
||||
'auto_research_options': [True, False]
|
||||
}
|
||||
|
||||
def validate_complete_research_setup(self, user_info: Dict[str, Any], preferences: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Validate complete research setup including user info and preferences.
|
||||
|
||||
Args:
|
||||
user_info: User information dictionary
|
||||
preferences: Research preferences dictionary
|
||||
|
||||
Returns:
|
||||
Dict containing complete validation results
|
||||
"""
|
||||
try:
|
||||
logger.info("Validating complete research setup")
|
||||
|
||||
# Validate user information
|
||||
user_validation = self.validate_user_info(user_info)
|
||||
|
||||
# Validate research preferences
|
||||
preferences_validation = self.configure_research_preferences(preferences)
|
||||
|
||||
# Combine results
|
||||
all_errors = user_validation.get('errors', []) + preferences_validation.get('errors', [])
|
||||
is_complete = user_validation.get('valid', False) and preferences_validation.get('valid', False)
|
||||
|
||||
return {
|
||||
'complete': is_complete,
|
||||
'user_info_valid': user_validation.get('valid', False),
|
||||
'preferences_valid': preferences_validation.get('valid', False),
|
||||
'errors': all_errors,
|
||||
'user_info': user_validation.get('user_info'),
|
||||
'preferences': preferences_validation.get('preferences')
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating complete research setup: {str(e)}")
|
||||
return {
|
||||
'complete': False,
|
||||
'user_info_valid': False,
|
||||
'preferences_valid': False,
|
||||
'errors': [f"Setup validation error: {str(e)}"]
|
||||
}
|
||||
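A short sketch of how the AIResearchLogic service above might be driven end to end; the sample user data and research topic are invented for illustration only, and the option values are taken from the valid lists defined in __init__.

# Illustrative only: validate a complete setup, then process a research request.
logic = AIResearchLogic()
setup = logic.validate_complete_research_setup(
    user_info={
        "full_name": "Jane Doe",
        "email": "jane@example.com",
        "company": "Acme Inc",
        "role": "Content Creator",
    },
    preferences={
        "research_depth": "Standard",
        "content_types": ["Blog Posts"],
        "auto_research": True,
    },
)
if setup["complete"]:
    outcome = logic.process_research_request("AI in content marketing", setup["preferences"])
else:
    print(setup["errors"])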
backend/services/component_logic/personalization_logic.py (Normal file, 337 lines added)
@@ -0,0 +1,337 @@
|
||||
"""Personalization Logic Service for ALwrity Backend.
|
||||
|
||||
This service handles business logic for content personalization settings,
|
||||
extracted from the legacy Streamlit component.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
from datetime import datetime
|
||||
|
||||
class PersonalizationLogic:
|
||||
"""Business logic for content personalization and brand voice configuration."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Personalization Logic service."""
|
||||
self.valid_writing_styles = ["Professional", "Casual", "Technical", "Conversational", "Academic"]
|
||||
self.valid_tones = ["Formal", "Semi-Formal", "Neutral", "Friendly", "Humorous"]
|
||||
self.valid_content_lengths = ["Concise", "Standard", "Detailed", "Comprehensive"]
|
||||
self.valid_personality_traits = ["Professional", "Innovative", "Friendly", "Trustworthy", "Creative", "Expert"]
|
||||
self.valid_readability_levels = ["Simple", "Standard", "Advanced", "Expert"]
|
||||
self.valid_content_structures = ["Introduction", "Key Points", "Examples", "Conclusion", "Call-to-Action"]
|
||||
|
||||
def validate_content_style(self, style_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Validate content style configuration.
|
||||
|
||||
Args:
|
||||
style_data: Dictionary containing content style settings
|
||||
|
||||
Returns:
|
||||
Dict containing validation results
|
||||
"""
|
||||
try:
|
||||
logger.info("Validating content style configuration")
|
||||
|
||||
errors = []
|
||||
validated_style = {}
|
||||
|
||||
# Validate writing style
|
||||
writing_style = style_data.get('writing_style', '')
|
||||
if writing_style not in self.valid_writing_styles:
|
||||
errors.append(f"Writing style must be one of: {', '.join(self.valid_writing_styles)}")
|
||||
else:
|
||||
validated_style['writing_style'] = writing_style
|
||||
|
||||
# Validate tone
|
||||
tone = style_data.get('tone', '')
|
||||
if tone not in self.valid_tones:
|
||||
errors.append(f"Tone must be one of: {', '.join(self.valid_tones)}")
|
||||
else:
|
||||
validated_style['tone'] = tone
|
||||
|
||||
# Validate content length
|
||||
content_length = style_data.get('content_length', '')
|
||||
if content_length not in self.valid_content_lengths:
|
||||
errors.append(f"Content length must be one of: {', '.join(self.valid_content_lengths)}")
|
||||
else:
|
||||
validated_style['content_length'] = content_length
|
||||
|
||||
# Determine validation result
|
||||
is_valid = len(errors) == 0
|
||||
|
||||
if is_valid:
|
||||
logger.info("Content style validation successful")
|
||||
validated_style['validated_at'] = datetime.now().isoformat()
|
||||
else:
|
||||
logger.warning(f"Content style validation failed: {errors}")
|
||||
|
||||
return {
|
||||
'valid': is_valid,
|
||||
'style_config': validated_style if is_valid else None,
|
||||
'errors': errors
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating content style: {str(e)}")
|
||||
return {
|
||||
'valid': False,
|
||||
'style_config': None,
|
||||
'errors': [f"Style validation error: {str(e)}"]
|
||||
}
|
||||
|
||||
def configure_brand_voice(self, brand_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Configure brand voice settings.
|
||||
|
||||
Args:
|
||||
brand_data: Dictionary containing brand voice settings
|
||||
|
||||
Returns:
|
||||
Dict containing configuration results
|
||||
"""
|
||||
try:
|
||||
logger.info("Configuring brand voice settings")
|
||||
|
||||
errors = []
|
||||
configured_brand = {}
|
||||
|
||||
# Validate personality traits
|
||||
personality_traits = brand_data.get('personality_traits', [])
|
||||
if not personality_traits:
|
||||
errors.append("At least one personality trait must be selected")
|
||||
else:
|
||||
invalid_traits = [trait for trait in personality_traits if trait not in self.valid_personality_traits]
|
||||
if invalid_traits:
|
||||
errors.append(f"Invalid personality traits: {', '.join(invalid_traits)}")
|
||||
else:
|
||||
configured_brand['personality_traits'] = personality_traits
|
||||
|
||||
# Validate voice description (optional but if provided, must be valid)
|
||||
voice_description = brand_data.get('voice_description', '').strip()
|
||||
if voice_description and len(voice_description) < 10:
|
||||
errors.append("Voice description must be at least 10 characters long")
|
||||
elif voice_description:
|
||||
configured_brand['voice_description'] = voice_description
|
||||
|
||||
# Validate keywords (optional)
|
||||
keywords = brand_data.get('keywords', '').strip()
|
||||
if keywords:
|
||||
configured_brand['keywords'] = keywords
|
||||
|
||||
# Determine configuration result
|
||||
is_valid = len(errors) == 0
|
||||
|
||||
if is_valid:
|
||||
logger.info("Brand voice configuration successful")
|
||||
configured_brand['configured_at'] = datetime.now().isoformat()
|
||||
else:
|
||||
logger.warning(f"Brand voice configuration failed: {errors}")
|
||||
|
||||
return {
|
||||
'valid': is_valid,
|
||||
'brand_config': configured_brand if is_valid else None,
|
||||
'errors': errors
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error configuring brand voice: {str(e)}")
|
||||
return {
|
||||
'valid': False,
|
||||
'brand_config': None,
|
||||
'errors': [f"Brand configuration error: {str(e)}"]
|
||||
}
|
||||
|
||||
def process_advanced_settings(self, settings: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Process advanced content generation settings.
|
||||
|
||||
Args:
|
||||
settings: Dictionary containing advanced settings
|
||||
|
||||
Returns:
|
||||
Dict containing processing results
|
||||
"""
|
||||
try:
|
||||
logger.info("Processing advanced content generation settings")
|
||||
|
||||
errors = []
|
||||
processed_settings = {}
|
||||
|
||||
# Validate SEO optimization (boolean)
|
||||
seo_optimization = settings.get('seo_optimization', False)
|
||||
if not isinstance(seo_optimization, bool):
|
||||
errors.append("SEO optimization must be a boolean value")
|
||||
else:
|
||||
processed_settings['seo_optimization'] = seo_optimization
|
||||
|
||||
# Validate readability level
|
||||
readability_level = settings.get('readability_level', '')
|
||||
if readability_level not in self.valid_readability_levels:
|
||||
errors.append(f"Readability level must be one of: {', '.join(self.valid_readability_levels)}")
|
||||
else:
|
||||
processed_settings['readability_level'] = readability_level
|
||||
|
||||
# Validate content structure
|
||||
content_structure = settings.get('content_structure', [])
|
||||
if not content_structure:
|
||||
errors.append("At least one content structure element must be selected")
|
||||
else:
|
||||
invalid_structures = [struct for struct in content_structure if struct not in self.valid_content_structures]
|
||||
if invalid_structures:
|
||||
errors.append(f"Invalid content structure elements: {', '.join(invalid_structures)}")
|
||||
else:
|
||||
processed_settings['content_structure'] = content_structure
|
||||
|
||||
# Determine processing result
|
||||
is_valid = len(errors) == 0
|
||||
|
||||
if is_valid:
|
||||
logger.info("Advanced settings processing successful")
|
||||
processed_settings['processed_at'] = datetime.now().isoformat()
|
||||
else:
|
||||
logger.warning(f"Advanced settings processing failed: {errors}")
|
||||
|
||||
return {
|
||||
'valid': is_valid,
|
||||
'advanced_settings': processed_settings if is_valid else None,
|
||||
'errors': errors
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing advanced settings: {str(e)}")
|
||||
return {
|
||||
'valid': False,
|
||||
'advanced_settings': None,
|
||||
'errors': [f"Advanced settings error: {str(e)}"]
|
||||
}
|
||||
|
||||
def process_personalization_settings(self, settings: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Process complete personalization settings including all components.
|
||||
|
||||
Args:
|
||||
settings: Dictionary containing complete personalization settings
|
||||
|
||||
Returns:
|
||||
Dict containing processing results
|
||||
"""
|
||||
try:
|
||||
logger.info("Processing complete personalization settings")
|
||||
|
||||
# Validate content style
|
||||
content_style = settings.get('content_style', {})
|
||||
style_validation = self.validate_content_style(content_style)
|
||||
|
||||
# Configure brand voice
|
||||
brand_voice = settings.get('brand_voice', {})
|
||||
brand_validation = self.configure_brand_voice(brand_voice)
|
||||
|
||||
# Process advanced settings
|
||||
advanced_settings = settings.get('advanced_settings', {})
|
||||
advanced_validation = self.process_advanced_settings(advanced_settings)
|
||||
|
||||
# Combine results
|
||||
all_errors = (
|
||||
style_validation.get('errors', []) +
|
||||
brand_validation.get('errors', []) +
|
||||
advanced_validation.get('errors', [])
|
||||
)
|
||||
|
||||
is_complete = (
|
||||
style_validation.get('valid', False) and
|
||||
brand_validation.get('valid', False) and
|
||||
advanced_validation.get('valid', False)
|
||||
)
|
||||
|
||||
if is_complete:
|
||||
# Combine all valid settings
|
||||
complete_settings = {
|
||||
'content_style': style_validation.get('style_config'),
|
||||
'brand_voice': brand_validation.get('brand_config'),
|
||||
'advanced_settings': advanced_validation.get('advanced_settings'),
|
||||
'processed_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
logger.info("Complete personalization settings processed successfully")
|
||||
|
||||
return {
|
||||
'valid': True,
|
||||
'settings': complete_settings,
|
||||
'errors': []
|
||||
}
|
||||
else:
|
||||
logger.warning(f"Personalization settings processing failed: {all_errors}")
|
||||
|
||||
return {
|
||||
'valid': False,
|
||||
'settings': None,
|
||||
'errors': all_errors
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing personalization settings: {str(e)}")
|
||||
return {
|
||||
'valid': False,
|
||||
'settings': None,
|
||||
'errors': [f"Personalization processing error: {str(e)}"]
|
||||
}
|
||||
|
||||
def get_personalization_configuration_options(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get available configuration options for personalization.
|
||||
|
||||
Returns:
|
||||
Dict containing all available options
|
||||
"""
|
||||
return {
|
||||
'writing_styles': self.valid_writing_styles,
|
||||
'tones': self.valid_tones,
|
||||
'content_lengths': self.valid_content_lengths,
|
||||
'personality_traits': self.valid_personality_traits,
|
||||
'readability_levels': self.valid_readability_levels,
|
||||
'content_structures': self.valid_content_structures,
|
||||
'seo_optimization_options': [True, False]
|
||||
}
|
||||
|
||||
def generate_content_guidelines(self, settings: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate content guidelines based on personalization settings.
|
||||
|
||||
Args:
|
||||
settings: Validated personalization settings
|
||||
|
||||
Returns:
|
||||
Dict containing content guidelines
|
||||
"""
|
||||
try:
|
||||
logger.info("Generating content guidelines from personalization settings")
|
||||
|
||||
content_style = settings.get('content_style', {})
|
||||
brand_voice = settings.get('brand_voice', {})
|
||||
advanced_settings = settings.get('advanced_settings', {})
|
||||
|
||||
guidelines = {
|
||||
'writing_style': content_style.get('writing_style', 'Professional'),
|
||||
'tone': content_style.get('tone', 'Neutral'),
|
||||
'content_length': content_style.get('content_length', 'Standard'),
|
||||
'brand_personality': brand_voice.get('personality_traits', []),
|
||||
'seo_optimized': advanced_settings.get('seo_optimization', False),
|
||||
'readability_level': advanced_settings.get('readability_level', 'Standard'),
|
||||
'required_sections': advanced_settings.get('content_structure', []),
|
||||
'generated_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
logger.info("Content guidelines generated successfully")
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'guidelines': guidelines
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating content guidelines: {str(e)}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': f"Guidelines generation error: {str(e)}"
|
||||
}
|
||||
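A sketch of the expected call flow for PersonalizationLogic; the style, brand, and structure values below are examples chosen from the valid option lists defined in __init__, not recommended defaults.

# Illustrative only: process complete settings, then derive content guidelines.
logic = PersonalizationLogic()
settings = logic.process_personalization_settings({
    "content_style": {
        "writing_style": "Professional",
        "tone": "Friendly",
        "content_length": "Standard",
    },
    "brand_voice": {
        "personality_traits": ["Professional", "Trustworthy"],
        "voice_description": "Clear, practical, and grounded in real examples.",
    },
    "advanced_settings": {
        "seo_optimization": True,
        "readability_level": "Standard",
        "content_structure": ["Introduction", "Key Points", "Conclusion"],
    },
})
if settings["valid"]:
    guidelines = logic.generate_content_guidelines(settings["settings"])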
backend/services/component_logic/research_utilities.py (Normal file, 325 lines added)
@@ -0,0 +1,325 @@
|
||||
"""Research Utilities Service for ALwrity Backend.
|
||||
|
||||
This service handles research functionality and result processing,
|
||||
extracted from the legacy AI research utilities.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
class ResearchUtilities:
|
||||
"""Business logic for research functionality and result processing."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Research Utilities service."""
|
||||
self.research_providers = {
|
||||
'tavily': 'TAVILY_API_KEY',
|
||||
'serper': 'SERPER_API_KEY',
|
||||
'metaphor': 'METAPHOR_API_KEY',
|
||||
'firecrawl': 'FIRECRAWL_API_KEY'
|
||||
}
|
||||
|
||||
async def research_topic(self, topic: str, api_keys: Dict[str, str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Research a topic using available AI services.
|
||||
|
||||
Args:
|
||||
topic: The topic to research
|
||||
api_keys: Dictionary of API keys for different services
|
||||
|
||||
Returns:
|
||||
Dict containing research results and metadata
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Starting research on topic: {topic}")
|
||||
|
||||
# Validate topic
|
||||
if not topic or len(topic.strip()) < 3:
|
||||
return {
|
||||
'success': False,
|
||||
'topic': topic,
|
||||
'error': 'Topic must be at least 3 characters long'
|
||||
}
|
||||
|
||||
# Check available API keys
|
||||
available_providers = []
|
||||
for provider, key_name in self.research_providers.items():
|
||||
if api_keys.get(key_name):
|
||||
available_providers.append(provider)
|
||||
|
||||
if not available_providers:
|
||||
return {
|
||||
'success': False,
|
||||
'topic': topic,
|
||||
'error': 'No research providers available. Please configure API keys.'
|
||||
}
|
||||
|
||||
# Simulate research processing (in real implementation, this would call actual AI services)
|
||||
research_results = await self._simulate_research(topic, available_providers)
|
||||
|
||||
logger.info(f"Research completed successfully for topic: {topic}")
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'topic': topic,
|
||||
'results': research_results,
|
||||
'metadata': {
|
||||
'providers_used': available_providers,
|
||||
'research_timestamp': datetime.now().isoformat(),
|
||||
'topic_length': len(topic)
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during research: {str(e)}")
|
||||
return {
|
||||
'success': False,
|
||||
'topic': topic,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
async def _simulate_research(self, topic: str, providers: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Simulate research processing for demonstration purposes.
|
||||
In real implementation, this would call actual AI research services.
|
||||
|
||||
Args:
|
||||
topic: The research topic
|
||||
providers: List of available research providers
|
||||
|
||||
Returns:
|
||||
Dict containing simulated research results
|
||||
"""
|
||||
# Simulate async processing time
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Generate simulated research results
|
||||
results = {
|
||||
'summary': f"Comprehensive research summary for '{topic}' based on multiple sources.",
|
||||
'key_points': [
|
||||
f"Key insight 1 about {topic}",
|
||||
f"Important finding 2 related to {topic}",
|
||||
f"Notable trend 3 in {topic}",
|
||||
f"Critical observation 4 regarding {topic}"
|
||||
],
|
||||
'sources': [
|
||||
f"Research source 1 for {topic}",
|
||||
f"Academic paper on {topic}",
|
||||
f"Industry report about {topic}",
|
||||
f"Expert analysis of {topic}"
|
||||
],
|
||||
'trends': [
|
||||
f"Emerging trend in {topic}",
|
||||
f"Growing interest in {topic}",
|
||||
f"Market shift related to {topic}"
|
||||
],
|
||||
'recommendations': [
|
||||
f"Action item 1 for {topic}",
|
||||
f"Strategic recommendation for {topic}",
|
||||
f"Next steps regarding {topic}"
|
||||
],
|
||||
'providers_used': providers,
|
||||
'research_depth': 'comprehensive',
|
||||
'confidence_score': 0.85
|
||||
}
|
||||
|
||||
return results
|
||||
|
||||
def process_research_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Process and format research results for better presentation.
|
||||
|
||||
Args:
|
||||
results: Raw research results
|
||||
|
||||
Returns:
|
||||
Dict containing processed and formatted results
|
||||
"""
|
||||
try:
|
||||
logger.info("Processing research results")
|
||||
|
||||
if not results or 'success' not in results:
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'Invalid research results format'
|
||||
}
|
||||
|
||||
if not results.get('success', False):
|
||||
return results # Return error results as-is
|
||||
|
||||
# Process successful results
|
||||
raw_results = results.get('results', {})
|
||||
metadata = results.get('metadata', {})
|
||||
|
||||
# Format and structure the results
|
||||
processed_results = {
|
||||
'topic': results.get('topic', ''),
|
||||
'summary': raw_results.get('summary', ''),
|
||||
'key_insights': raw_results.get('key_points', []),
|
||||
'sources': raw_results.get('sources', []),
|
||||
'trends': raw_results.get('trends', []),
|
||||
'recommendations': raw_results.get('recommendations', []),
|
||||
'metadata': {
|
||||
'providers_used': raw_results.get('providers_used', []),
|
||||
'research_depth': raw_results.get('research_depth', 'standard'),
|
||||
'confidence_score': raw_results.get('confidence_score', 0.0),
|
||||
'processed_at': datetime.now().isoformat(),
|
||||
'original_timestamp': metadata.get('research_timestamp')
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Research results processed successfully")
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'processed_results': processed_results
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing research results: {str(e)}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': f"Results processing error: {str(e)}"
|
||||
}
|
||||
|
||||
def validate_research_request(self, topic: str, api_keys: Dict[str, str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Validate a research request before processing.
|
||||
|
||||
Args:
|
||||
topic: The research topic
|
||||
api_keys: Available API keys
|
||||
|
||||
Returns:
|
||||
Dict containing validation results
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Validating research request for topic: {topic}")
|
||||
|
||||
errors = []
|
||||
warnings = []
|
||||
|
||||
# Validate topic
|
||||
if not topic or len(topic.strip()) < 3:
|
||||
errors.append("Topic must be at least 3 characters long")
|
||||
elif len(topic.strip()) > 500:
|
||||
errors.append("Topic is too long (maximum 500 characters)")
|
||||
|
||||
# Check API keys
|
||||
available_providers = []
|
||||
for provider, key_name in self.research_providers.items():
|
||||
if api_keys.get(key_name):
|
||||
available_providers.append(provider)
|
||||
else:
|
||||
warnings.append(f"No API key for {provider}")
|
||||
|
||||
if not available_providers:
|
||||
errors.append("No research providers available. Please configure at least one API key.")
|
||||
|
||||
# Determine validation result
|
||||
is_valid = len(errors) == 0
|
||||
|
||||
return {
|
||||
'valid': is_valid,
|
||||
'errors': errors,
|
||||
'warnings': warnings,
|
||||
'available_providers': available_providers,
|
||||
'topic_length': len(topic.strip()) if topic else 0
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating research request: {str(e)}")
|
||||
return {
|
||||
'valid': False,
|
||||
'errors': [f"Validation error: {str(e)}"],
|
||||
'warnings': [],
|
||||
'available_providers': [],
|
||||
'topic_length': 0
|
||||
}
|
||||
|
||||
def get_research_providers_info(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get information about available research providers.
|
||||
|
||||
Returns:
|
||||
Dict containing provider information
|
||||
"""
|
||||
return {
|
||||
'providers': {
|
||||
'tavily': {
|
||||
'name': 'Tavily',
|
||||
'description': 'Intelligent web research',
|
||||
'api_key_name': 'TAVILY_API_KEY',
|
||||
'url': 'https://tavily.com/#api'
|
||||
},
|
||||
'serper': {
|
||||
'name': 'Serper',
|
||||
'description': 'Google search functionality',
|
||||
'api_key_name': 'SERPER_API_KEY',
|
||||
'url': 'https://serper.dev/signup'
|
||||
},
|
||||
'metaphor': {
|
||||
'name': 'Metaphor',
|
||||
'description': 'Advanced web search',
|
||||
'api_key_name': 'METAPHOR_API_KEY',
|
||||
'url': 'https://dashboard.exa.ai/login'
|
||||
},
|
||||
'firecrawl': {
|
||||
'name': 'Firecrawl',
|
||||
'description': 'Web content extraction',
|
||||
'api_key_name': 'FIRECRAWL_API_KEY',
|
||||
'url': 'https://www.firecrawl.dev/account'
|
||||
}
|
||||
},
|
||||
'total_providers': len(self.research_providers)
|
||||
}
|
||||
|
||||
def generate_research_report(self, results: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate a formatted research report from processed results.
|
||||
|
||||
Args:
|
||||
results: Processed research results
|
||||
|
||||
Returns:
|
||||
Dict containing formatted research report
|
||||
"""
|
||||
try:
|
||||
logger.info("Generating research report")
|
||||
|
||||
if not results.get('success', False):
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'Cannot generate report from failed research'
|
||||
}
|
||||
|
||||
processed_results = results.get('processed_results', {})
|
||||
|
||||
# Generate formatted report
|
||||
report = {
|
||||
'title': f"Research Report: {processed_results.get('topic', 'Unknown Topic')}",
|
||||
'executive_summary': processed_results.get('summary', ''),
|
||||
'key_findings': processed_results.get('key_insights', []),
|
||||
'trends_analysis': processed_results.get('trends', []),
|
||||
'recommendations': processed_results.get('recommendations', []),
|
||||
'sources': processed_results.get('sources', []),
|
||||
'metadata': processed_results.get('metadata', {}),
|
||||
'generated_at': datetime.now().isoformat(),
|
||||
'report_format': 'structured'
|
||||
}
|
||||
|
||||
logger.info("Research report generated successfully")
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'report': report
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating research report: {str(e)}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': f"Report generation error: {str(e)}"
|
||||
}
|
||||
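A sketch of driving ResearchUtilities from synchronous code; the API key value is a placeholder, and asyncio.run is just one possible way to execute the research_topic coroutine.

# Illustrative only: validate, research, then format a report.
import asyncio

utils = ResearchUtilities()
api_keys = {"TAVILY_API_KEY": "tvly-placeholder"}  # placeholder value for illustration

check = utils.validate_research_request("Generative AI for SEO", api_keys)
if check["valid"]:
    raw = asyncio.run(utils.research_topic("Generative AI for SEO", api_keys))
    processed = utils.process_research_results(raw)
    report = utils.generate_research_report(processed)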
backend/services/component_logic/style_detection_logic.py (Normal file, 424 lines added)
@@ -0,0 +1,424 @@
|
||||
"""Style Detection Logic Service for ALwrity Backend.
|
||||
|
||||
This service handles business logic for content style detection and analysis,
|
||||
migrated from the legacy StyleAnalyzer functionality.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
from datetime import datetime
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add the backend directory to Python path for absolute imports
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
||||
|
||||
# Import the new backend LLM providers from services
|
||||
from ..llm_providers.main_text_generation import llm_text_gen
|
||||
|
||||
class StyleDetectionLogic:
|
||||
"""Business logic for content style detection and analysis."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Style Detection Logic service."""
|
||||
logger.info("[StyleDetectionLogic.__init__] Initializing style detection service")
|
||||
|
||||
def _clean_json_response(self, text: str) -> str:
|
||||
"""
|
||||
Clean the LLM response to extract valid JSON.
|
||||
|
||||
Args:
|
||||
text (str): Raw response from LLM
|
||||
|
||||
Returns:
|
||||
str: Cleaned JSON string
|
||||
"""
|
||||
try:
|
||||
# Remove markdown code block markers
|
||||
cleaned_string = text.replace("```json", "").replace("```", "").strip()
|
||||
|
||||
# Log the cleaned JSON for debugging
|
||||
logger.debug(f"[StyleDetectionLogic._clean_json_response] Cleaned JSON: {cleaned_string}")
|
||||
|
||||
return cleaned_string
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[StyleDetectionLogic._clean_json_response] Error cleaning response: {str(e)}")
|
||||
return ""
|
||||
|
||||
def analyze_content_style(self, content: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze the style of the provided content using AI with enhanced prompts.
|
||||
|
||||
Args:
|
||||
content (Dict): Content to analyze, containing main_content, title, etc.
|
||||
|
||||
Returns:
|
||||
Dict: Analysis results with writing style, characteristics, and recommendations
|
||||
"""
|
||||
try:
|
||||
logger.info("[StyleDetectionLogic.analyze_content_style] Starting enhanced style analysis")
|
||||
|
||||
# Extract content components
|
||||
title = content.get('title', '')
|
||||
description = content.get('description', '')
|
||||
main_content = content.get('main_content', '')
|
||||
headings = content.get('headings', [])
|
||||
domain_info = content.get('domain_info', {})
|
||||
brand_info = content.get('brand_info', {})
|
||||
social_media = content.get('social_media', {})
|
||||
content_structure = content.get('content_structure', {})
|
||||
|
||||
# Construct the enhanced analysis prompt (strict JSON, minified, stable keys)
|
||||
prompt = f"""Analyze the following website content for comprehensive writing style, tone, and characteristics for personalization and AI generation.
|
||||
|
||||
RULES:
|
||||
- Return ONE single-line MINIFIED JSON object only. No markdown, code fences, comments, or prose.
|
||||
- Use EXACTLY the keys and ordering from the schema below. Do not add extra top-level keys.
|
||||
- For unknown/unavailable fields use empty string "" or empty array [] and explain in meta.uncertainty.
|
||||
- Keep text concise; avoid repeating input text.
|
||||
- Assume token budget; consider only first 5000 chars of main_content and first 10 headings.
|
||||
|
||||
WEBSITE INFORMATION:
|
||||
- Domain: {domain_info.get('domain_name', 'Unknown')}
|
||||
- Website Type: {self._determine_website_type(domain_info)}
|
||||
- Brand Name: {brand_info.get('company_name', 'Not specified')}
|
||||
- Tagline: {brand_info.get('tagline', 'Not specified')}
|
||||
- Social Media Presence: {', '.join(social_media.keys()) if social_media else 'None detected'}
|
||||
|
||||
CONTENT STRUCTURE:
|
||||
- Headings: {len(headings)} total ({content_structure.get('headings', {}).get('h1', 0)} H1, {content_structure.get('headings', {}).get('h2', 0)} H2)
|
||||
- Paragraphs: {content_structure.get('paragraphs', 0)}
|
||||
- Images: {content_structure.get('images', 0)}
|
||||
- Links: {content_structure.get('links', 0)}
|
||||
- Has Navigation: {content_structure.get('has_navigation', False)}
|
||||
- Has Call-to-Action: {content_structure.get('has_call_to_action', False)}
|
||||
|
||||
CONTENT TO ANALYZE:
|
||||
- Title: {title}
|
||||
- Description: {description}
|
||||
- Main Content (truncated): {main_content[:5000]}
|
||||
- Key Headings (first 10): {headings[:10]}
|
||||
|
||||
ANALYSIS REQUIREMENTS:
|
||||
1. Analyze the writing style, tone, and voice characteristics
|
||||
2. Identify target audience demographics and expertise level
|
||||
3. Determine content type and purpose
|
||||
4. Assess content structure and organization patterns
|
||||
5. Evaluate brand voice consistency and personality
|
||||
6. Identify unique style elements and patterns
|
||||
7. Consider the website type and industry context
|
||||
8. Analyze social media presence impact on content style
|
||||
|
||||
REQUIRED JSON SCHEMA (stable key order):
|
||||
{{
|
||||
"writing_style": {{
|
||||
"tone": "", "voice": "", "complexity": "", "engagement_level": "",
|
||||
"brand_personality": "", "formality_level": "", "emotional_appeal": ""
|
||||
}},
|
||||
"content_characteristics": {{
|
||||
"sentence_structure": "", "vocabulary_level": "", "paragraph_organization": "",
|
||||
"content_flow": "", "readability_score": "", "content_density": "",
|
||||
"visual_elements_usage": ""
|
||||
}},
|
||||
"target_audience": {{
|
||||
"demographics": [], "expertise_level": "", "industry_focus": "", "geographic_focus": "",
|
||||
"psychographic_profile": "", "pain_points": [], "motivations": []
|
||||
}},
|
||||
"content_type": {{
|
||||
"primary_type": "", "secondary_types": [], "purpose": "", "call_to_action": "",
|
||||
"conversion_focus": "", "educational_value": ""
|
||||
}},
|
||||
"brand_analysis": {{
|
||||
"brand_voice": "", "brand_values": [], "brand_positioning": "", "competitive_differentiation": "",
|
||||
"trust_signals": [], "authority_indicators": []
|
||||
}},
|
||||
"content_strategy_insights": {{
|
||||
"strengths": [], "weaknesses": [], "opportunities": [], "threats": [],
|
||||
"recommended_improvements": [], "content_gaps": []
|
||||
}},
|
||||
"recommended_settings": {{
|
||||
"writing_tone": "", "target_audience": "", "content_type": "", "creativity_level": "",
|
||||
"geographic_location": "", "industry_context": "", "brand_alignment": ""
|
||||
}},
|
||||
"meta": {{"schema_version": "1.1", "confidence": 0.0, "notes": "", "uncertainty": {{"fields": []}}}}
|
||||
}}
|
||||
"""
|
||||
|
||||
# Call the LLM for analysis
|
||||
logger.debug("[StyleDetectionLogic.analyze_content_style] Sending enhanced prompt to LLM")
|
||||
analysis_text = llm_text_gen(prompt)
|
||||
|
||||
# Clean and parse the response
|
||||
cleaned_json = self._clean_json_response(analysis_text)
|
||||
|
||||
try:
|
||||
analysis_results = json.loads(cleaned_json)
|
||||
logger.info("[StyleDetectionLogic.analyze_content_style] Successfully parsed enhanced analysis results")
|
||||
return {
|
||||
'success': True,
|
||||
'analysis': analysis_results
|
||||
}
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"[StyleDetectionLogic.analyze_content_style] Failed to parse JSON response: {e}")
|
||||
logger.debug(f"[StyleDetectionLogic.analyze_content_style] Raw response: {analysis_text}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'Failed to parse analysis response'
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[StyleDetectionLogic.analyze_content_style] Error in enhanced analysis: {str(e)}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
def _determine_website_type(self, domain_info: Dict[str, Any]) -> str:
|
||||
"""Determine the type of website based on domain and content analysis."""
|
||||
if domain_info.get('is_blog'):
|
||||
return 'Blog/Content Platform'
|
||||
elif domain_info.get('is_ecommerce'):
|
||||
return 'E-commerce/Online Store'
|
||||
elif domain_info.get('is_corporate'):
|
||||
return 'Corporate/Business Website'
|
||||
elif domain_info.get('has_blog_section'):
|
||||
return 'Business with Blog'
|
||||
elif domain_info.get('has_about_page') and domain_info.get('has_contact_page'):
|
||||
return 'Professional Services'
|
||||
else:
|
||||
return 'General Website'
|
||||
|
||||
def _get_fallback_analysis(self, content: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get fallback analysis when LLM analysis fails."""
|
||||
main_content = content.get("main_content", "")
|
||||
title = content.get("title", "")
|
||||
|
||||
# Simple content analysis based on content characteristics
|
||||
content_length = len(main_content)
|
||||
word_count = len(main_content.split())
|
||||
|
||||
# Determine tone based on content characteristics
|
||||
if any(word in main_content.lower() for word in ['professional', 'business', 'industry', 'company']):
|
||||
tone = "professional"
|
||||
elif any(word in main_content.lower() for word in ['casual', 'fun', 'enjoy', 'exciting']):
|
||||
tone = "casual"
|
||||
else:
|
||||
tone = "neutral"
|
||||
|
||||
# Determine complexity based on sentence length and vocabulary
|
||||
avg_sentence_length = word_count / max(len([s for s in main_content.split('.') if s.strip()]), 1)
|
||||
if avg_sentence_length > 20:
|
||||
complexity = "complex"
|
||||
elif avg_sentence_length > 15:
|
||||
complexity = "moderate"
|
||||
else:
|
||||
complexity = "simple"
|
||||
|
||||
return {
|
||||
"writing_style": {
|
||||
"tone": tone,
|
||||
"voice": "active",
|
||||
"complexity": complexity,
|
||||
"engagement_level": "medium"
|
||||
},
|
||||
"content_characteristics": {
|
||||
"sentence_structure": "standard",
|
||||
"vocabulary_level": "intermediate",
|
||||
"paragraph_organization": "logical",
|
||||
"content_flow": "smooth"
|
||||
},
|
||||
"target_audience": {
|
||||
"demographics": ["general audience"],
|
||||
"expertise_level": "intermediate",
|
||||
"industry_focus": "general",
|
||||
"geographic_focus": "global"
|
||||
},
|
||||
"content_type": {
|
||||
"primary_type": "article",
|
||||
"secondary_types": ["blog", "content"],
|
||||
"purpose": "inform",
|
||||
"call_to_action": "minimal"
|
||||
},
|
||||
"recommended_settings": {
|
||||
"writing_tone": tone,
|
||||
"target_audience": "general audience",
|
||||
"content_type": "article",
|
||||
"creativity_level": "medium",
|
||||
"geographic_location": "global"
|
||||
}
|
||||
}
|
||||
|
||||
def analyze_style_patterns(self, content: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze recurring patterns in the content style.
|
||||
|
||||
Args:
|
||||
content (Dict): Content to analyze
|
||||
|
||||
Returns:
|
||||
Dict: Pattern analysis results
|
||||
"""
|
||||
try:
|
||||
logger.info("[StyleDetectionLogic.analyze_style_patterns] Starting pattern analysis")
|
||||
|
||||
main_content = content.get("main_content", "")
|
||||
|
||||
prompt = f"""Analyze the content for recurring writing patterns and style characteristics.
|
||||
|
||||
RULES:
|
||||
- Return ONE single-line MINIFIED JSON object only. No markdown, code fences, comments, or prose.
|
||||
- Use EXACTLY the keys and ordering from the schema below. No extra top-level keys.
|
||||
- If uncertain, set empty values and list field names in meta.uncertainty.fields.
|
||||
- Keep responses concise and avoid quoting long input spans.
|
||||
|
||||
Content (truncated to 3000 chars): {main_content[:3000]}
|
||||
|
||||
REQUIRED JSON SCHEMA (stable key order):
|
||||
{{
|
||||
"patterns": {{
|
||||
"sentence_length": "", "vocabulary_patterns": [], "rhetorical_devices": [],
|
||||
"paragraph_structure": "", "transition_phrases": []
|
||||
}},
|
||||
"style_consistency": "",
|
||||
"unique_elements": [],
|
||||
"meta": {{"schema_version": "1.1", "confidence": 0.0, "notes": "", "uncertainty": {{"fields": []}}}}
|
||||
}}
|
||||
"""
|
||||
|
||||
analysis_text = llm_text_gen(prompt)
|
||||
cleaned_json = self._clean_json_response(analysis_text)
|
||||
|
||||
try:
|
||||
pattern_results = json.loads(cleaned_json)
|
||||
return {
|
||||
'success': True,
|
||||
'patterns': pattern_results
|
||||
}
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"[StyleDetectionLogic.analyze_style_patterns] Failed to parse JSON response: {e}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'Failed to parse pattern analysis response'
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[StyleDetectionLogic.analyze_style_patterns] Error during analysis: {str(e)}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
def generate_style_guidelines(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate comprehensive content guidelines based on enhanced style analysis.
|
||||
|
||||
Args:
|
||||
analysis_results (Dict): Results from enhanced style analysis
|
||||
|
||||
Returns:
|
||||
Dict: Generated comprehensive guidelines
|
||||
"""
|
||||
try:
|
||||
logger.info("[StyleDetectionLogic.generate_style_guidelines] Generating comprehensive style guidelines")
|
||||
|
||||
# Extract key information from analysis
|
||||
writing_style = analysis_results.get('writing_style', {})
|
||||
content_characteristics = analysis_results.get('content_characteristics', {})
|
||||
target_audience = analysis_results.get('target_audience', {})
|
||||
brand_analysis = analysis_results.get('brand_analysis', {})
|
||||
content_strategy_insights = analysis_results.get('content_strategy_insights', {})
|
||||
|
||||
prompt = f"""Generate actionable content creation guidelines based on the style analysis.
|
||||
|
||||
ANALYSIS DATA:
|
||||
Writing Style: {writing_style}
|
||||
Content Characteristics: {content_characteristics}
|
||||
Target Audience: {target_audience}
|
||||
Brand Analysis: {brand_analysis}
|
||||
Content Strategy Insights: {content_strategy_insights}
|
||||
|
||||
REQUIREMENTS:
|
||||
- Return ONE single-line MINIFIED JSON object only. No markdown, code fences, comments, or prose.
|
||||
- Use EXACTLY the keys and ordering from the schema below. No extra top-level keys.
|
||||
- Provide concise, implementation-ready bullets with an example for key items (e.g., tone and CTA examples).
|
||||
- Include negative guidance (what to avoid) tied to brand constraints where applicable.
|
||||
- If uncertain, set empty values and list field names in meta.uncertainty.fields.
|
||||
|
||||
IMPORTANT: REQUIRED JSON SCHEMA (stable key order):
|
||||
{{
|
||||
"guidelines": {{
|
||||
"tone_recommendations": [],
|
||||
"structure_guidelines": [],
|
||||
"vocabulary_suggestions": [],
|
||||
"engagement_tips": [],
|
||||
"audience_considerations": [],
|
||||
"brand_alignment": [],
|
||||
"seo_optimization": [],
|
||||
"conversion_optimization": []
|
||||
}},
|
||||
"best_practices": [],
|
||||
"avoid_elements": [],
|
||||
"content_strategy": "",
|
||||
"ai_generation_tips": [],
|
||||
"competitive_advantages": [],
|
||||
"content_calendar_suggestions": [],
|
||||
"meta": {{"schema_version": "1.1", "confidence": 0.0, "notes": "", "uncertainty": {{"fields": []}}}}
|
||||
}}
|
||||
"""
|
||||
|
||||
guidelines_text = llm_text_gen(prompt)
|
||||
cleaned_json = self._clean_json_response(guidelines_text)
|
||||
|
||||
try:
|
||||
guidelines = json.loads(cleaned_json)
|
||||
return {
|
||||
'success': True,
|
||||
'guidelines': guidelines
|
||||
}
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"[StyleDetectionLogic.generate_style_guidelines] Failed to parse JSON response: {e}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'Failed to parse guidelines response'
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[StyleDetectionLogic.generate_style_guidelines] Error generating guidelines: {str(e)}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
def validate_style_analysis_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Validate style analysis request data.
|
||||
|
||||
Args:
|
||||
request_data (Dict): Request data to validate
|
||||
|
||||
Returns:
|
||||
Dict: Validation results
|
||||
"""
|
||||
errors = []
|
||||
|
||||
# Check if content is provided
|
||||
if not request_data.get('content') and not request_data.get('url') and not request_data.get('text_sample'):
|
||||
errors.append("Content is required for style analysis")
|
||||
|
||||
# Check content length
|
||||
content = request_data.get('content', {})
|
||||
main_content = content.get('main_content', '')
|
||||
if len(main_content) < 50:
|
||||
errors.append("Content must be at least 50 characters long for meaningful analysis")
|
||||
|
||||
# Check for required fields
|
||||
if not content.get('title') and not content.get('main_content'):
|
||||
errors.append("Either title or main content must be provided")
|
||||
|
||||
return {
|
||||
'valid': len(errors) == 0,
|
||||
'errors': errors
|
||||
}
|
||||
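A sketch of the StyleDetectionLogic flow; analyze_content_style calls llm_text_gen, so a configured LLM provider is assumed, and the content dictionary below is an abbreviated example rather than real crawler output.

# Illustrative only: validate the request, analyze style, then derive guidelines.
detector = StyleDetectionLogic()
content = {
    "title": "Example Post",
    "description": "An illustrative page description.",
    "main_content": "At least fifty characters of real page text would normally go here for a meaningful analysis.",
    "headings": ["Introduction", "Why it matters"],
    "domain_info": {},
    "brand_info": {},
    "social_media": {},
    "content_structure": {},
}
check = detector.validate_style_analysis_request({"content": content})
if check["valid"]:
    analysis = detector.analyze_content_style(content)
    if analysis["success"]:
        guidelines = detector.generate_style_guidelines(analysis["analysis"])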
backend/services/component_logic/web_crawler_logic.py (Normal file, 584 lines added)
@@ -0,0 +1,584 @@
|
||||
"""Web Crawler Logic Service for ALwrity Backend.
|
||||
|
||||
This service handles business logic for web crawling and content extraction,
|
||||
migrated from the legacy web crawler functionality.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
from datetime import datetime
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import requests
|
||||
import re
|
||||
|
||||
class WebCrawlerLogic:
|
||||
"""Business logic for web crawling and content extraction."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Web Crawler Logic service."""
|
||||
logger.info("[WebCrawlerLogic.__init__] Initializing web crawler service")
|
||||
self.headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
self.timeout = 30
|
||||
self.max_content_length = 10000
|
||||
|
||||
def _validate_url(self, url: str) -> bool:
|
||||
"""
|
||||
Validate URL format and fix common formatting issues.
|
||||
|
||||
Args:
|
||||
url (str): URL to validate
|
||||
|
||||
Returns:
|
||||
bool: True if URL is valid
|
||||
"""
|
||||
try:
|
||||
# Clean and fix common URL issues
|
||||
cleaned_url = self._fix_url_format(url)
|
||||
|
||||
result = urlparse(cleaned_url)
|
||||
|
||||
# Check if we have both scheme and netloc
|
||||
if not all([result.scheme, result.netloc]):
|
||||
return False
|
||||
|
||||
# Additional validation for domain format
|
||||
domain = result.netloc
|
||||
if '.' not in domain or len(domain.split('.')[-1]) < 2:
|
||||
return False
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"[WebCrawlerLogic._validate_url] URL validation error: {str(e)}")
|
||||
return False
|
||||
|
||||
    def _fix_url_format(self, url: str) -> str:
        """
        Fix common URL formatting issues.

        Args:
            url (str): URL to fix

        Returns:
            str: Fixed URL
        """
        # Remove leading/trailing whitespace
        url = url.strip()

        # Repair a protocol that lost one of its slashes (e.g. "https:/example.com")
        if url.startswith('https:/') and not url.startswith('https://'):
            url = url.replace('https:/', 'https://', 1)
        elif url.startswith('http:/') and not url.startswith('http://'):
            url = url.replace('http:/', 'http://', 1)

        # Add protocol if missing
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Collapse an accidental triple slash after the protocol (e.g. "https:///example.com")
        if ':///' in url:
            url = url.replace(':///', '://', 1)

        logger.debug(f"[WebCrawlerLogic._fix_url_format] Fixed URL: {url}")
        return url
|
||||
|
||||

    async def crawl_website(self, url: str) -> Dict[str, Any]:
        """
        Crawl a website and extract its content asynchronously with enhanced data extraction.

        Args:
            url (str): The URL to crawl

        Returns:
            Dict: Extracted website content and metadata
        """
        try:
            logger.info(f"[WebCrawlerLogic.crawl_website] Starting enhanced crawl for URL: {url}")

            # Fix URL format first
            fixed_url = self._fix_url_format(url)
            logger.info(f"[WebCrawlerLogic.crawl_website] Fixed URL: {fixed_url}")

            # Validate URL
            if not self._validate_url(fixed_url):
                error_msg = f"Invalid URL format: {url}"
                logger.error(f"[WebCrawlerLogic.crawl_website] {error_msg}")
                return {
                    'success': False,
                    'error': error_msg
                }

            # Fetch the page content
            try:
                async with aiohttp.ClientSession(headers=self.headers, timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
                    async with session.get(fixed_url) as response:
                        if response.status == 200:
                            html_content = await response.text()
                            logger.debug("[WebCrawlerLogic.crawl_website] Successfully fetched HTML content")
                        else:
                            error_msg = f"Failed to fetch content: Status code {response.status}"
                            logger.error(f"[WebCrawlerLogic.crawl_website] {error_msg}")
                            return {
                                'success': False,
                                'error': error_msg
                            }
            except Exception as e:
                error_msg = f"Failed to fetch content from {fixed_url}: {str(e)}"
                logger.error(f"[WebCrawlerLogic.crawl_website] {error_msg}")
                return {
                    'success': False,
                    'error': error_msg
                }

            # Parse HTML with BeautifulSoup
            logger.debug("[WebCrawlerLogic.crawl_website] Parsing HTML content")
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract domain information
            domain_info = self._extract_domain_info(fixed_url, soup)

            # Extract enhanced main content
            main_content = self._extract_enhanced_content(soup)

            # Extract social media and brand information
            social_media = self._extract_social_media(soup)
            brand_info = self._extract_brand_information(soup)

            # Extract content structure and patterns
            content_structure = self._extract_content_structure(soup)

            # Extract content
            description_meta = soup.find('meta', {'name': 'description'})
            content = {
                'title': soup.title.get_text(strip=True) if soup.title else '',
                'description': description_meta.get('content', '').strip() if description_meta else '',
                'main_content': main_content,
                'headings': [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])],
                'links': [{'text': a.get_text(strip=True), 'href': urljoin(fixed_url, a.get('href', ''))} for a in soup.find_all('a', href=True)],
                'images': [{'alt': img.get('alt', '').strip(), 'src': urljoin(fixed_url, img.get('src', ''))} for img in soup.find_all('img', src=True)],
                'meta_tags': {
                    meta.get('name', meta.get('property', '')): meta.get('content', '').strip()
                    for meta in soup.find_all('meta')
                    if (meta.get('name') or meta.get('property')) and meta.get('content')
                },
                'domain_info': domain_info,
                'social_media': social_media,
                'brand_info': brand_info,
                'content_structure': content_structure
            }

            logger.debug(f"[WebCrawlerLogic.crawl_website] Extracted {len(content['links'])} links, {len(content['images'])} images, and {len(social_media)} social media links")

            logger.info("[WebCrawlerLogic.crawl_website] Successfully completed enhanced website crawl")
            return {
                'success': True,
                'content': content,
                'url': fixed_url,
                'timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            error_msg = f"Error crawling {url}: {str(e)}"
            logger.error(f"[WebCrawlerLogic.crawl_website] {error_msg}")
            return {
                'success': False,
                'error': str(e)
            }
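
    # Minimal usage sketch for crawl_website, assuming it is awaited from an async
    # FastAPI route handler (the route and variable names below are illustrative):
    #
    #   crawler = WebCrawlerLogic()
    #   result = await crawler.crawl_website("https://example.com")
    #   if result['success']:
    #       title = result['content']['title']
    #   else:
    #       detail = result['error']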

    def _extract_domain_info(self, url: str, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract domain-specific information."""
        try:
            domain = urlparse(url).netloc
            return {
                'domain': domain,
                'domain_name': domain.replace('www.', ''),
                'is_blog': any(keyword in domain.lower() for keyword in ['blog', 'medium', 'substack', 'wordpress']),
                'is_ecommerce': any(keyword in domain.lower() for keyword in ['shop', 'store', 'cart', 'buy', 'amazon', 'ebay']),
                'is_corporate': any(keyword in domain.lower() for keyword in ['corp', 'inc', 'llc', 'company', 'business']),
                'has_blog_section': bool(soup.find('a', href=re.compile(r'blog|news|articles', re.I))),
                'has_about_page': bool(soup.find('a', href=re.compile(r'about|company|team', re.I))),
                'has_contact_page': bool(soup.find('a', href=re.compile(r'contact|support|help', re.I)))
            }
        except Exception as e:
            logger.error(f"[WebCrawlerLogic._extract_domain_info] Error: {str(e)}")
            return {}
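
    # Rough shape of the dict returned above for a hypothetical
    # 'https://www.example-store.com' page (actual values depend on the HTML):
    #   {'domain': 'www.example-store.com', 'domain_name': 'example-store.com',
    #    'is_blog': False, 'is_ecommerce': True, 'is_corporate': False,
    #    'has_blog_section': True, 'has_about_page': True, 'has_contact_page': True}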

    def _extract_enhanced_content(self, soup: BeautifulSoup) -> str:
        """Extract enhanced main content with better structure detection."""
        try:
            # Try to find main content areas
            main_content_elements = []

            # Look for semantic content containers
            semantic_selectors = [
                'article', 'main', '[role="main"]',
                '.content', '.main-content', '.article', '.post',
                '.entry', '.page-content', '.site-content'
            ]

            for selector in semantic_selectors:
                elements = soup.select(selector)
                if elements:
                    main_content_elements.extend(elements)
                    break

            # If no semantic containers found, look for content-rich divs
            if not main_content_elements:
                content_divs = soup.find_all('div', class_=re.compile(r'content|main|article|post|entry', re.I))
                main_content_elements = content_divs

            # If still no content, get all paragraph text
            if not main_content_elements:
                main_content_elements = soup.find_all(['p', 'article', 'section'])

            # Extract text with better formatting
            content_parts = []
            for elem in main_content_elements:
                text = elem.get_text(separator=' ', strip=True)
                if text and len(text) > 20:  # Only include substantial text
                    content_parts.append(text)

            main_content = ' '.join(content_parts)

            # Limit content length
            if len(main_content) > self.max_content_length:
                main_content = main_content[:self.max_content_length] + "..."

            return main_content

        except Exception as e:
            logger.error(f"[WebCrawlerLogic._extract_enhanced_content] Error: {str(e)}")
            return ''

    def _extract_social_media(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract social media links and handles."""
        social_media = {}
        try:
            # Common social media patterns
            social_patterns = {
                'facebook': r'facebook\.com|fb\.com',
                'twitter': r'twitter\.com|x\.com',
                'linkedin': r'linkedin\.com',
                'instagram': r'instagram\.com',
                'youtube': r'youtube\.com|youtu\.be',
                'tiktok': r'tiktok\.com',
                'pinterest': r'pinterest\.com',
                'github': r'github\.com'
            }

            # Find all links
            links = soup.find_all('a', href=True)

            for link in links:
                href = link.get('href', '').lower()
                for platform, pattern in social_patterns.items():
                    if re.search(pattern, href):
                        social_media[platform] = href
                        break

            # Also check for social media meta tags
            meta_social = {
                'og:site_name': 'site_name',
                'twitter:site': 'twitter',
                'twitter:creator': 'twitter_creator'
            }

            for meta in soup.find_all('meta', property=True):
                prop = meta.get('property', '')
                if prop in meta_social:
                    social_media[meta_social[prop]] = meta.get('content', '')

            return social_media

        except Exception as e:
            logger.error(f"[WebCrawlerLogic._extract_social_media] Error: {str(e)}")
            return {}
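
    # For a page linking to 'https://twitter.com/acme' and declaring
    # <meta property="og:site_name" content="Acme">, the method above would return
    # roughly (illustrative only): {'twitter': 'https://twitter.com/acme', 'site_name': 'Acme'}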

    def _extract_brand_information(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract brand and company information."""
        brand_info = {}
        try:
            # Extract logo information
            logos = soup.find_all('img', alt=re.compile(r'logo|brand', re.I))
            if logos:
                brand_info['logo_alt'] = [logo.get('alt', '') for logo in logos]

            # Extract company name from various sources
            company_name_selectors = [
                'h1', '.logo', '.brand', '.company-name',
                '[class*="logo"]', '[class*="brand"]'
            ]

            for selector in company_name_selectors:
                elements = soup.select(selector)
                if elements:
                    brand_info['company_name'] = elements[0].get_text(strip=True)
                    break

            # Extract taglines and slogans
            tagline_selectors = [
                '.tagline', '.slogan', '.motto',
                '[class*="tagline"]', '[class*="slogan"]'
            ]

            for selector in tagline_selectors:
                elements = soup.select(selector)
                if elements:
                    brand_info['tagline'] = elements[0].get_text(strip=True)
                    break

            # Extract contact information
            contact_info = {}
            contact_patterns = {
                'email': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
                'phone': r'[\+]?[1-9][\d]{0,15}',
                'address': r'\d+\s+[a-zA-Z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd)'
            }

            for info_type, pattern in contact_patterns.items():
                matches = re.findall(pattern, soup.get_text())
                if matches:
                    contact_info[info_type] = matches[:3]  # Limit to first 3 matches

            brand_info['contact_info'] = contact_info

            return brand_info

        except Exception as e:
            logger.error(f"[WebCrawlerLogic._extract_brand_information] Error: {str(e)}")
            return {}

    def _extract_content_structure(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract content structure and patterns."""
        structure = {}
        try:
            # Count different content types
            structure['headings'] = {
                'h1': len(soup.find_all('h1')),
                'h2': len(soup.find_all('h2')),
                'h3': len(soup.find_all('h3')),
                'h4': len(soup.find_all('h4')),
                'h5': len(soup.find_all('h5')),
                'h6': len(soup.find_all('h6'))
            }

            structure['paragraphs'] = len(soup.find_all('p'))
            structure['lists'] = len(soup.find_all(['ul', 'ol']))
            structure['images'] = len(soup.find_all('img'))
            structure['links'] = len(soup.find_all('a'))

            # Analyze content sections
            sections = soup.find_all(['section', 'article', 'div'], class_=re.compile(r'section|article|content', re.I))
            structure['content_sections'] = len(sections)

            # Check for common content patterns
            structure['has_navigation'] = bool(soup.find(['nav', 'header']))
            structure['has_footer'] = bool(soup.find('footer'))
            structure['has_sidebar'] = bool(soup.find(class_=re.compile(r'sidebar|aside', re.I)))
            structure['has_call_to_action'] = bool(soup.find(text=re.compile(r'click|buy|sign|register|subscribe', re.I)))

            return structure

        except Exception as e:
            logger.error(f"[WebCrawlerLogic._extract_content_structure] Error: {str(e)}")
            return {}

    def extract_content_from_text(self, text: str) -> Dict[str, Any]:
        """
        Extract content from provided text sample.

        Args:
            text (str): Text content to process

        Returns:
            Dict: Processed content with metadata
        """
        try:
            logger.info("[WebCrawlerLogic.extract_content_from_text] Processing text content")

            # Clean and process text
            cleaned_text = re.sub(r'\s+', ' ', text.strip())

            # Split into sentences for analysis
            sentences = [s.strip() for s in cleaned_text.split('.') if s.strip()]

            # Extract basic metrics
            words = cleaned_text.split()
            word_count = len(words)
            sentence_count = len(sentences)
            avg_sentence_length = word_count / max(sentence_count, 1)

            content = {
                'title': 'Text Sample',
                'description': 'Content provided as text sample',
                'main_content': cleaned_text,
                'headings': [],
                'links': [],
                'images': [],
                'meta_tags': {},
                'metrics': {
                    'word_count': word_count,
                    'sentence_count': sentence_count,
                    'avg_sentence_length': avg_sentence_length,
                    'unique_words': len(set(words)),
                    'content_length': len(cleaned_text)
                }
            }

            logger.info("[WebCrawlerLogic.extract_content_from_text] Successfully processed text content")
            return {
                'success': True,
                'content': content,
                'timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            error_msg = f"Error processing text content: {str(e)}"
            logger.error(f"[WebCrawlerLogic.extract_content_from_text] {error_msg}")
            return {
                'success': False,
                'error': error_msg
            }
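
    # Worked example for extract_content_from_text on a tiny sample; the metrics shown
    # are what the code above computes for this exact (hypothetical) string:
    #   result = crawler.extract_content_from_text("Alwrity writes. Alwrity researches.")
    #   result['content']['metrics']  ->  word_count=4, sentence_count=2, avg_sentence_length=2.0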

    def validate_crawl_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate web crawl request data.

        Args:
            request_data (Dict): Request data to validate

        Returns:
            Dict: Validation results
        """
        try:
            logger.info("[WebCrawlerLogic.validate_crawl_request] Validating request")

            errors = []

            # Check for required fields
            url = request_data.get('url', '')
            text_sample = request_data.get('text_sample', '')

            if not url and not text_sample:
                errors.append("Either URL or text sample is required")

            if url and not self._validate_url(url):
                errors.append("Invalid URL format")

            if text_sample and len(text_sample) < 50:
                errors.append("Text sample must be at least 50 characters")

            if text_sample and len(text_sample) > 10000:
                errors.append("Text sample is too long (max 10,000 characters)")

            if errors:
                return {
                    'valid': False,
                    'errors': errors
                }

            logger.info("[WebCrawlerLogic.validate_crawl_request] Request validation successful")
            return {
                'valid': True,
                'url': url,
                'text_sample': text_sample
            }

        except Exception as e:
            logger.error(f"[WebCrawlerLogic.validate_crawl_request] Validation error: {str(e)}")
            return {
                'valid': False,
                'errors': [f"Validation error: {str(e)}"]
            }
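
    # Hypothetical request payloads and the outcome validate_crawl_request would produce
    # (assuming _validate_url accepts the well-formed URL):
    #   {'url': 'https://example.com'}  -> {'valid': True, 'url': ..., 'text_sample': ''}
    #   {'text_sample': 'too short'}    -> {'valid': False, 'errors': ['Text sample must be at least 50 characters']}
    #   {}                              -> {'valid': False, 'errors': ['Either URL or text sample is required']}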

    def get_crawl_metrics(self, content: Dict[str, Any]) -> Dict[str, Any]:
        """
        Calculate metrics for crawled content.

        Args:
            content (Dict): Content to analyze

        Returns:
            Dict: Content metrics
        """
        try:
            logger.info("[WebCrawlerLogic.get_crawl_metrics] Calculating content metrics")

            main_content = content.get('main_content', '')
            title = content.get('title', '')
            description = content.get('description', '')
            headings = content.get('headings', [])
            links = content.get('links', [])
            images = content.get('images', [])

            # Calculate metrics
            words = main_content.split()
            sentences = [s.strip() for s in main_content.split('.') if s.strip()]

            metrics = {
                'word_count': len(words),
                'sentence_count': len(sentences),
                'avg_sentence_length': len(words) / max(len(sentences), 1),
                'unique_words': len(set(words)),
                'content_length': len(main_content),
                'title_length': len(title),
                'description_length': len(description),
                'heading_count': len(headings),
                'link_count': len(links),
                'image_count': len(images),
                'readability_score': self._calculate_readability(main_content),
                'content_density': len(set(words)) / max(len(words), 1)
            }

            logger.info("[WebCrawlerLogic.get_crawl_metrics] Metrics calculated successfully")
            return {
                'success': True,
                'metrics': metrics
            }

        except Exception as e:
            logger.error(f"[WebCrawlerLogic.get_crawl_metrics] Error calculating metrics: {str(e)}")
            return {
                'success': False,
                'error': str(e)
            }
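
    # Sketch of chaining the crawler output into get_crawl_metrics (illustrative):
    #   crawl = await crawler.crawl_website("https://example.com")
    #   if crawl['success']:
    #       metrics = crawler.get_crawl_metrics(crawl['content'])['metrics']
    #       density = metrics['content_density']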

    def _calculate_readability(self, text: str) -> float:
        """
        Calculate a simple readability score.

        Args:
            text (str): Text to analyze

        Returns:
            float: Readability score (0-1)
        """
        try:
            if not text:
                return 0.0

            words = text.split()
            sentences = [s.strip() for s in text.split('.') if s.strip()]

            if not sentences:
                return 0.0

            # Simple Flesch Reading Ease approximation
            avg_sentence_length = len(words) / len(sentences)
            avg_word_length = sum(len(word) for word in words) / len(words)

            # Normalize to 0-1 scale
            readability = max(0, min(1, (100 - avg_sentence_length - avg_word_length) / 100))

            return round(readability, 2)

        except Exception as e:
            logger.error(f"[WebCrawlerLogic._calculate_readability] Error: {str(e)}")
            return 0.5
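
    # Worked example of the approximation above for a hypothetical text of 10 words
    # spread over 2 sentences with an average word length of 4.6 characters:
    #   avg_sentence_length = 5.0, avg_word_length = 4.6
    #   readability = (100 - 5.0 - 4.6) / 100 = 0.904 -> rounded to 0.9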