Base code

Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions


@@ -0,0 +1,19 @@
"""Component Logic Services for ALwrity Backend.
This module contains business logic extracted from legacy Streamlit components
and converted to reusable FastAPI services.
"""
from .ai_research_logic import AIResearchLogic
from .personalization_logic import PersonalizationLogic
from .research_utilities import ResearchUtilities
from .style_detection_logic import StyleDetectionLogic
from .web_crawler_logic import WebCrawlerLogic
__all__ = [
"AIResearchLogic",
"PersonalizationLogic",
"ResearchUtilities",
"StyleDetectionLogic",
"WebCrawlerLogic"
]
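For orientation, here is a minimal sketch of how a FastAPI route might consume one of these services; the router prefix, endpoint path, and import layout are illustrative assumptions rather than part of this commit.

# Illustrative sketch only; route path and import layout are assumed, not taken from this commit.
from typing import Any, Dict
from fastapi import APIRouter
from .ai_research_logic import AIResearchLogic

router = APIRouter(prefix="/component-logic")
ai_research_logic = AIResearchLogic()

@router.post("/validate-user-info")
async def validate_user_info(user_data: Dict[str, Any]) -> Dict[str, Any]:
    # Delegate to the extracted business-logic service and return its result as JSON.
    return ai_research_logic.validate_user_info(user_data)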


@@ -0,0 +1,268 @@
"""AI Research Logic Service for ALwrity Backend.
This service handles business logic for AI research configuration and user information
validation, extracted from the legacy Streamlit component.
"""
from typing import Dict, Any, List, Optional
from loguru import logger
import re
from datetime import datetime
class AIResearchLogic:
"""Business logic for AI research configuration and user information."""
def __init__(self):
"""Initialize the AI Research Logic service."""
self.valid_roles = ["Content Creator", "Marketing Manager", "Business Owner", "Other"]
self.valid_research_depths = ["Basic", "Standard", "Deep", "Comprehensive"]
self.valid_content_types = ["Blog Posts", "Social Media", "Technical Articles", "News", "Academic Papers"]
def validate_user_info(self, user_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate user information for AI research configuration.
Args:
user_data: Dictionary containing user information
Returns:
Dict containing validation results
"""
try:
logger.info("Validating user information for AI research")
errors = []
validated_data = {}
# Validate full name
full_name = user_data.get('full_name', '').strip()
if not full_name or len(full_name) < 2:
errors.append("Full name must be at least 2 characters long")
else:
validated_data['full_name'] = full_name
# Validate email
email = user_data.get('email', '').strip().lower()
email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
if not email_pattern.match(email):
errors.append("Invalid email format")
else:
validated_data['email'] = email
# Validate company
company = user_data.get('company', '').strip()
if not company:
errors.append("Company name is required")
else:
validated_data['company'] = company
# Validate role
role = user_data.get('role', '')
if role not in self.valid_roles:
errors.append(f"Role must be one of: {', '.join(self.valid_roles)}")
else:
validated_data['role'] = role
# Determine validation result
is_valid = len(errors) == 0
if is_valid:
logger.info("User information validation successful")
validated_data['validated_at'] = datetime.now().isoformat()
else:
logger.warning(f"User information validation failed: {errors}")
return {
'valid': is_valid,
'user_info': validated_data if is_valid else None,
'errors': errors
}
except Exception as e:
logger.error(f"Error validating user information: {str(e)}")
return {
'valid': False,
'user_info': None,
'errors': [f"Validation error: {str(e)}"]
}
def configure_research_preferences(self, preferences: Dict[str, Any]) -> Dict[str, Any]:
"""
Configure research preferences for AI research.
Args:
preferences: Dictionary containing research preferences
Returns:
Dict containing configuration results
"""
try:
logger.info("Configuring research preferences")
errors = []
configured_preferences = {}
# Validate research depth
research_depth = preferences.get('research_depth', '')
if research_depth not in self.valid_research_depths:
errors.append(f"Research depth must be one of: {', '.join(self.valid_research_depths)}")
else:
configured_preferences['research_depth'] = research_depth
# Validate content types
content_types = preferences.get('content_types', [])
if not content_types:
errors.append("At least one content type must be selected")
else:
invalid_types = [ct for ct in content_types if ct not in self.valid_content_types]
if invalid_types:
errors.append(f"Invalid content types: {', '.join(invalid_types)}")
else:
configured_preferences['content_types'] = content_types
# Validate auto research setting
auto_research = preferences.get('auto_research', False)
if not isinstance(auto_research, bool):
errors.append("Auto research must be a boolean value")
else:
configured_preferences['auto_research'] = auto_research
# Determine configuration result
is_valid = len(errors) == 0
if is_valid:
logger.info("Research preferences configuration successful")
configured_preferences['configured_at'] = datetime.now().isoformat()
else:
logger.warning(f"Research preferences configuration failed: {errors}")
return {
'valid': is_valid,
'preferences': configured_preferences if is_valid else None,
'errors': errors
}
except Exception as e:
logger.error(f"Error configuring research preferences: {str(e)}")
return {
'valid': False,
'preferences': None,
'errors': [f"Configuration error: {str(e)}"]
}
def process_research_request(self, topic: str, preferences: Dict[str, Any]) -> Dict[str, Any]:
"""
Process a research request with configured preferences.
Args:
topic: The research topic
preferences: Configured research preferences
Returns:
Dict containing research processing results
"""
try:
logger.info(f"Processing research request for topic: {topic}")
# Validate topic
if not topic or len(topic.strip()) < 3:
return {
'success': False,
'topic': topic,
'error': 'Topic must be at least 3 characters long'
}
# Validate preferences
if not preferences:
return {
'success': False,
'topic': topic,
'error': 'Research preferences are required'
}
# Process research based on preferences
research_depth = preferences.get('research_depth', 'Standard')
content_types = preferences.get('content_types', [])
auto_research = preferences.get('auto_research', False)
# Simulate research processing (in a real implementation, this would call AI services)
research_results = {
'topic': topic,
'research_depth': research_depth,
'content_types': content_types,
'auto_research': auto_research,
'processed_at': datetime.now().isoformat(),
'status': 'processed'
}
logger.info(f"Research request processed successfully for topic: {topic}")
return {
'success': True,
'topic': topic,
'results': research_results
}
except Exception as e:
logger.error(f"Error processing research request: {str(e)}")
return {
'success': False,
'topic': topic,
'error': f"Processing error: {str(e)}"
}
def get_research_configuration_options(self) -> Dict[str, Any]:
"""
Get available configuration options for research.
Returns:
Dict containing all available options
"""
return {
'roles': self.valid_roles,
'research_depths': self.valid_research_depths,
'content_types': self.valid_content_types,
'auto_research_options': [True, False]
}
def validate_complete_research_setup(self, user_info: Dict[str, Any], preferences: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate complete research setup including user info and preferences.
Args:
user_info: User information dictionary
preferences: Research preferences dictionary
Returns:
Dict containing complete validation results
"""
try:
logger.info("Validating complete research setup")
# Validate user information
user_validation = self.validate_user_info(user_info)
# Validate research preferences
preferences_validation = self.configure_research_preferences(preferences)
# Combine results
all_errors = user_validation.get('errors', []) + preferences_validation.get('errors', [])
is_complete = user_validation.get('valid', False) and preferences_validation.get('valid', False)
return {
'complete': is_complete,
'user_info_valid': user_validation.get('valid', False),
'preferences_valid': preferences_validation.get('valid', False),
'errors': all_errors,
'user_info': user_validation.get('user_info'),
'preferences': preferences_validation.get('preferences')
}
except Exception as e:
logger.error(f"Error validating complete research setup: {str(e)}")
return {
'complete': False,
'user_info_valid': False,
'preferences_valid': False,
'errors': [f"Setup validation error: {str(e)}"]
}
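To make the intended call sequence concrete, here is a minimal usage sketch of AIResearchLogic; the helper name and all sample values are illustrative and not part of the committed module.

# Usage sketch with illustrative values (not part of the committed module).
def example_ai_research_flow() -> Dict[str, Any]:
    logic = AIResearchLogic()
    user_result = logic.validate_user_info({
        'full_name': 'Jane Doe',
        'email': 'jane@example.com',
        'company': 'Acme Inc',
        'role': 'Content Creator',
    })
    prefs_result = logic.configure_research_preferences({
        'research_depth': 'Standard',
        'content_types': ['Blog Posts'],
        'auto_research': True,
    })
    if user_result['valid'] and prefs_result['valid']:
        return logic.process_research_request('AI in content marketing', prefs_result['preferences'])
    return {'success': False, 'errors': user_result['errors'] + prefs_result['errors']}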


@@ -0,0 +1,337 @@
"""Personalization Logic Service for ALwrity Backend.
This service handles business logic for content personalization settings,
extracted from the legacy Streamlit component.
"""
from typing import Dict, Any, List, Optional
from loguru import logger
from datetime import datetime
class PersonalizationLogic:
"""Business logic for content personalization and brand voice configuration."""
def __init__(self):
"""Initialize the Personalization Logic service."""
self.valid_writing_styles = ["Professional", "Casual", "Technical", "Conversational", "Academic"]
self.valid_tones = ["Formal", "Semi-Formal", "Neutral", "Friendly", "Humorous"]
self.valid_content_lengths = ["Concise", "Standard", "Detailed", "Comprehensive"]
self.valid_personality_traits = ["Professional", "Innovative", "Friendly", "Trustworthy", "Creative", "Expert"]
self.valid_readability_levels = ["Simple", "Standard", "Advanced", "Expert"]
self.valid_content_structures = ["Introduction", "Key Points", "Examples", "Conclusion", "Call-to-Action"]
def validate_content_style(self, style_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate content style configuration.
Args:
style_data: Dictionary containing content style settings
Returns:
Dict containing validation results
"""
try:
logger.info("Validating content style configuration")
errors = []
validated_style = {}
# Validate writing style
writing_style = style_data.get('writing_style', '')
if writing_style not in self.valid_writing_styles:
errors.append(f"Writing style must be one of: {', '.join(self.valid_writing_styles)}")
else:
validated_style['writing_style'] = writing_style
# Validate tone
tone = style_data.get('tone', '')
if tone not in self.valid_tones:
errors.append(f"Tone must be one of: {', '.join(self.valid_tones)}")
else:
validated_style['tone'] = tone
# Validate content length
content_length = style_data.get('content_length', '')
if content_length not in self.valid_content_lengths:
errors.append(f"Content length must be one of: {', '.join(self.valid_content_lengths)}")
else:
validated_style['content_length'] = content_length
# Determine validation result
is_valid = len(errors) == 0
if is_valid:
logger.info("Content style validation successful")
validated_style['validated_at'] = datetime.now().isoformat()
else:
logger.warning(f"Content style validation failed: {errors}")
return {
'valid': is_valid,
'style_config': validated_style if is_valid else None,
'errors': errors
}
except Exception as e:
logger.error(f"Error validating content style: {str(e)}")
return {
'valid': False,
'style_config': None,
'errors': [f"Style validation error: {str(e)}"]
}
def configure_brand_voice(self, brand_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Configure brand voice settings.
Args:
brand_data: Dictionary containing brand voice settings
Returns:
Dict containing configuration results
"""
try:
logger.info("Configuring brand voice settings")
errors = []
configured_brand = {}
# Validate personality traits
personality_traits = brand_data.get('personality_traits', [])
if not personality_traits:
errors.append("At least one personality trait must be selected")
else:
invalid_traits = [trait for trait in personality_traits if trait not in self.valid_personality_traits]
if invalid_traits:
errors.append(f"Invalid personality traits: {', '.join(invalid_traits)}")
else:
configured_brand['personality_traits'] = personality_traits
# Validate voice description (optional but if provided, must be valid)
voice_description = brand_data.get('voice_description', '').strip()
if voice_description and len(voice_description) < 10:
errors.append("Voice description must be at least 10 characters long")
elif voice_description:
configured_brand['voice_description'] = voice_description
# Validate keywords (optional)
keywords = brand_data.get('keywords', '').strip()
if keywords:
configured_brand['keywords'] = keywords
# Determine configuration result
is_valid = len(errors) == 0
if is_valid:
logger.info("Brand voice configuration successful")
configured_brand['configured_at'] = datetime.now().isoformat()
else:
logger.warning(f"Brand voice configuration failed: {errors}")
return {
'valid': is_valid,
'brand_config': configured_brand if is_valid else None,
'errors': errors
}
except Exception as e:
logger.error(f"Error configuring brand voice: {str(e)}")
return {
'valid': False,
'brand_config': None,
'errors': [f"Brand configuration error: {str(e)}"]
}
def process_advanced_settings(self, settings: Dict[str, Any]) -> Dict[str, Any]:
"""
Process advanced content generation settings.
Args:
settings: Dictionary containing advanced settings
Returns:
Dict containing processing results
"""
try:
logger.info("Processing advanced content generation settings")
errors = []
processed_settings = {}
# Validate SEO optimization (boolean)
seo_optimization = settings.get('seo_optimization', False)
if not isinstance(seo_optimization, bool):
errors.append("SEO optimization must be a boolean value")
else:
processed_settings['seo_optimization'] = seo_optimization
# Validate readability level
readability_level = settings.get('readability_level', '')
if readability_level not in self.valid_readability_levels:
errors.append(f"Readability level must be one of: {', '.join(self.valid_readability_levels)}")
else:
processed_settings['readability_level'] = readability_level
# Validate content structure
content_structure = settings.get('content_structure', [])
if not content_structure:
errors.append("At least one content structure element must be selected")
else:
invalid_structures = [struct for struct in content_structure if struct not in self.valid_content_structures]
if invalid_structures:
errors.append(f"Invalid content structure elements: {', '.join(invalid_structures)}")
else:
processed_settings['content_structure'] = content_structure
# Determine processing result
is_valid = len(errors) == 0
if is_valid:
logger.info("Advanced settings processing successful")
processed_settings['processed_at'] = datetime.now().isoformat()
else:
logger.warning(f"Advanced settings processing failed: {errors}")
return {
'valid': is_valid,
'advanced_settings': processed_settings if is_valid else None,
'errors': errors
}
except Exception as e:
logger.error(f"Error processing advanced settings: {str(e)}")
return {
'valid': False,
'advanced_settings': None,
'errors': [f"Advanced settings error: {str(e)}"]
}
def process_personalization_settings(self, settings: Dict[str, Any]) -> Dict[str, Any]:
"""
Process complete personalization settings including all components.
Args:
settings: Dictionary containing complete personalization settings
Returns:
Dict containing processing results
"""
try:
logger.info("Processing complete personalization settings")
# Validate content style
content_style = settings.get('content_style', {})
style_validation = self.validate_content_style(content_style)
# Configure brand voice
brand_voice = settings.get('brand_voice', {})
brand_validation = self.configure_brand_voice(brand_voice)
# Process advanced settings
advanced_settings = settings.get('advanced_settings', {})
advanced_validation = self.process_advanced_settings(advanced_settings)
# Combine results
all_errors = (
style_validation.get('errors', []) +
brand_validation.get('errors', []) +
advanced_validation.get('errors', [])
)
is_complete = (
style_validation.get('valid', False) and
brand_validation.get('valid', False) and
advanced_validation.get('valid', False)
)
if is_complete:
# Combine all valid settings
complete_settings = {
'content_style': style_validation.get('style_config'),
'brand_voice': brand_validation.get('brand_config'),
'advanced_settings': advanced_validation.get('advanced_settings'),
'processed_at': datetime.now().isoformat()
}
logger.info("Complete personalization settings processed successfully")
return {
'valid': True,
'settings': complete_settings,
'errors': []
}
else:
logger.warning(f"Personalization settings processing failed: {all_errors}")
return {
'valid': False,
'settings': None,
'errors': all_errors
}
except Exception as e:
logger.error(f"Error processing personalization settings: {str(e)}")
return {
'valid': False,
'settings': None,
'errors': [f"Personalization processing error: {str(e)}"]
}
def get_personalization_configuration_options(self) -> Dict[str, Any]:
"""
Get available configuration options for personalization.
Returns:
Dict containing all available options
"""
return {
'writing_styles': self.valid_writing_styles,
'tones': self.valid_tones,
'content_lengths': self.valid_content_lengths,
'personality_traits': self.valid_personality_traits,
'readability_levels': self.valid_readability_levels,
'content_structures': self.valid_content_structures,
'seo_optimization_options': [True, False]
}
def generate_content_guidelines(self, settings: Dict[str, Any]) -> Dict[str, Any]:
"""
Generate content guidelines based on personalization settings.
Args:
settings: Validated personalization settings
Returns:
Dict containing content guidelines
"""
try:
logger.info("Generating content guidelines from personalization settings")
content_style = settings.get('content_style', {})
brand_voice = settings.get('brand_voice', {})
advanced_settings = settings.get('advanced_settings', {})
guidelines = {
'writing_style': content_style.get('writing_style', 'Professional'),
'tone': content_style.get('tone', 'Neutral'),
'content_length': content_style.get('content_length', 'Standard'),
'brand_personality': brand_voice.get('personality_traits', []),
'seo_optimized': advanced_settings.get('seo_optimization', False),
'readability_level': advanced_settings.get('readability_level', 'Standard'),
'required_sections': advanced_settings.get('content_structure', []),
'generated_at': datetime.now().isoformat()
}
logger.info("Content guidelines generated successfully")
return {
'success': True,
'guidelines': guidelines
}
except Exception as e:
logger.error(f"Error generating content guidelines: {str(e)}")
return {
'success': False,
'error': f"Guidelines generation error: {str(e)}"
}
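A usage sketch of the personalization flow follows; the helper name and all sample values are illustrative, not part of the committed module.

# Usage sketch with illustrative values (not part of the committed module).
def example_personalization_flow() -> Dict[str, Any]:
    personalization = PersonalizationLogic()
    settings_result = personalization.process_personalization_settings({
        'content_style': {
            'writing_style': 'Professional',
            'tone': 'Friendly',
            'content_length': 'Standard',
        },
        'brand_voice': {
            'personality_traits': ['Trustworthy', 'Expert'],
            'voice_description': 'Clear, practical, and encouraging.',
        },
        'advanced_settings': {
            'seo_optimization': True,
            'readability_level': 'Standard',
            'content_structure': ['Introduction', 'Key Points', 'Conclusion'],
        },
    })
    if settings_result['valid']:
        return personalization.generate_content_guidelines(settings_result['settings'])
    return settings_result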


@@ -0,0 +1,325 @@
"""Research Utilities Service for ALwrity Backend.
This service handles research functionality and result processing,
extracted from the legacy AI research utilities.
"""
from typing import Dict, Any, List, Optional
from loguru import logger
import asyncio
from datetime import datetime
class ResearchUtilities:
"""Business logic for research functionality and result processing."""
def __init__(self):
"""Initialize the Research Utilities service."""
self.research_providers = {
'tavily': 'TAVILY_API_KEY',
'serper': 'SERPER_API_KEY',
'metaphor': 'METAPHOR_API_KEY',
'firecrawl': 'FIRECRAWL_API_KEY'
}
async def research_topic(self, topic: str, api_keys: Dict[str, str]) -> Dict[str, Any]:
"""
Research a topic using available AI services.
Args:
topic: The topic to research
api_keys: Dictionary of API keys for different services
Returns:
Dict containing research results and metadata
"""
try:
logger.info(f"Starting research on topic: {topic}")
# Validate topic
if not topic or len(topic.strip()) < 3:
return {
'success': False,
'topic': topic,
'error': 'Topic must be at least 3 characters long'
}
# Check available API keys
available_providers = []
for provider, key_name in self.research_providers.items():
if api_keys.get(key_name):
available_providers.append(provider)
if not available_providers:
return {
'success': False,
'topic': topic,
'error': 'No research providers available. Please configure API keys.'
}
# Simulate research processing (in a real implementation, this would call actual AI services)
research_results = await self._simulate_research(topic, available_providers)
logger.info(f"Research completed successfully for topic: {topic}")
return {
'success': True,
'topic': topic,
'results': research_results,
'metadata': {
'providers_used': available_providers,
'research_timestamp': datetime.now().isoformat(),
'topic_length': len(topic)
}
}
except Exception as e:
logger.error(f"Error during research: {str(e)}")
return {
'success': False,
'topic': topic,
'error': str(e)
}
async def _simulate_research(self, topic: str, providers: List[str]) -> Dict[str, Any]:
"""
Simulate research processing for demonstration purposes.
In a real implementation, this would call actual AI research services.
Args:
topic: The research topic
providers: List of available research providers
Returns:
Dict containing simulated research results
"""
# Simulate async processing time
await asyncio.sleep(0.1)
# Generate simulated research results
results = {
'summary': f"Comprehensive research summary for '{topic}' based on multiple sources.",
'key_points': [
f"Key insight 1 about {topic}",
f"Important finding 2 related to {topic}",
f"Notable trend 3 in {topic}",
f"Critical observation 4 regarding {topic}"
],
'sources': [
f"Research source 1 for {topic}",
f"Academic paper on {topic}",
f"Industry report about {topic}",
f"Expert analysis of {topic}"
],
'trends': [
f"Emerging trend in {topic}",
f"Growing interest in {topic}",
f"Market shift related to {topic}"
],
'recommendations': [
f"Action item 1 for {topic}",
f"Strategic recommendation for {topic}",
f"Next steps regarding {topic}"
],
'providers_used': providers,
'research_depth': 'comprehensive',
'confidence_score': 0.85
}
return results
def process_research_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
"""
Process and format research results for better presentation.
Args:
results: Raw research results
Returns:
Dict containing processed and formatted results
"""
try:
logger.info("Processing research results")
if not results or 'success' not in results:
return {
'success': False,
'error': 'Invalid research results format'
}
if not results.get('success', False):
return results # Return error results as-is
# Process successful results
raw_results = results.get('results', {})
metadata = results.get('metadata', {})
# Format and structure the results
processed_results = {
'topic': results.get('topic', ''),
'summary': raw_results.get('summary', ''),
'key_insights': raw_results.get('key_points', []),
'sources': raw_results.get('sources', []),
'trends': raw_results.get('trends', []),
'recommendations': raw_results.get('recommendations', []),
'metadata': {
'providers_used': raw_results.get('providers_used', []),
'research_depth': raw_results.get('research_depth', 'standard'),
'confidence_score': raw_results.get('confidence_score', 0.0),
'processed_at': datetime.now().isoformat(),
'original_timestamp': metadata.get('research_timestamp')
}
}
logger.info("Research results processed successfully")
return {
'success': True,
'processed_results': processed_results
}
except Exception as e:
logger.error(f"Error processing research results: {str(e)}")
return {
'success': False,
'error': f"Results processing error: {str(e)}"
}
def validate_research_request(self, topic: str, api_keys: Dict[str, str]) -> Dict[str, Any]:
"""
Validate a research request before processing.
Args:
topic: The research topic
api_keys: Available API keys
Returns:
Dict containing validation results
"""
try:
logger.info(f"Validating research request for topic: {topic}")
errors = []
warnings = []
# Validate topic
if not topic or len(topic.strip()) < 3:
errors.append("Topic must be at least 3 characters long")
elif len(topic.strip()) > 500:
errors.append("Topic is too long (maximum 500 characters)")
# Check API keys
available_providers = []
for provider, key_name in self.research_providers.items():
if api_keys.get(key_name):
available_providers.append(provider)
else:
warnings.append(f"No API key for {provider}")
if not available_providers:
errors.append("No research providers available. Please configure at least one API key.")
# Determine validation result
is_valid = len(errors) == 0
return {
'valid': is_valid,
'errors': errors,
'warnings': warnings,
'available_providers': available_providers,
'topic_length': len(topic.strip()) if topic else 0
}
except Exception as e:
logger.error(f"Error validating research request: {str(e)}")
return {
'valid': False,
'errors': [f"Validation error: {str(e)}"],
'warnings': [],
'available_providers': [],
'topic_length': 0
}
def get_research_providers_info(self) -> Dict[str, Any]:
"""
Get information about available research providers.
Returns:
Dict containing provider information
"""
return {
'providers': {
'tavily': {
'name': 'Tavily',
'description': 'Intelligent web research',
'api_key_name': 'TAVILY_API_KEY',
'url': 'https://tavily.com/#api'
},
'serper': {
'name': 'Serper',
'description': 'Google search functionality',
'api_key_name': 'SERPER_API_KEY',
'url': 'https://serper.dev/signup'
},
'metaphor': {
'name': 'Metaphor',
'description': 'Advanced web search',
'api_key_name': 'METAPHOR_API_KEY',
'url': 'https://dashboard.exa.ai/login'
},
'firecrawl': {
'name': 'Firecrawl',
'description': 'Web content extraction',
'api_key_name': 'FIRECRAWL_API_KEY',
'url': 'https://www.firecrawl.dev/account'
}
},
'total_providers': len(self.research_providers)
}
def generate_research_report(self, results: Dict[str, Any]) -> Dict[str, Any]:
"""
Generate a formatted research report from processed results.
Args:
results: Processed research results
Returns:
Dict containing formatted research report
"""
try:
logger.info("Generating research report")
if not results.get('success', False):
return {
'success': False,
'error': 'Cannot generate report from failed research'
}
processed_results = results.get('processed_results', {})
# Generate formatted report
report = {
'title': f"Research Report: {processed_results.get('topic', 'Unknown Topic')}",
'executive_summary': processed_results.get('summary', ''),
'key_findings': processed_results.get('key_insights', []),
'trends_analysis': processed_results.get('trends', []),
'recommendations': processed_results.get('recommendations', []),
'sources': processed_results.get('sources', []),
'metadata': processed_results.get('metadata', {}),
'generated_at': datetime.now().isoformat(),
'report_format': 'structured'
}
logger.info("Research report generated successfully")
return {
'success': True,
'report': report
}
except Exception as e:
logger.error(f"Error generating research report: {str(e)}")
return {
'success': False,
'error': f"Report generation error: {str(e)}"
}
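Because research_topic is a coroutine, callers are expected to await it; below is a hedged end-to-end sketch assuming a single placeholder Tavily key (the helper name and key value are illustrative).

# Usage sketch (placeholder API key; not part of the committed module).
async def example_research_flow() -> Dict[str, Any]:
    utilities = ResearchUtilities()
    api_keys = {'TAVILY_API_KEY': 'placeholder-key'}
    validation = utilities.validate_research_request('Generative AI trends', api_keys)
    if not validation['valid']:
        return {'success': False, 'errors': validation['errors']}
    raw_results = await utilities.research_topic('Generative AI trends', api_keys)
    processed = utilities.process_research_results(raw_results)
    return utilities.generate_research_report(processed)

# asyncio.run(example_research_flow())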


@@ -0,0 +1,424 @@
"""Style Detection Logic Service for ALwrity Backend.
This service handles business logic for content style detection and analysis,
migrated from the legacy StyleAnalyzer functionality.
"""
from typing import Dict, Any, List, Optional
from loguru import logger
from datetime import datetime
import json
import re
import sys
import os
# Add the backend directory to Python path for absolute imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
# Import the new backend LLM providers from services
from ..llm_providers.main_text_generation import llm_text_gen
class StyleDetectionLogic:
"""Business logic for content style detection and analysis."""
def __init__(self):
"""Initialize the Style Detection Logic service."""
logger.info("[StyleDetectionLogic.__init__] Initializing style detection service")
def _clean_json_response(self, text: str) -> str:
"""
Clean the LLM response to extract valid JSON.
Args:
text (str): Raw response from LLM
Returns:
str: Cleaned JSON string
"""
try:
# Remove markdown code block markers
cleaned_string = text.replace("```json", "").replace("```", "").strip()
# Log the cleaned JSON for debugging
logger.debug(f"[StyleDetectionLogic._clean_json_response] Cleaned JSON: {cleaned_string}")
return cleaned_string
except Exception as e:
logger.error(f"[StyleDetectionLogic._clean_json_response] Error cleaning response: {str(e)}")
return ""
def analyze_content_style(self, content: Dict[str, Any]) -> Dict[str, Any]:
"""
Analyze the style of the provided content using AI with enhanced prompts.
Args:
content (Dict): Content to analyze, containing main_content, title, etc.
Returns:
Dict: Analysis results with writing style, characteristics, and recommendations
"""
try:
logger.info("[StyleDetectionLogic.analyze_content_style] Starting enhanced style analysis")
# Extract content components
title = content.get('title', '')
description = content.get('description', '')
main_content = content.get('main_content', '')
headings = content.get('headings', [])
domain_info = content.get('domain_info', {})
brand_info = content.get('brand_info', {})
social_media = content.get('social_media', {})
content_structure = content.get('content_structure', {})
# Construct the enhanced analysis prompt (strict JSON, minified, stable keys)
prompt = f"""Analyze the following website content for comprehensive writing style, tone, and characteristics for personalization and AI generation.
RULES:
- Return ONE single-line MINIFIED JSON object only. No markdown, code fences, comments, or prose.
- Use EXACTLY the keys and ordering from the schema below. Do not add extra top-level keys.
- For unknown/unavailable fields use empty string "" or empty array [] and explain in meta.uncertainty.
- Keep text concise; avoid repeating input text.
- Assume token budget; consider only first 5000 chars of main_content and first 10 headings.
WEBSITE INFORMATION:
- Domain: {domain_info.get('domain_name', 'Unknown')}
- Website Type: {self._determine_website_type(domain_info)}
- Brand Name: {brand_info.get('company_name', 'Not specified')}
- Tagline: {brand_info.get('tagline', 'Not specified')}
- Social Media Presence: {', '.join(social_media.keys()) if social_media else 'None detected'}
CONTENT STRUCTURE:
- Headings: {len(headings)} total ({content_structure.get('headings', {}).get('h1', 0)} H1, {content_structure.get('headings', {}).get('h2', 0)} H2)
- Paragraphs: {content_structure.get('paragraphs', 0)}
- Images: {content_structure.get('images', 0)}
- Links: {content_structure.get('links', 0)}
- Has Navigation: {content_structure.get('has_navigation', False)}
- Has Call-to-Action: {content_structure.get('has_call_to_action', False)}
CONTENT TO ANALYZE:
- Title: {title}
- Description: {description}
- Main Content (truncated): {main_content[:5000]}
- Key Headings (first 10): {headings[:10]}
ANALYSIS REQUIREMENTS:
1. Analyze the writing style, tone, and voice characteristics
2. Identify target audience demographics and expertise level
3. Determine content type and purpose
4. Assess content structure and organization patterns
5. Evaluate brand voice consistency and personality
6. Identify unique style elements and patterns
7. Consider the website type and industry context
8. Analyze social media presence impact on content style
REQUIRED JSON SCHEMA (stable key order):
{{
"writing_style": {{
"tone": "", "voice": "", "complexity": "", "engagement_level": "",
"brand_personality": "", "formality_level": "", "emotional_appeal": ""
}},
"content_characteristics": {{
"sentence_structure": "", "vocabulary_level": "", "paragraph_organization": "",
"content_flow": "", "readability_score": "", "content_density": "",
"visual_elements_usage": ""
}},
"target_audience": {{
"demographics": [], "expertise_level": "", "industry_focus": "", "geographic_focus": "",
"psychographic_profile": "", "pain_points": [], "motivations": []
}},
"content_type": {{
"primary_type": "", "secondary_types": [], "purpose": "", "call_to_action": "",
"conversion_focus": "", "educational_value": ""
}},
"brand_analysis": {{
"brand_voice": "", "brand_values": [], "brand_positioning": "", "competitive_differentiation": "",
"trust_signals": [], "authority_indicators": []
}},
"content_strategy_insights": {{
"strengths": [], "weaknesses": [], "opportunities": [], "threats": [],
"recommended_improvements": [], "content_gaps": []
}},
"recommended_settings": {{
"writing_tone": "", "target_audience": "", "content_type": "", "creativity_level": "",
"geographic_location": "", "industry_context": "", "brand_alignment": ""
}},
"meta": {{"schema_version": "1.1", "confidence": 0.0, "notes": "", "uncertainty": {{"fields": []}}}}
}}
"""
# Call the LLM for analysis
logger.debug("[StyleDetectionLogic.analyze_content_style] Sending enhanced prompt to LLM")
analysis_text = llm_text_gen(prompt)
# Clean and parse the response
cleaned_json = self._clean_json_response(analysis_text)
try:
analysis_results = json.loads(cleaned_json)
logger.info("[StyleDetectionLogic.analyze_content_style] Successfully parsed enhanced analysis results")
return {
'success': True,
'analysis': analysis_results
}
except json.JSONDecodeError as e:
logger.error(f"[StyleDetectionLogic.analyze_content_style] Failed to parse JSON response: {e}")
logger.debug(f"[StyleDetectionLogic.analyze_content_style] Raw response: {analysis_text}")
return {
'success': False,
'error': 'Failed to parse analysis response'
}
except Exception as e:
logger.error(f"[StyleDetectionLogic.analyze_content_style] Error in enhanced analysis: {str(e)}")
return {
'success': False,
'error': str(e)
}
def _determine_website_type(self, domain_info: Dict[str, Any]) -> str:
"""Determine the type of website based on domain and content analysis."""
if domain_info.get('is_blog'):
return 'Blog/Content Platform'
elif domain_info.get('is_ecommerce'):
return 'E-commerce/Online Store'
elif domain_info.get('is_corporate'):
return 'Corporate/Business Website'
elif domain_info.get('has_blog_section'):
return 'Business with Blog'
elif domain_info.get('has_about_page') and domain_info.get('has_contact_page'):
return 'Professional Services'
else:
return 'General Website'
def _get_fallback_analysis(self, content: Dict[str, Any]) -> Dict[str, Any]:
"""Get fallback analysis when LLM analysis fails."""
main_content = content.get("main_content", "")
title = content.get("title", "")
# Simple content analysis based on content characteristics
content_length = len(main_content)
word_count = len(main_content.split())
# Determine tone based on content characteristics
if any(word in main_content.lower() for word in ['professional', 'business', 'industry', 'company']):
tone = "professional"
elif any(word in main_content.lower() for word in ['casual', 'fun', 'enjoy', 'exciting']):
tone = "casual"
else:
tone = "neutral"
# Determine complexity based on sentence length and vocabulary
avg_sentence_length = word_count / max(len([s for s in main_content.split('.') if s.strip()]), 1)
if avg_sentence_length > 20:
complexity = "complex"
elif avg_sentence_length > 15:
complexity = "moderate"
else:
complexity = "simple"
return {
"writing_style": {
"tone": tone,
"voice": "active",
"complexity": complexity,
"engagement_level": "medium"
},
"content_characteristics": {
"sentence_structure": "standard",
"vocabulary_level": "intermediate",
"paragraph_organization": "logical",
"content_flow": "smooth"
},
"target_audience": {
"demographics": ["general audience"],
"expertise_level": "intermediate",
"industry_focus": "general",
"geographic_focus": "global"
},
"content_type": {
"primary_type": "article",
"secondary_types": ["blog", "content"],
"purpose": "inform",
"call_to_action": "minimal"
},
"recommended_settings": {
"writing_tone": tone,
"target_audience": "general audience",
"content_type": "article",
"creativity_level": "medium",
"geographic_location": "global"
}
}
def analyze_style_patterns(self, content: Dict[str, Any]) -> Dict[str, Any]:
"""
Analyze recurring patterns in the content style.
Args:
content (Dict): Content to analyze
Returns:
Dict: Pattern analysis results
"""
try:
logger.info("[StyleDetectionLogic.analyze_style_patterns] Starting pattern analysis")
main_content = content.get("main_content", "")
prompt = f"""Analyze the content for recurring writing patterns and style characteristics.
RULES:
- Return ONE single-line MINIFIED JSON object only. No markdown, code fences, comments, or prose.
- Use EXACTLY the keys and ordering from the schema below. No extra top-level keys.
- If uncertain, set empty values and list field names in meta.uncertainty.fields.
- Keep responses concise and avoid quoting long input spans.
Content (truncated to 3000 chars): {main_content[:3000]}
REQUIRED JSON SCHEMA (stable key order):
{{
"patterns": {{
"sentence_length": "", "vocabulary_patterns": [], "rhetorical_devices": [],
"paragraph_structure": "", "transition_phrases": []
}},
"style_consistency": "",
"unique_elements": [],
"meta": {{"schema_version": "1.1", "confidence": 0.0, "notes": "", "uncertainty": {{"fields": []}}}}
}}
"""
analysis_text = llm_text_gen(prompt)
cleaned_json = self._clean_json_response(analysis_text)
try:
pattern_results = json.loads(cleaned_json)
return {
'success': True,
'patterns': pattern_results
}
except json.JSONDecodeError as e:
logger.error(f"[StyleDetectionLogic.analyze_style_patterns] Failed to parse JSON response: {e}")
return {
'success': False,
'error': 'Failed to parse pattern analysis response'
}
except Exception as e:
logger.error(f"[StyleDetectionLogic.analyze_style_patterns] Error during analysis: {str(e)}")
return {
'success': False,
'error': str(e)
}
def generate_style_guidelines(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
"""
Generate comprehensive content guidelines based on enhanced style analysis.
Args:
analysis_results (Dict): Results from enhanced style analysis
Returns:
Dict: Generated comprehensive guidelines
"""
try:
logger.info("[StyleDetectionLogic.generate_style_guidelines] Generating comprehensive style guidelines")
# Extract key information from analysis
writing_style = analysis_results.get('writing_style', {})
content_characteristics = analysis_results.get('content_characteristics', {})
target_audience = analysis_results.get('target_audience', {})
brand_analysis = analysis_results.get('brand_analysis', {})
content_strategy_insights = analysis_results.get('content_strategy_insights', {})
prompt = f"""Generate actionable content creation guidelines based on the style analysis.
ANALYSIS DATA:
Writing Style: {writing_style}
Content Characteristics: {content_characteristics}
Target Audience: {target_audience}
Brand Analysis: {brand_analysis}
Content Strategy Insights: {content_strategy_insights}
REQUIREMENTS:
- Return ONE single-line MINIFIED JSON object only. No markdown, code fences, comments, or prose.
- Use EXACTLY the keys and ordering from the schema below. No extra top-level keys.
- Provide concise, implementation-ready bullets with an example for key items (e.g., tone and CTA examples).
- Include negative guidance (what to avoid) tied to brand constraints where applicable.
- If uncertain, set empty values and list field names in meta.uncertainty.fields.
IMPORTANT: REQUIRED JSON SCHEMA (stable key order):
{{
"guidelines": {{
"tone_recommendations": [],
"structure_guidelines": [],
"vocabulary_suggestions": [],
"engagement_tips": [],
"audience_considerations": [],
"brand_alignment": [],
"seo_optimization": [],
"conversion_optimization": []
}},
"best_practices": [],
"avoid_elements": [],
"content_strategy": "",
"ai_generation_tips": [],
"competitive_advantages": [],
"content_calendar_suggestions": [],
"meta": {{"schema_version": "1.1", "confidence": 0.0, "notes": "", "uncertainty": {{"fields": []}}}}
}}
"""
guidelines_text = llm_text_gen(prompt)
cleaned_json = self._clean_json_response(guidelines_text)
try:
guidelines = json.loads(cleaned_json)
return {
'success': True,
'guidelines': guidelines
}
except json.JSONDecodeError as e:
logger.error(f"[StyleDetectionLogic.generate_style_guidelines] Failed to parse JSON response: {e}")
return {
'success': False,
'error': 'Failed to parse guidelines response'
}
except Exception as e:
logger.error(f"[StyleDetectionLogic.generate_style_guidelines] Error generating guidelines: {str(e)}")
return {
'success': False,
'error': str(e)
}
def validate_style_analysis_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate style analysis request data.
Args:
request_data (Dict): Request data to validate
Returns:
Dict: Validation results
"""
errors = []
# Check if content is provided
if not request_data.get('content') and not request_data.get('url') and not request_data.get('text_sample'):
errors.append("Content is required for style analysis")
# Check content length only when inline content is provided (URL and text-sample requests are validated by the crawler)
content = request_data.get('content', {})
main_content = content.get('main_content', '')
if content and len(main_content) < 50:
errors.append("Content must be at least 50 characters long for meaningful analysis")
# Check for required fields when inline content is provided
if content and not content.get('title') and not content.get('main_content'):
errors.append("Either title or main content must be provided")
return {
'valid': len(errors) == 0,
'errors': errors
}
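A usage sketch of the style detection flow follows; note that analyze_content_style and generate_style_guidelines call llm_text_gen, so running this requires a configured LLM provider. The helper name and sample content are illustrative, not part of the committed module.

# Usage sketch with illustrative content (not part of the committed module).
def example_style_detection_flow() -> Dict[str, Any]:
    detector = StyleDetectionLogic()
    sample_content = {
        'title': 'Example article',
        'description': 'A short description of the page.',
        'main_content': 'Placeholder body text of at least fifty characters for a meaningful analysis.',
        'headings': ['Introduction', 'Key Points'],
        'domain_info': {},
        'brand_info': {},
        'social_media': {},
        'content_structure': {},
    }
    validation = detector.validate_style_analysis_request({'content': sample_content})
    if not validation['valid']:
        return {'success': False, 'errors': validation['errors']}
    analysis = detector.analyze_content_style(sample_content)
    if not analysis['success']:
        return analysis
    return detector.generate_style_guidelines(analysis['analysis'])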


@@ -0,0 +1,584 @@
"""Web Crawler Logic Service for ALwrity Backend.
This service handles business logic for web crawling and content extraction,
migrated from the legacy web crawler functionality.
"""
from typing import Dict, Any, List, Optional
from loguru import logger
from datetime import datetime
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import requests
import re
class WebCrawlerLogic:
"""Business logic for web crawling and content extraction."""
def __init__(self):
"""Initialize the Web Crawler Logic service."""
logger.info("[WebCrawlerLogic.__init__] Initializing web crawler service")
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
self.timeout = 30
self.max_content_length = 10000
def _validate_url(self, url: str) -> bool:
"""
Validate URL format and fix common formatting issues.
Args:
url (str): URL to validate
Returns:
bool: True if URL is valid
"""
try:
# Clean and fix common URL issues
cleaned_url = self._fix_url_format(url)
result = urlparse(cleaned_url)
# Check if we have both scheme and netloc
if not all([result.scheme, result.netloc]):
return False
# Additional validation for domain format
domain = result.netloc
if '.' not in domain or len(domain.split('.')[-1]) < 2:
return False
return True
except Exception as e:
logger.error(f"[WebCrawlerLogic._validate_url] URL validation error: {str(e)}")
return False
def _fix_url_format(self, url: str) -> str:
"""
Fix common URL formatting issues.
Args:
url (str): URL to fix
Returns:
str: Fixed URL
"""
# Remove leading/trailing whitespace
url = url.strip()
# Check if URL already has a protocol but is missing slashes
if url.startswith('https:/') and not url.startswith('https://'):
url = url.replace('https:/', 'https://')
elif url.startswith('http:/') and not url.startswith('http://'):
url = url.replace('http:/', 'http://')
# Add protocol if missing
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
# Collapse any accidental triple slash after the protocol so exactly two remain
if ':///' in url:
url = url.replace(':///', '://')
logger.debug(f"[WebCrawlerLogic._fix_url_format] Fixed URL: {url}")
return url
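# Illustrative behavior of _fix_url_format (hedged examples, derived from the rules above):
#   'example.com'         -> 'https://example.com'   (protocol added)
#   'https:/example.com'  -> 'https://example.com'   (missing slash restored)
#   '  https://a.com  '   -> 'https://a.com'         (surrounding whitespace stripped)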
async def crawl_website(self, url: str) -> Dict[str, Any]:
"""
Crawl a website and extract its content asynchronously with enhanced data extraction.
Args:
url (str): The URL to crawl
Returns:
Dict: Extracted website content and metadata
"""
try:
logger.info(f"[WebCrawlerLogic.crawl_website] Starting enhanced crawl for URL: {url}")
# Fix URL format first
fixed_url = self._fix_url_format(url)
logger.info(f"[WebCrawlerLogic.crawl_website] Fixed URL: {fixed_url}")
# Validate URL
if not self._validate_url(fixed_url):
error_msg = f"Invalid URL format: {url}"
logger.error(f"[WebCrawlerLogic.crawl_website] {error_msg}")
return {
'success': False,
'error': error_msg
}
# Fetch the page content
try:
async with aiohttp.ClientSession(headers=self.headers, timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
async with session.get(fixed_url) as response:
if response.status == 200:
html_content = await response.text()
logger.debug("[WebCrawlerLogic.crawl_website] Successfully fetched HTML content")
else:
error_msg = f"Failed to fetch content: Status code {response.status}"
logger.error(f"[WebCrawlerLogic.crawl_website] {error_msg}")
return {
'success': False,
'error': error_msg
}
except Exception as e:
error_msg = f"Failed to fetch content from {fixed_url}: {str(e)}"
logger.error(f"[WebCrawlerLogic.crawl_website] {error_msg}")
return {
'success': False,
'error': error_msg
}
# Parse HTML with BeautifulSoup
logger.debug("[WebCrawlerLogic.crawl_website] Parsing HTML content")
soup = BeautifulSoup(html_content, 'html.parser')
# Extract domain information
domain_info = self._extract_domain_info(fixed_url, soup)
# Extract enhanced main content
main_content = self._extract_enhanced_content(soup)
# Extract social media and brand information
social_media = self._extract_social_media(soup)
brand_info = self._extract_brand_information(soup)
# Extract content structure and patterns
content_structure = self._extract_content_structure(soup)
# Extract content
content = {
'title': soup.title.string.strip() if soup.title else '',
'description': soup.find('meta', {'name': 'description'}).get('content', '').strip() if soup.find('meta', {'name': 'description'}) else '',
'main_content': main_content,
'headings': [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])],
'links': [{'text': a.get_text(strip=True), 'href': urljoin(fixed_url, a.get('href', ''))} for a in soup.find_all('a', href=True)],
'images': [{'alt': img.get('alt', '').strip(), 'src': urljoin(fixed_url, img.get('src', ''))} for img in soup.find_all('img', src=True)],
'meta_tags': {
meta.get('name', meta.get('property', '')): meta.get('content', '').strip()
for meta in soup.find_all('meta')
if (meta.get('name') or meta.get('property')) and meta.get('content')
},
'domain_info': domain_info,
'social_media': social_media,
'brand_info': brand_info,
'content_structure': content_structure
}
logger.debug(f"[WebCrawlerLogic.crawl_website] Extracted {len(content['links'])} links, {len(content['images'])} images, and {len(social_media)} social media links")
logger.info("[WebCrawlerLogic.crawl_website] Successfully completed enhanced website crawl")
return {
'success': True,
'content': content,
'url': fixed_url,
'timestamp': datetime.now().isoformat()
}
except Exception as e:
error_msg = f"Error crawling {url}: {str(e)}"
logger.error(f"[WebCrawlerLogic.crawl_website] {error_msg}")
return {
'success': False,
'error': str(e)
}
def _extract_domain_info(self, url: str, soup: BeautifulSoup) -> Dict[str, Any]:
"""Extract domain-specific information."""
try:
domain = urlparse(url).netloc
return {
'domain': domain,
'domain_name': domain.replace('www.', ''),
'is_blog': any(keyword in domain.lower() for keyword in ['blog', 'medium', 'substack', 'wordpress']),
'is_ecommerce': any(keyword in domain.lower() for keyword in ['shop', 'store', 'cart', 'buy', 'amazon', 'ebay']),
'is_corporate': any(keyword in domain.lower() for keyword in ['corp', 'inc', 'llc', 'company', 'business']),
'has_blog_section': bool(soup.find('a', href=re.compile(r'blog|news|articles', re.I))),
'has_about_page': bool(soup.find('a', href=re.compile(r'about|company|team', re.I))),
'has_contact_page': bool(soup.find('a', href=re.compile(r'contact|support|help', re.I)))
}
except Exception as e:
logger.error(f"[WebCrawlerLogic._extract_domain_info] Error: {str(e)}")
return {}
def _extract_enhanced_content(self, soup: BeautifulSoup) -> str:
"""Extract enhanced main content with better structure detection."""
try:
# Try to find main content areas
main_content_elements = []
# Look for semantic content containers
semantic_selectors = [
'article', 'main', '[role="main"]',
'.content', '.main-content', '.article', '.post',
'.entry', '.page-content', '.site-content'
]
for selector in semantic_selectors:
elements = soup.select(selector)
if elements:
main_content_elements.extend(elements)
break
# If no semantic containers found, look for content-rich divs
if not main_content_elements:
content_divs = soup.find_all('div', class_=re.compile(r'content|main|article|post|entry', re.I))
main_content_elements = content_divs
# If still no content, get all paragraph text
if not main_content_elements:
main_content_elements = soup.find_all(['p', 'article', 'section'])
# Extract text with better formatting
content_parts = []
for elem in main_content_elements:
text = elem.get_text(separator=' ', strip=True)
if text and len(text) > 20: # Only include substantial text
content_parts.append(text)
main_content = ' '.join(content_parts)
# Limit content length
if len(main_content) > self.max_content_length:
main_content = main_content[:self.max_content_length] + "..."
return main_content
except Exception as e:
logger.error(f"[WebCrawlerLogic._extract_enhanced_content] Error: {str(e)}")
return ''
def _extract_social_media(self, soup: BeautifulSoup) -> Dict[str, str]:
"""Extract social media links and handles."""
social_media = {}
try:
# Common social media patterns
social_patterns = {
'facebook': r'facebook\.com|fb\.com',
'twitter': r'twitter\.com|x\.com',
'linkedin': r'linkedin\.com',
'instagram': r'instagram\.com',
'youtube': r'youtube\.com|youtu\.be',
'tiktok': r'tiktok\.com',
'pinterest': r'pinterest\.com',
'github': r'github\.com'
}
# Find all links
links = soup.find_all('a', href=True)
for link in links:
href = link.get('href', '').lower()
for platform, pattern in social_patterns.items():
if re.search(pattern, href):
social_media[platform] = href
break
# Also check for social media meta tags
meta_social = {
'og:site_name': 'site_name',
'twitter:site': 'twitter',
'twitter:creator': 'twitter_creator'
}
for meta in soup.find_all('meta', property=True):
prop = meta.get('property', '')
if prop in meta_social:
social_media[meta_social[prop]] = meta.get('content', '')
return social_media
except Exception as e:
logger.error(f"[WebCrawlerLogic._extract_social_media] Error: {str(e)}")
return {}
def _extract_brand_information(self, soup: BeautifulSoup) -> Dict[str, Any]:
"""Extract brand and company information."""
brand_info = {}
try:
# Extract logo information
logos = soup.find_all('img', alt=re.compile(r'logo|brand', re.I))
if logos:
brand_info['logo_alt'] = [logo.get('alt', '') for logo in logos]
# Extract company name from various sources
company_name_selectors = [
'h1', '.logo', '.brand', '.company-name',
'[class*="logo"]', '[class*="brand"]'
]
for selector in company_name_selectors:
elements = soup.select(selector)
if elements:
brand_info['company_name'] = elements[0].get_text(strip=True)
break
# Extract taglines and slogans
tagline_selectors = [
'.tagline', '.slogan', '.motto',
'[class*="tagline"]', '[class*="slogan"]'
]
for selector in tagline_selectors:
elements = soup.select(selector)
if elements:
brand_info['tagline'] = elements[0].get_text(strip=True)
break
# Extract contact information
contact_info = {}
contact_patterns = {
'email': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
'phone': r'[\+]?[1-9][\d]{0,15}',
'address': r'\d+\s+[a-zA-Z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd)'
}
for info_type, pattern in contact_patterns.items():
matches = re.findall(pattern, soup.get_text())
if matches:
contact_info[info_type] = matches[:3] # Limit to first 3 matches
brand_info['contact_info'] = contact_info
return brand_info
except Exception as e:
logger.error(f"[WebCrawlerLogic._extract_brand_information] Error: {str(e)}")
return {}
def _extract_content_structure(self, soup: BeautifulSoup) -> Dict[str, Any]:
"""Extract content structure and patterns."""
structure = {}
try:
# Count different content types
structure['headings'] = {
'h1': len(soup.find_all('h1')),
'h2': len(soup.find_all('h2')),
'h3': len(soup.find_all('h3')),
'h4': len(soup.find_all('h4')),
'h5': len(soup.find_all('h5')),
'h6': len(soup.find_all('h6'))
}
structure['paragraphs'] = len(soup.find_all('p'))
structure['lists'] = len(soup.find_all(['ul', 'ol']))
structure['images'] = len(soup.find_all('img'))
structure['links'] = len(soup.find_all('a'))
# Analyze content sections
sections = soup.find_all(['section', 'article', 'div'], class_=re.compile(r'section|article|content', re.I))
structure['content_sections'] = len(sections)
# Check for common content patterns
structure['has_navigation'] = bool(soup.find(['nav', 'header']))
structure['has_footer'] = bool(soup.find('footer'))
structure['has_sidebar'] = bool(soup.find(class_=re.compile(r'sidebar|aside', re.I)))
structure['has_call_to_action'] = bool(soup.find(text=re.compile(r'click|buy|sign|register|subscribe', re.I)))
return structure
except Exception as e:
logger.error(f"[WebCrawlerLogic._extract_content_structure] Error: {str(e)}")
return {}
def extract_content_from_text(self, text: str) -> Dict[str, Any]:
"""
Extract content from provided text sample.
Args:
text (str): Text content to process
Returns:
Dict: Processed content with metadata
"""
try:
logger.info("[WebCrawlerLogic.extract_content_from_text] Processing text content")
# Clean and process text
cleaned_text = re.sub(r'\s+', ' ', text.strip())
# Split into sentences for analysis
sentences = [s.strip() for s in cleaned_text.split('.') if s.strip()]
# Extract basic metrics
words = cleaned_text.split()
word_count = len(words)
sentence_count = len(sentences)
avg_sentence_length = word_count / max(sentence_count, 1)
content = {
'title': 'Text Sample',
'description': 'Content provided as text sample',
'main_content': cleaned_text,
'headings': [],
'links': [],
'images': [],
'meta_tags': {},
'metrics': {
'word_count': word_count,
'sentence_count': sentence_count,
'avg_sentence_length': avg_sentence_length,
'unique_words': len(set(words)),
'content_length': len(cleaned_text)
}
}
logger.info("[WebCrawlerLogic.extract_content_from_text] Successfully processed text content")
return {
'success': True,
'content': content,
'timestamp': datetime.now().isoformat()
}
except Exception as e:
error_msg = f"Error processing text content: {str(e)}"
logger.error(f"[WebCrawlerLogic.extract_content_from_text] {error_msg}")
return {
'success': False,
'error': error_msg
}
def validate_crawl_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate web crawl request data.
Args:
request_data (Dict): Request data to validate
Returns:
Dict: Validation results
"""
try:
logger.info("[WebCrawlerLogic.validate_crawl_request] Validating request")
errors = []
# Check for required fields
url = request_data.get('url', '')
text_sample = request_data.get('text_sample', '')
if not url and not text_sample:
errors.append("Either URL or text sample is required")
if url and not self._validate_url(url):
errors.append("Invalid URL format")
if text_sample and len(text_sample) < 50:
errors.append("Text sample must be at least 50 characters")
if text_sample and len(text_sample) > 10000:
errors.append("Text sample is too long (max 10,000 characters)")
if errors:
return {
'valid': False,
'errors': errors
}
logger.info("[WebCrawlerLogic.validate_crawl_request] Request validation successful")
return {
'valid': True,
'url': url,
'text_sample': text_sample
}
except Exception as e:
logger.error(f"[WebCrawlerLogic.validate_crawl_request] Validation error: {str(e)}")
return {
'valid': False,
'errors': [f"Validation error: {str(e)}"]
}
def get_crawl_metrics(self, content: Dict[str, Any]) -> Dict[str, Any]:
"""
Calculate metrics for crawled content.
Args:
content (Dict): Content to analyze
Returns:
Dict: Content metrics
"""
try:
logger.info("[WebCrawlerLogic.get_crawl_metrics] Calculating content metrics")
main_content = content.get('main_content', '')
title = content.get('title', '')
description = content.get('description', '')
headings = content.get('headings', [])
links = content.get('links', [])
images = content.get('images', [])
# Calculate metrics
words = main_content.split()
sentences = [s.strip() for s in main_content.split('.') if s.strip()]
metrics = {
'word_count': len(words),
'sentence_count': len(sentences),
'avg_sentence_length': len(words) / max(len(sentences), 1),
'unique_words': len(set(words)),
'content_length': len(main_content),
'title_length': len(title),
'description_length': len(description),
'heading_count': len(headings),
'link_count': len(links),
'image_count': len(images),
'readability_score': self._calculate_readability(main_content),
'content_density': len(set(words)) / max(len(words), 1)
}
logger.info("[WebCrawlerLogic.get_crawl_metrics] Metrics calculated successfully")
return {
'success': True,
'metrics': metrics
}
except Exception as e:
logger.error(f"[WebCrawlerLogic.get_crawl_metrics] Error calculating metrics: {str(e)}")
return {
'success': False,
'error': str(e)
}
def _calculate_readability(self, text: str) -> float:
"""
Calculate a simple readability score.
Args:
text (str): Text to analyze
Returns:
float: Readability score (0-1)
"""
try:
if not text:
return 0.0
words = text.split()
sentences = [s.strip() for s in text.split('.') if s.strip()]
if not sentences:
return 0.0
# Simple Flesch Reading Ease approximation
avg_sentence_length = len(words) / len(sentences)
avg_word_length = sum(len(word) for word in words) / len(words)
# Normalize to 0-1 scale
readability = max(0, min(1, (100 - avg_sentence_length - avg_word_length) / 100))
return round(readability, 2)
except Exception as e:
logger.error(f"[WebCrawlerLogic._calculate_readability] Error: {str(e)}")
return 0.5
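A usage sketch of the crawler follows (illustrative URL; crawling requires outbound network access), together with a worked example of the readability approximation above. The helper name is illustrative, not part of the committed module.

# Usage sketch (illustrative URL; not part of the committed module).
async def example_crawl_flow() -> Dict[str, Any]:
    crawler = WebCrawlerLogic()
    request = crawler.validate_crawl_request({'url': 'https://example.com'})
    if not request['valid']:
        return {'success': False, 'errors': request['errors']}
    crawl_result = await crawler.crawl_website(request['url'])
    if crawl_result['success']:
        return crawler.get_crawl_metrics(crawl_result['content'])
    return crawl_result

# asyncio.run(example_crawl_flow())
#
# Worked example of the readability approximation: for an average sentence length of
# 15 words and an average word length of 5 characters, (100 - 15 - 5) / 100 = 0.80,
# which is then clamped to the 0-1 range and rounded to two decimals.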