95 lines
3.2 KiB
Python
95 lines
3.2 KiB
Python
"""Web crawler for ALwrity style analysis."""
|
|
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler
|
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
|
from loguru import logger
|
|
|
|
def _text_metrics(text: str) -> dict:
    """Compute basic style metrics (word/sentence counts, average length) for text."""
    words = text.split()
    # Drop empty fragments so a trailing "." does not count as an extra sentence.
    sentences = [part.strip() for part in text.split('.') if part.strip()]
    return {
        "word_count": len(words),
        "sentence_count": len(sentences),
        # max(..., 1) avoids division by zero for text without any sentence.
        "avg_sentence_length": len(words) / max(len(sentences), 1),
    }


async def analyze_website_style(url: str, sample_text: str = None) -> dict:
    """
    Analyze website content or sample text for style analysis.

    When ``sample_text`` is non-empty it is analyzed directly and no crawl is
    performed. Otherwise ``url`` is crawled with default browser/run
    configuration and the resulting markdown is analyzed.

    Args:
        url: Website URL to analyze (ignored when ``sample_text`` is given).
        sample_text: Optional sample text to analyze instead of the website.

    Returns:
        dict: On success ``{"success": True, "content": ..., "metrics": {...}}``
        where ``metrics`` holds ``word_count``, ``sentence_count`` and
        ``avg_sentence_length`` (plus ``internal_links`` and ``images`` for
        crawled pages). On failure ``{"success": False, "error": ...}``.
        Exceptions are caught and reported through the failure shape.
    """
    try:
        if sample_text:
            # Analyze sample text directly; same metric rules as crawled pages.
            return {
                "success": True,
                "content": sample_text,
                "metrics": _text_metrics(sample_text),
            }

        if not url:
            # Nothing to crawl and nothing to analyze: report it instead of
            # handing None/"" to the crawler.
            return {
                "success": False,
                "error": "No URL or sample text provided for style analysis",
            }

        browser_config = BrowserConfig()  # Default browser configuration
        run_config = CrawlerRunConfig()  # Default crawl run configuration

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url=url,
                config=run_config,
            )

        logger.debug(f"Crawl result: {result}")
        if not result.success:
            return {
                "success": False,
                "error": result.error_message,
            }

        # Process content for style analysis; tolerate a missing markdown body.
        content = result.markdown or ""
        metrics = _text_metrics(content)
        # Missing link/media categories count as zero rather than raising.
        metrics["internal_links"] = len((result.links or {}).get("internal", []))
        metrics["images"] = len((result.media or {}).get("images", []))

        return {
            "success": True,
            "content": content,
            "metrics": metrics,
        }

    except Exception as e:
        # Top-level boundary: callers expect a result dict, never an exception.
        logger.error(f"Error in style analysis: {str(e)}")
        return {
            "success": False,
            "error": str(e),
        }
|
|
|
|
def analyze_style(url: str = None, sample_text: str = None) -> dict:
    """
    Run the asynchronous style analysis to completion and return its result.

    Blocking convenience entry point around ``analyze_website_style``: the
    coroutine is driven with ``asyncio.run``.

    Args:
        url: Website URL to analyze
        sample_text: Optional sample text to analyze

    Returns:
        dict: Analysis results
    """
    analysis = analyze_website_style(url, sample_text)
    return asyncio.run(analysis)
|
|
|
|
|
|
# Deep Crawling
|
|
# One of Crawl4AI's most powerful features is its ability to perform
|
|
# configurable deep crawling that can explore websites beyond a single page.
|
|
# With fine-tuned control over crawl depth, domain boundaries,
|
|
# and content filtering, Crawl4AI gives you the tools to extract precisely the content you need.
|
|
#
|
|
#
|
|
#
|
|
#
|
|
#
|