#!/usr/bin/env python3
"""
Thai Keyword Analyzer

Analyze keyword density in Thai text with PyThaiNLP integration.
Handles Thai language specifics (no spaces between words).
"""

import argparse
import json
import sys
from typing import Dict, List, Optional

try:
    from pythainlp import word_tokenize
    from pythainlp.util import normalize
    THAI_SUPPORT = True
except ImportError:
    THAI_SUPPORT = False
    print("Warning: PyThaiNLP not installed. Install with: pip install pythainlp")


class ThaiKeywordAnalyzer:
    """Analyze keyword density and placement in Thai (or English) text."""

    def __init__(self):
        # Common Thai function words. Kept for future filtering of tokenized
        # output; no method currently consults this set.
        # (A duplicate 'ที่' entry in the original literal has been removed —
        # it was redundant inside a set anyway.)
        self.thai_stopwords = {
            'และ', 'หรือ', 'แต่', 'ว่า', 'ถ้า', 'หาก', 'ซึ่ง', 'ที่',
            'ใน', 'บน', 'ใต้', 'เหนือ', 'จาก', 'ถึง', 'การ', 'ความ',
            'อย่าง', 'เมื่อ', 'สำหรับ', 'กับ', 'ของ', 'เป็น', 'อยู่',
            'คือ', 'ได้', 'ให้', 'ไป', 'มา',
        }

    def count_words(self, text: str) -> int:
        """Count words in *text*.

        Uses PyThaiNLP's ``newmm`` dictionary tokenizer when available
        (required for Thai, which has no inter-word spaces); otherwise
        falls back to whitespace splitting.
        """
        if not THAI_SUPPORT:
            return len(text.split())
        tokens = word_tokenize(text, engine="newmm")
        # The tokenizer emits whitespace runs as tokens; drop them.
        return len([t for t in tokens if t.strip()])

    def calculate_density(self, text: str, keyword: str) -> float:
        """Return keyword occurrences as a percentage of total words.

        NOTE: occurrences are counted as raw substring matches, so in
        space-delimited text a keyword embedded in a longer word also
        counts. Returns 0 for empty input.
        """
        if not THAI_SUPPORT:
            text_words = text.lower().split()
            keyword_count = text.lower().count(keyword.lower())
            return (keyword_count / len(text_words) * 100) if text_words else 0
        # Normalize both sides so visually identical Thai strings
        # (e.g. differing vowel/tone-mark ordering) compare equal.
        text_norm = normalize(text)
        keyword_norm = normalize(keyword)
        count = text_norm.count(keyword_norm)
        word_count = self.count_words(text)
        return (count / word_count * 100) if word_count > 0 else 0

    def find_positions(self, text: str, keyword: str) -> List[int]:
        """Return the character offsets of every (case-insensitive,
        possibly overlapping) occurrence of *keyword* in *text*."""
        positions = []
        text_lower = text.lower()
        keyword_lower = keyword.lower()
        start = 0
        while True:
            pos = text_lower.find(keyword_lower, start)
            if pos == -1:
                break
            positions.append(pos)
            # Advance by one (not by len(keyword)) so overlapping
            # matches are also reported.
            start = pos + 1
        return positions

    def check_critical_placements(self, text: str, keyword: str) -> Dict:
        """Check whether the keyword appears in SEO-critical locations:
        the opening, the H1, the conclusion, and H2 headings."""
        text_lower = text.lower()
        keyword_lower = keyword.lower()

        # First 200 chars (approximately first 100 Thai words).
        in_first_100_words = keyword_lower in text_lower[:200]

        # H1 = first line when it is a markdown heading.
        lines = text.split('\n')
        in_h1 = False
        if lines and lines[0].startswith('#'):
            in_h1 = keyword_lower in lines[0].lower()

        # Last 500 chars approximate the conclusion. Slicing with [-500:]
        # naturally yields the whole text when it is shorter, so short
        # articles are no longer unconditionally reported as missing the
        # keyword in the conclusion (the original returned False for any
        # text of 500 chars or fewer).
        in_conclusion = keyword_lower in text_lower[-500:]

        # Count H2 (and deeper) markdown headings containing the keyword.
        h2_count = sum(
            1 for line in lines
            if line.startswith('##') and keyword_lower in line.lower()
        )

        return {
            'in_first_100_words': in_first_100_words,
            'in_h1': in_h1,
            'in_conclusion': in_conclusion,
            'in_h2_count': h2_count,
        }

    def detect_stuffing(self, text: str, keyword: str, density: float) -> Dict:
        """Flag keyword-stuffing risk from overall and per-paragraph density.

        Returns a dict with ``risk_level`` ("none"/"medium"/"high"),
        human-readable ``warnings``, and a boolean ``safe`` flag.
        """
        risk_level = "none"
        warnings = []

        if density > 3.0:
            risk_level = "high"
            warnings.append(f"Keyword density {density:.1f}% is very high (over 3%)")
        elif density > 2.5:
            risk_level = "medium"
            warnings.append(f"Keyword density {density:.1f}% is high (over 2.5%)")

        # Clustering check: any of the first 10 paragraphs above 5% is
        # high risk regardless of the overall density.
        paragraphs = text.split('\n\n')
        for i, para in enumerate(paragraphs[:10]):
            para_density = self.calculate_density(para, keyword)
            if para_density > 5.0:
                # Original spelled this as a conditional expression whose
                # branches both yielded "high"; a plain assignment is
                # equivalent.
                risk_level = "high"
                warnings.append(f"Paragraph {i+1} has very high density ({para_density:.1f}%)")

        return {
            'risk_level': risk_level,
            'warnings': warnings,
            # "low" is accepted for forward compatibility; only "none",
            # "medium" and "high" are currently ever assigned.
            'safe': risk_level in ["none", "low"],
        }

    def get_density_status(self, density: float, language: str = 'th') -> str:
        """Classify *density* against the per-language target band
        (Thai: 1.0-1.5%, English: 1.5-2.0%)."""
        if language == 'th':
            if density < 0.5:
                return "too_low"
            elif density < 1.0:
                return "slightly_low"
            elif density <= 1.5:
                return "optimal"
            elif density <= 2.0:
                return "slightly_high"
            else:
                return "too_high"
        else:
            if density < 1.0:
                return "too_low"
            elif density < 1.5:
                return "slightly_low"
            elif density <= 2.0:
                return "optimal"
            elif density <= 2.5:
                return "slightly_high"
            else:
                return "too_high"

    def get_recommendations(self, density: float, placements: Dict,
                            language: str = 'th') -> List[str]:
        """Generate actionable recommendations (Thai text for 'th',
        English otherwise) from density and placement results."""
        recs = []

        if language == 'th':
            if density < 1.0:
                recs.append("เพิ่มการใช้คำหลักในเนื้อหา (target: 1.0-1.5%)")
            elif density > 2.0:
                recs.append("ลดการใช้คำหลักลง อาจถูกมองว่า keyword stuffing")
            if not placements['in_first_100_words']:
                recs.append("เพิ่มคำหลักในย่อหน้าแรก (100 คำแรก)")
            if not placements['in_h1']:
                recs.append("เพิ่มคำหลักในหัวข้อหลัก (H1)")
            if not placements['in_conclusion']:
                recs.append("เพิ่มคำหลักในบทสรุป")
            if placements['in_h2_count'] < 2:
                recs.append("เพิ่มคำหลักในหัวข้อรอง (H2) อย่างน้อย 2-3 แห่ง")
        else:
            if density < 1.5:
                recs.append("Increase keyword usage (target: 1.5-2.0%)")
            elif density > 2.5:
                recs.append("Reduce keyword usage to avoid stuffing penalty")
            if not placements['in_first_100_words']:
                recs.append("Add keyword in first 100 words")
            if not placements['in_h1']:
                recs.append("Add keyword in H1 headline")
            if not placements['in_conclusion']:
                recs.append("Add keyword in conclusion")

        return recs

    def analyze(self, text: str, keyword: str, language: str = 'th') -> Dict:
        """Run the full keyword analysis and return a summary dict.

        NOTE: ``occurrences`` uses case-insensitive substring search
        (``find_positions``) while ``density`` counts over normalized
        text, so the two can differ slightly for Thai input.
        """
        word_count = self.count_words(text)
        density = self.calculate_density(text, keyword)
        positions = self.find_positions(text, keyword)
        placements = self.check_critical_placements(text, keyword)
        stuffing = self.detect_stuffing(text, keyword, density)
        status = self.get_density_status(density, language)
        recommendations = self.get_recommendations(density, placements, language)

        return {
            'word_count': word_count,
            'keyword': keyword,
            'occurrences': len(positions),
            'density': round(density, 2),
            'target_density': '1.0-1.5%' if language == 'th' else '1.5-2.0%',
            'status': status,
            'critical_placements': placements,
            'keyword_stuffing_risk': stuffing['risk_level'],
            'recommendations': recommendations,
        }


def main():
    """CLI entry point: parse args, analyze, print JSON or text report."""
    parser = argparse.ArgumentParser(
        description='Analyze keyword density in Thai or English text'
    )
    parser.add_argument(
        '--text', '-t',
        required=True,
        help='Text content to analyze'
    )
    parser.add_argument(
        '--keyword', '-k',
        required=True,
        help='Target keyword'
    )
    parser.add_argument(
        '--language', '-l',
        choices=['th', 'en'],
        default='th',
        help='Content language (default: th)'
    )
    parser.add_argument(
        '--output', '-o',
        choices=['json', 'text'],
        default='text',
        help='Output format (default: text)'
    )

    args = parser.parse_args()

    # Analyze
    analyzer = ThaiKeywordAnalyzer()
    result = analyzer.analyze(args.text, args.keyword, args.language)

    # Output
    if args.output == 'json':
        # ensure_ascii=False keeps Thai characters readable in the output.
        print(json.dumps(result, indent=2, ensure_ascii=False))
    else:
        print("\n📊 Keyword Analysis Results\n")
        print(f"Keyword: {result['keyword']}")
        print(f"Word Count: {result['word_count']}")
        print(f"Occurrences: {result['occurrences']}")
        print(f"Density: {result['density']}% (target: {result['target_density']})")
        print(f"Status: {result['status']}")
        print(f"\nCritical Placements:")
        print(f"  ✓ First 100 words: {'Yes' if result['critical_placements']['in_first_100_words'] else 'No'}")
        print(f"  ✓ H1 Headline: {'Yes' if result['critical_placements']['in_h1'] else 'No'}")
        print(f"  ✓ Conclusion: {'Yes' if result['critical_placements']['in_conclusion'] else 'No'}")
        print(f"  ✓ H2 Headings: {result['critical_placements']['in_h2_count']} found")
        print(f"\nKeyword Stuffing Risk: {result['keyword_stuffing_risk']}")
        if result['recommendations']:
            print(f"\n💡 Recommendations:")
            for rec in result['recommendations']:
                print(f"  • {rec}")
        print()


if __name__ == '__main__':
    main()