#!/usr/bin/env python3
"""
Thai Keyword Analyzer

Analyze keyword density in Thai text with PyThaiNLP integration.
Handles Thai language specifics (no spaces between words).
"""

import argparse
import json
import sys
from typing import Dict, List, Optional

try:
    from pythainlp import word_tokenize
    from pythainlp.util import normalize
    THAI_SUPPORT = True
except ImportError:
    THAI_SUPPORT = False
    print("Warning: PyThaiNLP not installed. Install with: pip install pythainlp")


class ThaiKeywordAnalyzer:
    """Analyze keyword density and placement in Thai (or English) text."""

    def __init__(self):
        # Common Thai function words. Kept for future filtering of tokenized
        # output; no method currently consults this set.
        # (A duplicate 'ที่' entry in the original literal has been removed —
        # it was redundant inside a set anyway.)
        self.thai_stopwords = {
            'และ', 'หรือ', 'แต่', 'ว่า', 'ถ้า', 'หาก', 'ซึ่ง', 'ที่',
            'ใน', 'บน', 'ใต้', 'เหนือ', 'จาก', 'ถึง', 'การ', 'ความ',
            'อย่าง', 'เมื่อ', 'สำหรับ', 'กับ', 'ของ', 'เป็น', 'อยู่',
            'คือ', 'ได้', 'ให้', 'ไป', 'มา',
        }

    def count_words(self, text: str) -> int:
        """Count words in *text*.

        Uses PyThaiNLP's ``newmm`` dictionary tokenizer when available
        (required for Thai, which has no inter-word spaces); otherwise
        falls back to whitespace splitting.
        """
        if not THAI_SUPPORT:
            return len(text.split())
        tokens = word_tokenize(text, engine="newmm")
        # The tokenizer emits whitespace runs as tokens; drop them.
        return len([t for t in tokens if t.strip()])

    def calculate_density(self, text: str, keyword: str) -> float:
        """Return keyword occurrences as a percentage of total words.

        NOTE: occurrences are counted as raw substring matches, so in
        space-delimited text a keyword embedded in a longer word also
        counts. Returns 0 for empty input.
        """
        if not THAI_SUPPORT:
            text_words = text.lower().split()
            keyword_count = text.lower().count(keyword.lower())
            return (keyword_count / len(text_words) * 100) if text_words else 0
        # Normalize both sides so visually identical Thai strings
        # (e.g. differing vowel/tone-mark ordering) compare equal.
        text_norm = normalize(text)
        keyword_norm = normalize(keyword)
        count = text_norm.count(keyword_norm)
        word_count = self.count_words(text)
        return (count / word_count * 100) if word_count > 0 else 0

    def find_positions(self, text: str, keyword: str) -> List[int]:
        """Return the character offsets of every (case-insensitive,
        possibly overlapping) occurrence of *keyword* in *text*."""
        positions = []
        text_lower = text.lower()
        keyword_lower = keyword.lower()
        start = 0
        while True:
            pos = text_lower.find(keyword_lower, start)
            if pos == -1:
                break
            positions.append(pos)
            # Advance by one (not by len(keyword)) so overlapping
            # matches are also reported.
            start = pos + 1
        return positions

    def check_critical_placements(self, text: str, keyword: str) -> Dict:
        """Check whether the keyword appears in SEO-critical locations:
        the opening, the H1, the conclusion, and H2 headings."""
        text_lower = text.lower()
        keyword_lower = keyword.lower()

        # First 200 chars (approximately first 100 Thai words).
        in_first_100_words = keyword_lower in text_lower[:200]

        # H1 = first line when it is a markdown heading.
        lines = text.split('\n')
        in_h1 = False
        if lines and lines[0].startswith('#'):
            in_h1 = keyword_lower in lines[0].lower()

        # Last 500 chars approximate the conclusion. Slicing with [-500:]
        # naturally yields the whole text when it is shorter, so short
        # articles are no longer unconditionally reported as missing the
        # keyword in the conclusion (the original returned False for any
        # text of 500 chars or fewer).
        in_conclusion = keyword_lower in text_lower[-500:]

        # Count H2 (and deeper) markdown headings containing the keyword.
        h2_count = sum(
            1 for line in lines
            if line.startswith('##') and keyword_lower in line.lower()
        )

        return {
            'in_first_100_words': in_first_100_words,
            'in_h1': in_h1,
            'in_conclusion': in_conclusion,
            'in_h2_count': h2_count,
        }

    def detect_stuffing(self, text: str, keyword: str, density: float) -> Dict:
        """Flag keyword-stuffing risk from overall and per-paragraph density.

        Returns a dict with ``risk_level`` ("none"/"medium"/"high"),
        human-readable ``warnings``, and a boolean ``safe`` flag.
        """
        risk_level = "none"
        warnings = []

        if density > 3.0:
            risk_level = "high"
            warnings.append(f"Keyword density {density:.1f}% is very high (over 3%)")
        elif density > 2.5:
            risk_level = "medium"
            warnings.append(f"Keyword density {density:.1f}% is high (over 2.5%)")

        # Clustering check: any of the first 10 paragraphs above 5% is
        # high risk regardless of the overall density.
        paragraphs = text.split('\n\n')
        for i, para in enumerate(paragraphs[:10]):
            para_density = self.calculate_density(para, keyword)
            if para_density > 5.0:
                # Original spelled this as a conditional expression whose
                # branches both yielded "high"; a plain assignment is
                # equivalent.
                risk_level = "high"
                warnings.append(f"Paragraph {i+1} has very high density ({para_density:.1f}%)")

        return {
            'risk_level': risk_level,
            'warnings': warnings,
            # "low" is accepted for forward compatibility; only "none",
            # "medium" and "high" are currently ever assigned.
            'safe': risk_level in ["none", "low"],
        }

    def get_density_status(self, density: float, language: str = 'th') -> str:
        """Classify *density* against the per-language target band
        (Thai: 1.0-1.5%, English: 1.5-2.0%)."""
        if language == 'th':
            if density < 0.5:
                return "too_low"
            elif density < 1.0:
                return "slightly_low"
            elif density <= 1.5:
                return "optimal"
            elif density <= 2.0:
                return "slightly_high"
            else:
                return "too_high"
        else:
            if density < 1.0:
                return "too_low"
            elif density < 1.5:
                return "slightly_low"
            elif density <= 2.0:
                return "optimal"
            elif density <= 2.5:
                return "slightly_high"
            else:
                return "too_high"

    def get_recommendations(self, density: float, placements: Dict,
                            language: str = 'th') -> List[str]:
        """Generate actionable recommendations (Thai text for 'th',
        English otherwise) from density and placement results."""
        recs = []

        if language == 'th':
            if density < 1.0:
                recs.append("เพิ่มการใช้คำหลักในเนื้อหา (target: 1.0-1.5%)")
            elif density > 2.0:
                recs.append("ลดการใช้คำหลักลง อาจถูกมองว่า keyword stuffing")
            if not placements['in_first_100_words']:
                recs.append("เพิ่มคำหลักในย่อหน้าแรก (100 คำแรก)")
            if not placements['in_h1']:
                recs.append("เพิ่มคำหลักในหัวข้อหลัก (H1)")
            if not placements['in_conclusion']:
                recs.append("เพิ่มคำหลักในบทสรุป")
            if placements['in_h2_count'] < 2:
                recs.append("เพิ่มคำหลักในหัวข้อรอง (H2) อย่างน้อย 2-3 แห่ง")
        else:
            if density < 1.5:
                recs.append("Increase keyword usage (target: 1.5-2.0%)")
            elif density > 2.5:
                recs.append("Reduce keyword usage to avoid stuffing penalty")
            if not placements['in_first_100_words']:
                recs.append("Add keyword in first 100 words")
            if not placements['in_h1']:
                recs.append("Add keyword in H1 headline")
            if not placements['in_conclusion']:
                recs.append("Add keyword in conclusion")

        return recs

    def analyze(self, text: str, keyword: str, language: str = 'th') -> Dict:
        """Run the full keyword analysis and return a summary dict.

        NOTE: ``occurrences`` uses case-insensitive substring search
        (``find_positions``) while ``density`` counts over normalized
        text, so the two can differ slightly for Thai input.
        """
        word_count = self.count_words(text)
        density = self.calculate_density(text, keyword)
        positions = self.find_positions(text, keyword)
        placements = self.check_critical_placements(text, keyword)
        stuffing = self.detect_stuffing(text, keyword, density)
        status = self.get_density_status(density, language)
        recommendations = self.get_recommendations(density, placements, language)

        return {
            'word_count': word_count,
            'keyword': keyword,
            'occurrences': len(positions),
            'density': round(density, 2),
            'target_density': '1.0-1.5%' if language == 'th' else '1.5-2.0%',
            'status': status,
            'critical_placements': placements,
            'keyword_stuffing_risk': stuffing['risk_level'],
            'recommendations': recommendations,
        }


def main():
    """CLI entry point: parse args, analyze, print JSON or text report."""
    parser = argparse.ArgumentParser(
        description='Analyze keyword density in Thai or English text'
    )
    parser.add_argument(
        '--text', '-t',
        required=True,
        help='Text content to analyze'
    )
    parser.add_argument(
        '--keyword', '-k',
        required=True,
        help='Target keyword'
    )
    parser.add_argument(
        '--language', '-l',
        choices=['th', 'en'],
        default='th',
        help='Content language (default: th)'
    )
    parser.add_argument(
        '--output', '-o',
        choices=['json', 'text'],
        default='text',
        help='Output format (default: text)'
    )

    args = parser.parse_args()

    # Analyze
    analyzer = ThaiKeywordAnalyzer()
    result = analyzer.analyze(args.text, args.keyword, args.language)

    # Output
    if args.output == 'json':
        # ensure_ascii=False keeps Thai characters readable in the output.
        print(json.dumps(result, indent=2, ensure_ascii=False))
    else:
        print("\n📊 Keyword Analysis Results\n")
        print(f"Keyword: {result['keyword']}")
        print(f"Word Count: {result['word_count']}")
        print(f"Occurrences: {result['occurrences']}")
        print(f"Density: {result['density']}% (target: {result['target_density']})")
        print(f"Status: {result['status']}")
        print(f"\nCritical Placements:")
        print(f"  ✓ First 100 words: {'Yes' if result['critical_placements']['in_first_100_words'] else 'No'}")
        print(f"  ✓ H1 Headline: {'Yes' if result['critical_placements']['in_h1'] else 'No'}")
        print(f"  ✓ Conclusion: {'Yes' if result['critical_placements']['in_conclusion'] else 'No'}")
        print(f"  ✓ H2 Headings: {result['critical_placements']['in_h2_count']} found")
        print(f"\nKeyword Stuffing Risk: {result['keyword_stuffing_risk']}")
        if result['recommendations']:
            print(f"\n💡 Recommendations:")
            for rec in result['recommendations']:
                print(f"  • {rec}")
        print()


if __name__ == '__main__':
    main()