#!/usr/bin/env python3
"""
Thai Readability Analyzer

Analyze Thai text readability with PyThaiNLP integration.
Detects formality level, grade level, and sentence structure.
"""

import argparse
import json
import re
from typing import Dict, List, Union

try:
    from pythainlp import word_tokenize, sent_tokenize
    THAI_SUPPORT = True
except ImportError:
    # Degrade gracefully: all methods fall back to whitespace/punctuation
    # heuristics when PyThaiNLP is unavailable.
    THAI_SUPPORT = False
    print("Warning: PyThaiNLP not installed. Install with: pip install pythainlp")


class ThaiReadabilityAnalyzer:
    """Analyze Thai text readability"""

    def __init__(self):
        # Markers of formal register (polite particles, literary connectives).
        self.formal_particles = [
            'ครับ', 'ค่ะ', 'ข้าพเจ้า', 'กระผม', 'ดิฉัน', 'ท่าน',
            'ซึ่ง', 'อัน', 'ย่อม', 'ย่อมเป็น', 'ประการ',
            'ดังกล่าว', 'ดังกล่าวแล้ว', 'ดังนี้'
        ]
        # Markers of casual register.
        # NOTE(review): 'gue' and 'mang' look like Indonesian slang rather than
        # Thai — confirm they are intentional. Kept as-is to preserve scoring.
        self.informal_particles = [
            'นะ', 'จ้ะ', 'อ่ะ', 'มั้ย', 'เปล่าว่ะ', 'gue', 'mang',
            'เว้ย', 'วะ', 'เหอะ', 'ซิ', 'นู่น', 'นี่', 'นั่น',
            'โครต', 'มาก'
        ]

    def count_sentences(self, text: str) -> int:
        """Count sentences in *text*.

        Uses PyThaiNLP's whitespace sentence tokenizer when available;
        otherwise counts sentence-ending punctuation marks (minimum 1).
        """
        if not THAI_SUPPORT:
            # Fallback: count Thai sentence endings
            thai_endings = ['.', '!', '?', '।', '๏']
            count = sum(text.count(e) for e in thai_endings)
            return max(count, 1)

        sentences = sent_tokenize(text, engine="whitespace")
        return len([s for s in sentences if s.strip()])

    def count_words(self, text: str) -> int:
        """Count words in *text*.

        Uses PyThaiNLP's newmm word tokenizer when available; otherwise
        splits on whitespace (inaccurate for unspaced Thai text).
        """
        if not THAI_SUPPORT:
            return len(text.split())

        tokens = word_tokenize(text, engine="newmm")
        return len([t for t in tokens if t.strip()])

    def calculate_avg_sentence_length(self, text: str) -> float:
        """Return the average number of words per sentence (0 if no sentences)."""
        if not THAI_SUPPORT:
            sentences = re.split(r'[.!?]', text)
            sentences = [s for s in sentences if s.strip()]
            if not sentences:
                return 0
            words = text.split()
            return len(words) / len(sentences)

        sentences = sent_tokenize(text, engine="whitespace")
        sentences = [s for s in sentences if s.strip()]
        if not sentences:
            return 0
        total_words = sum(
            len(word_tokenize(s, engine="newmm")) for s in sentences
        )
        return total_words / len(sentences)

    def detect_formality(self, text: str) -> Dict:
        """Detect formality level from particle frequencies.

        Returns a dict with keys: 'level', 'score' (20/50/80),
        'formal_particle_count', 'informal_particle_count', 'ratio'.

        Note: substring counting may match particles embedded inside longer
        words, since Thai is written without spaces.
        """
        formal_count = sum(text.count(p) for p in self.formal_particles)
        informal_count = sum(text.count(p) for p in self.informal_particles)

        total = formal_count + informal_count
        if total == 0:
            ratio = 0.5  # Neutral when no particles are found
        else:
            ratio = formal_count / total

        if ratio > 0.6:
            level = "เป็นทางการ (Formal)"
            score = 80
        elif ratio < 0.4:
            level = "กันเอง (Casual)"
            score = 20
        else:
            level = "ปกติ (Normal)"
            score = 50

        return {
            'level': level,
            'score': score,
            'formal_particle_count': formal_count,
            'informal_particle_count': informal_count,
            'ratio': round(ratio, 2)
        }

    def estimate_grade_level(self, avg_sentence_length: float,
                             formality_score: int) -> Dict:
        """Estimate Thai grade level from sentence complexity and formality.

        Returns a dict with 'thai' (label), 'numeric_range' (a (min, max)
        tuple or a single int), and 'us_equivalent'.
        """
        # BUG FIX: the original used `grade_num = 6-9` etc., which is integer
        # SUBTRACTION (-3, -2), so the US-grade conversion always collapsed to
        # the lowest bracket. Use explicit (min, max) tuples instead — these
        # stay JSON-serializable, unlike range objects.
        if avg_sentence_length < 15:
            grade_th = "ง่าย (ม.6-ม.9)"
            grade_num: Union[tuple, int] = (6, 9)
        elif avg_sentence_length < 25:
            grade_th = "ปานกลาง (ม.10-ม.12)"
            grade_num = (10, 12)
        else:
            grade_th = "ยาก (ม.13+)"
            grade_num = 13

        # Annotate the label with the detected register at the extremes.
        if formality_score > 70:
            grade_th += " (ทางการ)"
        elif formality_score < 30:
            grade_th += " (กันเอง)"

        return {
            'thai': grade_th,
            'numeric_range': grade_num,
            'us_equivalent': self._thai_to_us_grade(grade_num)
        }

    def _thai_to_us_grade(self, thai_grade_range) -> str:
        """Convert a Thai grade (int, (min, max) pair, or range) to a rough
        US-equivalent label."""
        if isinstance(thai_grade_range, range):
            avg = sum(thai_grade_range) / len(thai_grade_range)
        elif isinstance(thai_grade_range, (tuple, list)) and thai_grade_range:
            avg = sum(thai_grade_range) / len(thai_grade_range)
        elif isinstance(thai_grade_range, int):
            avg = thai_grade_range
        else:
            avg = 10  # Unknown input: assume mid-range

        # Very rough conversion
        if avg <= 9:
            return "6th-8th grade"
        elif avg <= 12:
            return "9th-12th grade"
        else:
            return "College+"

    def analyze_paragraph_structure(self, text: str) -> Dict:
        """Analyze paragraph structure (paragraphs separated by blank lines).

        Returns counts and average lengths; when the text has no paragraphs
        the dict omits 'shortest_paragraph'/'longest_paragraph'.
        """
        paragraphs = [p for p in text.split('\n\n') if p.strip()]

        if not paragraphs:
            return {
                'paragraph_count': 0,
                'avg_length_words': 0,
                'avg_length_sentences': 0
            }

        paragraph_lengths = [
            self.count_words(p) for p in paragraphs
        ]
        paragraph_sentences = [
            self.count_sentences(p) for p in paragraphs
        ]

        return {
            'paragraph_count': len(paragraphs),
            'avg_length_words': round(sum(paragraph_lengths) / len(paragraphs), 1),
            'avg_length_sentences': round(sum(paragraph_sentences) / len(paragraphs), 1),
            'shortest_paragraph': min(paragraph_lengths),
            'longest_paragraph': max(paragraph_lengths)
        }

    def calculate_readability_score(self, avg_sentence_length: float,
                                    formality_score: int,
                                    paragraph_score: float) -> float:
        """
        Calculate overall readability score (0-100)

        Factors:
        - Sentence length (optimal: 15-25 words) — up to 40 points
        - Formality (optimal: 40-60 for general content) — up to 30 points
        - Paragraph structure (0-1 quality ratio) — up to 30 points
        """
        # Sentence length score (0-40)
        if 15 <= avg_sentence_length <= 25:
            sentence_score = 40
        elif 10 <= avg_sentence_length < 15 or 25 < avg_sentence_length <= 30:
            sentence_score = 30
        elif avg_sentence_length < 10:
            sentence_score = 20
        else:
            sentence_score = 15

        # Formality score (0-30)
        # Optimal: 40-60 (normal/formal mix)
        if 40 <= formality_score <= 60:
            formality_points = 30
        elif 30 <= formality_score < 40 or 60 < formality_score <= 70:
            formality_points = 25
        else:
            formality_points = 15

        # Paragraph score (0-30), clamped in case paragraph_score > 1
        paragraph_points = min(30, paragraph_score * 30)

        total = sentence_score + formality_points + paragraph_points
        return round(total, 1)

    def get_recommendations(self, analysis: Dict) -> List[str]:
        """Generate Thai-language writing recommendations from an analysis dict.

        Expects keys 'avg_sentence_length', 'formality', 'paragraph_structure'.
        """
        recs = []

        avg_len = analysis['avg_sentence_length']
        if avg_len < 15:
            recs.append("ประโยคสั้นเกินไป พิจารณาเพิ่มรายละเอียดบ้าง")
        elif avg_len > 25:
            recs.append("ประโยคยาวเกินไป แบ่งออกเป็น 2-3 ประโยคจะอ่านง่ายขึ้น")

        formality = analysis['formality']['level']
        if "เป็นทางการ" in formality:
            recs.append("ภาษาเป็นทางการเกินไปสำหรับเนื้อหาทั่วไป พิจารณาใช้ภาษาที่เป็นกันเองมากขึ้น")
        elif "กันเอง" in formality:
            recs.append("ภาษาเป็นกันเองมาก ตรวจสอบว่าเหมาะกับกลุ่มเป้าหมายหรือไม่")

        para = analysis['paragraph_structure']
        if para['avg_length_words'] > 200:
            recs.append("บางย่อหน้ายาวเกินไป แบ่งย่อหน้าเพื่อให้อ่านง่ายขึ้น")
        if para['paragraph_count'] < 5:
            recs.append("เพิ่มจำนวนย่อหน้าเพื่อให้อ่านง่ายขึ้น")

        return recs

    def analyze(self, text: str) -> Dict:
        """Full readability analysis: returns counts, grade level, formality,
        paragraph structure, overall score, and recommendations."""
        avg_sentence_length = self.calculate_avg_sentence_length(text)
        formality = self.detect_formality(text)
        grade_level = self.estimate_grade_level(avg_sentence_length,
                                                formality['score'])
        paragraph_structure = self.analyze_paragraph_structure(text)

        # Paragraph quality ratio (0-1): reward varied paragraph lengths.
        # (Removed a dead `lengths = [...]` local that was never read.)
        para_score = 0.5  # Default when there are no paragraphs
        if paragraph_structure['paragraph_count'] > 0:
            if paragraph_structure['shortest_paragraph'] != paragraph_structure['longest_paragraph']:
                para_score = 0.8  # Good variety
            else:
                para_score = 0.6  # Same length

        readability_score = self.calculate_readability_score(
            avg_sentence_length,
            formality['score'],
            para_score
        )

        recommendations = self.get_recommendations({
            'avg_sentence_length': avg_sentence_length,
            'formality': formality,
            'paragraph_structure': paragraph_structure
        })

        return {
            'avg_sentence_length': round(avg_sentence_length, 1),
            'sentence_count': self.count_sentences(text),
            'word_count': self.count_words(text),
            'grade_level': grade_level,
            'formality': formality,
            'paragraph_structure': paragraph_structure,
            'readability_score': readability_score,
            'recommendations': recommendations
        }


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(
        description='Analyze Thai text readability'
    )
    parser.add_argument(
        '--text', '-t',
        required=True,
        help='Text content to analyze'
    )
    parser.add_argument(
        '--output', '-o',
        choices=['json', 'text'],
        default='text',
        help='Output format (default: text)'
    )

    args = parser.parse_args()

    # Analyze
    analyzer = ThaiReadabilityAnalyzer()
    result = analyzer.analyze(args.text)

    # Output
    if args.output == 'json':
        print(json.dumps(result, indent=2, ensure_ascii=False))
    else:
        print("\n📖 Thai Readability Analysis\n")
        print(f"Sentence Count: {result['sentence_count']}")
        print(f"Word Count: {result['word_count']}")
        print(f"Avg Sentence Length: {result['avg_sentence_length']} words")
        print(f"\nGrade Level: {result['grade_level']['thai']}")
        print(f"US Equivalent: {result['grade_level']['us_equivalent']}")
        print(f"\nFormality: {result['formality']['level']} (score: {result['formality']['score']})")
        print(f" - Formal particles: {result['formality']['formal_particle_count']}")
        print(f" - Informal particles: {result['formality']['informal_particle_count']}")
        print(f"\nParagraph Structure:")
        print(f" - Count: {result['paragraph_structure']['paragraph_count']}")
        print(f" - Avg length: {result['paragraph_structure']['avg_length_words']} words")
        print(f"\nReadability Score: {result['readability_score']}/100")
        if result['recommendations']:
            print(f"\n💡 Recommendations:")
            for rec in result['recommendations']:
                print(f" • {rec}")
        print()


if __name__ == '__main__':
    main()