ALwrity/ToBeMigrated/ai_writers/ai_story_illustrator/utils.py

"""
Utility functions for the AI Story Illustrator module.

This module provides helper functions for file operations, string manipulation,
and simple text analysis relevant to story processing.
"""

import os
import re
import tempfile
import uuid
import logging
import shutil
from pathlib import Path
from typing import List, Tuple, Optional, Union

# Attempt to import Pillow for image dimensions, but don't fail if not installed
# unless the specific function is called.
try:
    from PIL import Image
    _PIL_AVAILABLE = True
except ImportError:
    _PIL_AVAILABLE = False

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger('story_illustrator_utils')

# --- Constants ---
IMAGE_EXTENSIONS = frozenset(['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'])
TEXT_EXTENSIONS = frozenset(['.txt', '.md', '.text'])
# Common English words that often start sentences, excluded from simple name detection
COMMON_START_WORDS = frozenset([
    'The', 'A', 'An', 'And', 'But', 'Or', 'For', 'Nor', 'So', 'Yet', 'He', 'She',
    'It', 'They', 'We', 'You', 'I', 'In', 'On', 'At', 'To', 'From', 'With',
    'About', 'As', 'Is', 'Was', 'Were', 'Be', 'Been', 'Being', 'Have', 'Has',
    'Had', 'Do', 'Does', 'Did', 'Will', 'Would', 'Shall', 'Should', 'May',
    'Might', 'Must', 'Can', 'Could'
])


# --- File/Directory Operations ---

def create_temp_directory(prefix: str = "story_illustrator_") -> str:
    """
    Creates a temporary directory using tempfile.mkdtemp.

    Args:
        prefix: A prefix for the temporary directory name.

    Returns:
        The absolute path to the created temporary directory.
    """
    try:
        temp_dir = tempfile.mkdtemp(prefix=prefix)
        logger.info(f"Created temporary directory: {temp_dir}")
        return temp_dir
    except Exception as e:
        logger.error(f"Failed to create temporary directory: {e}", exc_info=True)
        raise  # Re-raise the exception after logging


def sanitize_filename(filename: str) -> str:
    """
    Sanitizes a filename by removing/replacing invalid characters for common filesystems.

    Args:
        filename: The original filename string.

    Returns:
        A sanitized filename string suitable for use in file paths.
    """
    if not isinstance(filename, str):
        logger.warning("sanitize_filename received non-string input, converting.")
        filename = str(filename)

    # Remove characters invalid for Windows/Unix filenames
    # Replace them with an underscore.
    sanitized = re.sub(r'[\\/*?:"<>|\']', "_", filename)
    # Replace consecutive underscores/spaces with a single underscore
    sanitized = re.sub(r'[_ ]+', '_', sanitized)
    # Remove leading/trailing spaces, dots, and underscores
    sanitized = sanitized.strip("._ ")

    # Ensure the filename is not empty after sanitization
    if not sanitized:
        sanitized = "unnamed_file"
        logger.warning("Filename was empty after sanitization, using default.")

    # Limit filename length (optional, adjust as needed)
    # max_len = 255 # Example limit
    # if len(sanitized) > max_len:
    #     name, ext = os.path.splitext(sanitized)
    #     sanitized = name[:max_len - len(ext) - 1] + "_" + ext
    #     logger.warning(f"Filename truncated to maximum length: {sanitized}")

    return sanitized


def get_temp_file_path(
    directory: str, prefix: str = "file_", suffix: str = ".tmp"
) -> str:
    """
    Generates a unique temporary file path within the specified directory.

    Args:
        directory: The directory where the temporary file should be located.
        prefix: A prefix for the filename.
        suffix: A suffix (extension) for the filename.

    Returns:
        The full path for the unique temporary file.
    """
    # Ensure suffix starts with a dot if it's meant to be an extension
    if suffix and not suffix.startswith("."):
        suffix = "." + suffix

    unique_id = uuid.uuid4().hex[:12] # Longer hex UUID for better uniqueness
    filename = f"{prefix}{unique_id}{suffix}"
    return os.path.join(directory, filename)


def ensure_directory_exists(directory: Union[str, Path]) -> str:
    """
    Ensures that a directory exists, creating it recursively if necessary.

    Args:
        directory: The path to the directory (string or Path object).

    Returns:
        The absolute path to the directory as a string.

    Raises:
        OSError: If the directory cannot be created (e.g., permission issues).
    """
    dir_path = Path(directory).resolve() # Use Pathlib for robust handling
    try:
        dir_path.mkdir(parents=True, exist_ok=True)
        # Log only if it needed creation (or if verbose logging is on)
        # logger.info(f"Ensured directory exists: {dir_path}")
        return str(dir_path)
    except OSError as e:
        logger.error(f"Failed to create or access directory {dir_path}: {e}", exc_info=True)
        raise


def cleanup_directory(directory: Union[str, Path]) -> None:
    """
    Removes a directory and all its contents recursively. Handles errors gracefully.

    Args:
        directory: The path to the directory to remove (string or Path object).
    """
    dir_path = Path(directory)
    if not dir_path.exists():
        logger.debug(f"Cleanup skipped: Directory '{directory}' does not exist.")
        return

    if not dir_path.is_dir():
         logger.warning(f"Cleanup warning: Path '{directory}' is not a directory.")
         return

    try:
        shutil.rmtree(dir_path)
        logger.info(f"Successfully removed directory: {directory}")
    except OSError as e:
        logger.error(f"Error removing directory {directory}: {e}", exc_info=True)
    except Exception as e:
        logger.error(
            f"Unexpected error removing directory {directory}: {e}", exc_info=True
        )


# --- File Type Checks ---

def get_file_extension(file_path: Union[str, Path]) -> str:
    """
    Gets the lowercased file extension (including the dot) from a file path.

    Args:
        file_path: The path to the file (string or Path object).

    Returns:
        The file extension (e.g., '.txt', '.png') or an empty string if no extension.
    """
    return Path(file_path).suffix.lower()


def is_image_file(file_path: Union[str, Path]) -> bool:
    """
    Checks if a file is likely an image based on its extension.

    Args:
        file_path: The path to the file (string or Path object).

    Returns:
        True if the file extension is in IMAGE_EXTENSIONS, False otherwise.
    """
    return get_file_extension(file_path) in IMAGE_EXTENSIONS


def is_text_file(file_path: Union[str, Path]) -> bool:
    """
    Checks if a file is likely a text file based on its extension.

    Args:
        file_path: The path to the file (string or Path object).

    Returns:
        True if the file extension is in TEXT_EXTENSIONS, False otherwise.
    """
    return get_file_extension(file_path) in TEXT_EXTENSIONS


# --- Text Analysis (Simple Heuristics) ---

def extract_story_title_from_text(text: str) -> str:
    """
    Attempts to extract a title from story text using simple heuristics.

    Looks for patterns (in order):
    1. Markdown headers (#, ##, etc.) at the start of a line.
    2. The first non-empty line if it's short (< 100 chars) and followed by
       a blank line or is the only line.
    3. The first non-empty line if it's entirely in uppercase (< 100 chars).

    Args:
        text: The story text content.

    Returns:
        An extracted title string, or "Untitled Story" if no pattern matches.
    """
    if not isinstance(text, str) or not text.strip():
        return "Untitled Story"

    # 1. Check for markdown headers ( # Title, ## Title )
    # Needs to match start of line (^) with optional whitespace before #
    header_match = re.search(r'^\s*#+\s+(.+)$', text.strip(), re.MULTILINE)
    if header_match:
        title = header_match.group(1).strip()
        if title: return title

    lines = text.strip().split('\n')
    if not lines:
        return "Untitled Story"

    first_line = lines[0].strip()
    if not first_line: # Skip if first line is blank
        if len(lines) > 1:
            first_line = lines[1].strip() # Try second line
        else:
            return "Untitled Story"

    if not first_line: # Still no title found
         return "Untitled Story"

    # 2. Check if first line is short and potentially a title
    is_short = len(first_line) < 100
    is_followed_by_blank = len(lines) > 1 and not lines[1].strip()
    is_only_line = len(lines) == 1

    if is_short and (is_followed_by_blank or is_only_line):
        return first_line

    # 3. Check if first line is all caps (and short)
    is_all_caps = first_line == first_line.upper() and first_line.isalpha() # Check if it contains letters
    if is_short and is_all_caps:
        return first_line

    # Default if no other pattern matched
    return "Untitled Story"


def estimate_reading_time(text: str, words_per_minute: int = 200) -> float:
    """
    Estimates the reading time of a text in minutes.

    Args:
        text: The text content.
        words_per_minute: The assumed average reading speed.

    Returns:
        The estimated reading time in minutes. Returns 0.0 for empty text.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0
    if words_per_minute <= 0:
        raise ValueError("words_per_minute must be positive.")

    word_count = len(text.split())
    minutes = word_count / words_per_minute
    return minutes


def count_sentences(text: str) -> int:
    """
    Counts the number of sentences in a text using a very simple heuristic.

    Note: This is a basic implementation counting sentence-ending punctuation
    (. ! ?). It will be inaccurate with abbreviations (Mr., Mrs., etc.),
    ellipses, and complex sentence structures.

    Args:
        text: The text content.

    Returns:
        An estimated count of sentences. Returns 0 for empty text.
    """
    if not isinstance(text, str) or not text.strip():
        return 0

    # Find sequences of one or more sentence-ending punctuation marks
    sentence_endings = re.findall(r'[.!?]+', text)
    count = len(sentence_endings)

    # Handle edge case where text might not end with punctuation but isn't empty
    if count == 0 and len(text.strip()) > 0:
        return 1 # Assume at least one sentence if text exists but no terminators found
    return count


def extract_character_names(text: str, min_occurrences: int = 2) -> List[str]:
    """
    Attempts to extract potential character names from story text.

    Note: This is a simple heuristic based on finding capitalized words
    (excluding common sentence starters) that appear multiple times. It has
    limitations and may produce false positives or miss actual names.

    Args:
        text: The story text content.
        min_occurrences: The minimum number of times a capitalized word must
                         appear to be considered a potential name.

    Returns:
        A list of potential character name strings.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    if min_occurrences < 1:
        min_occurrences = 1 # Ensure at least one occurrence is required

    # Find words starting with an uppercase letter, potentially followed by lowercase
    # Allows for single-letter names like 'X' but focuses on typical Name structure
    capitalized_words = re.findall(r'\b[A-Z][a-zA-Z]*\b', text)

    # Count occurrences, excluding common words
    word_counts: Dict[str, int] = {}
    for word in capitalized_words:
        if word not in COMMON_START_WORDS:
            word_counts[word] = word_counts.get(word, 0) + 1

    # Filter for words that meet the minimum occurrence threshold
    potential_names = [
        word for word, count in word_counts.items() if count >= min_occurrences
    ]

    # Sort for consistency (optional)
    potential_names.sort()

    return potential_names


def extract_setting_details(text: str) -> List[str]:
    """
    Attempts to extract potential setting details using simple regex patterns.

    Note: This is a very basic heuristic looking for common prepositional
    phrases (e.g., "in the forest", "at the castle"). It is highly limited
    and likely to miss many setting details or extract irrelevant phrases.

    Args:
        text: The story text content.

    Returns:
        A list of potential setting phrases found.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    # Patterns looking for prepositions followed by nouns/adjectives
    # Making patterns slightly more general:
    # (\b\w+\b) captures single words
    # (\b\w+\s+\w+\b) captures two-word phrases
    # (\b[A-Z]\w*\b) captures capitalized words (potential proper nouns)
    setting_patterns = [
        r'\b(?:in|on|at|near|beside|inside|outside|under|over|through)\s+(?:the|a|an)\s+((?:[A-Z]\w*|\w+)(?:\s+\w+){0,2})\b', # e.g., in the old house
        r'\b(?:in|on|at)\s+((?:[A-Z]\w+)(?:\s+[A-Z]\w+)*)\b', # e.g., in New York City
        r'\b(?:during|before|after)\s+(?:the|a|an)\s+(\w+(?:\s+\w+){0,2})\b', # e.g., during the storm
    ]

    settings_found = set() # Use a set to avoid duplicates
    for pattern in setting_patterns:
        try:
            matches = re.findall(pattern, text, re.IGNORECASE) # Ignore case
            for match in matches:
                 # If match is tuple due to multiple capture groups, join them?
                 # For these patterns, it should be single strings.
                 if isinstance(match, str):
                      phrase = match.strip()
                      if phrase and len(phrase.split()) <= 5: # Limit phrase length
                           settings_found.add(phrase)
        except re.error as e:
             logger.warning(f"Regex error in extract_setting_details: {e} with pattern: {pattern}")


    # Convert set back to list and sort for consistency
    sorted_settings = sorted(list(settings_found))
    return sorted_settings


# --- Image Operations ---

def get_image_dimensions(image_path: Union[str, Path]) -> Optional[Tuple[int, int]]:
    """
    Gets the (width, height) dimensions of an image file using Pillow.

    Args:
        image_path: The path to the image file (string or Path object).

    Returns:
        A tuple (width, height) if successful, or None if the file is not
        a valid image, Pillow is not installed, or an error occurs.
    """
    if not _PIL_AVAILABLE:
        logger.warning("Pillow (PIL) library not installed. Cannot get image dimensions.")
        return None

    img_path = Path(image_path)
    if not img_path.is_file():
        logger.error(f"Image file not found or is not a file: {image_path}")
        return None

    try:
        with Image.open(img_path) as img:
            width, height = img.size
            logger.debug(f"Dimensions for {image_path}: {width}x{height}")
            return width, height
    except FileNotFoundError:
        logger.error(f"Image file not found at path: {image_path}")
        return None
    except UnidentifiedImageError: # Specific Pillow error for invalid images
         logger.error(f"Could not identify image file (invalid format or corrupted): {image_path}")
         return None
    except Exception as e:
        logger.error(f"Error getting dimensions for image {image_path}: {e}", exc_info=True)
        return None