New tools added to ToBeMigrated/ directory: ai_marketing_tools/: - ai_backlinker: AI-powered backlink generation - ai_google_ads_generator: Google Ads generation with templates ai_writers/: - ai_blog_faqs_writer: FAQ generation for blogs - ai_copywriter: Multiple copywriter frameworks (AIDA, PAS, 4C, 4R, etc.) - ai_finance_report_generator: Financial report generation - ai_story_illustrator: Story illustration - ai_story_video_generator: Story video generation - ai_story_writer: AI story writing - github_blogs: GitHub blog integration - speech_to_blog: Audio to blog conversion - twitter_writers: Twitter/X content generation - youtube_writers: YouTube content generation These tools are in ToBeMigrated/ for future migration to the main backend.
450 lines
15 KiB
Python
450 lines
15 KiB
Python
"""
|
|
Utility functions for the AI Story Illustrator module.
|
|
|
|
This module provides helper functions for file operations, string manipulation,
|
|
and simple text analysis relevant to story processing.
|
|
"""
|
|
|
|
import logging
import os
import re
import shutil
import tempfile
import uuid
from collections import Counter
from pathlib import Path
from typing import List, Tuple, Optional, Union
|
|
|
|
# Attempt to import Pillow for image dimensions, but don't fail if it is not
# installed: get_image_dimensions() then logs a warning and returns None.
|
|
try:
|
|
from PIL import Image
|
|
_PIL_AVAILABLE = True
|
|
except ImportError:
|
|
_PIL_AVAILABLE = False
|
|
|
|
# Configure logging
|
|
# Root logger setup: INFO threshold and a
# "timestamp - logger name - level - message" line format.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-wide logger used by the helpers in this file.
logger = logging.getLogger("story_illustrator_utils")
|
|
|
|
# --- Constants ---
|
|
# Lowercased extensions (leading dot included) treated as image files.
IMAGE_EXTENSIONS = frozenset({
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp',
})

# Lowercased extensions (leading dot included) treated as plain-text files.
TEXT_EXTENSIONS = frozenset({'.txt', '.md', '.text'})

# Capitalized English words that frequently open a sentence; the simple
# character-name detection skips them so they are not mistaken for names.
COMMON_START_WORDS = frozenset({
    # Articles and conjunctions
    'The', 'A', 'An', 'And', 'But', 'Or', 'For', 'Nor', 'So', 'Yet',
    # Pronouns
    'He', 'She', 'It', 'They', 'We', 'You', 'I',
    # Prepositions
    'In', 'On', 'At', 'To', 'From', 'With', 'About', 'As',
    # Auxiliary and modal verbs
    'Is', 'Was', 'Were', 'Be', 'Been', 'Being', 'Have', 'Has', 'Had',
    'Do', 'Does', 'Did', 'Will', 'Would', 'Shall', 'Should', 'May',
    'Might', 'Must', 'Can', 'Could',
})
|
|
|
|
|
|
# --- File/Directory Operations ---
|
|
|
|
def create_temp_directory(prefix: str = "story_illustrator_") -> str:
    """
    Create a new temporary directory via ``tempfile.mkdtemp``.

    Args:
        prefix: Text placed at the start of the generated directory name.

    Returns:
        The path of the newly created directory, as returned by ``mkdtemp``.

    Raises:
        Exception: Any error raised while creating the directory is logged
            and then propagated unchanged.
    """
    try:
        created_path = tempfile.mkdtemp(prefix=prefix)
        logger.info(f"Created temporary directory: {created_path}")
    except Exception as exc:
        logger.error(f"Failed to create temporary directory: {exc}", exc_info=True)
        raise  # Callers still see the original failure after it is logged
    return created_path
|
|
|
|
|
|
def sanitize_filename(filename: str) -> str:
    """
    Turn an arbitrary string into a name that is safe to use as a filename.

    The clean-up runs in three steps:
      1. Each of ``\\ / * ? : " < > | '`` becomes an underscore.
      2. Every run of underscores and/or spaces collapses to one underscore.
      3. Leading and trailing dots, underscores and spaces are stripped.

    No length limit is applied to the result.

    Args:
        filename: The original filename. Non-string values are converted
            with ``str()`` after a warning is logged.

    Returns:
        The sanitized filename, or ``"unnamed_file"`` when nothing is left
        after sanitization.
    """
    if not isinstance(filename, str):
        logger.warning("sanitize_filename received non-string input, converting.")
        filename = str(filename)

    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        (r'[\\/*?:"<>|\']', "_"),  # characters rejected by Windows/Unix filesystems
        (r'[_ ]+', '_'),           # squeeze separator runs into a single underscore
    )
    cleaned = filename
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip("._ ")

    if cleaned:
        return cleaned

    # Everything was stripped away; fall back to a fixed placeholder name.
    logger.warning("Filename was empty after sanitization, using default.")
    return "unnamed_file"
|
|
|
|
|
|
def get_temp_file_path(
    directory: str, prefix: str = "file_", suffix: str = ".tmp"
) -> str:
    """
    Build a unique file path inside ``directory``.

    Only the path string is produced; nothing is created on disk.

    Args:
        directory: Directory the path should point into.
        prefix: Text placed before the random part of the filename.
        suffix: Extension placed after the random part. A leading dot is
            added when a non-empty suffix does not already start with one.

    Returns:
        ``directory`` joined with ``<prefix><12 hex characters><suffix>``.
    """
    # An empty suffix is kept as-is; anything else is normalised to ".ext".
    needs_dot = bool(suffix) and not suffix.startswith(".")
    extension = "." + suffix if needs_dot else suffix

    # First 12 hex digits of a random UUID make the name unique.
    token = uuid.uuid4().hex[:12]
    return os.path.join(directory, f"{prefix}{token}{extension}")
|
|
|
|
|
|
def ensure_directory_exists(directory: Union[str, Path]) -> str:
    """
    Make sure ``directory`` exists, creating missing parents along the way.

    Args:
        directory: Directory path, given as a string or ``Path``.

    Returns:
        The resolved directory path as a string.

    Raises:
        OSError: If the directory cannot be created (e.g., permission
            issues). The error is logged before being re-raised.
    """
    resolved = Path(directory).resolve()
    try:
        # exist_ok=True makes the call a no-op for an already existing directory.
        resolved.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logger.error(f"Failed to create or access directory {resolved}: {exc}", exc_info=True)
        raise
    return str(resolved)
|
|
|
|
|
|
def cleanup_directory(directory: Union[str, Path]) -> None:
    """
    Delete a directory tree, logging problems instead of raising them.

    A missing path is skipped with a debug message, a path that is not a
    directory is skipped with a warning, and any error raised during the
    removal is logged and swallowed.

    Args:
        directory: Directory to remove, given as a string or ``Path``.
    """
    target = Path(directory)

    if not target.exists():
        logger.debug(f"Cleanup skipped: Directory '{directory}' does not exist.")
    elif not target.is_dir():
        logger.warning(f"Cleanup warning: Path '{directory}' is not a directory.")
    else:
        try:
            shutil.rmtree(target)
            logger.info(f"Successfully removed directory: {directory}")
        except OSError as exc:
            logger.error(f"Error removing directory {directory}: {exc}", exc_info=True)
        except Exception as exc:
            # Best-effort cleanup: anything unexpected is logged as well.
            logger.error(
                f"Unexpected error removing directory {directory}: {exc}", exc_info=True
            )
|
|
|
|
|
|
# --- File Type Checks ---
|
|
|
|
def get_file_extension(file_path: Union[str, Path]) -> str:
    """
    Return the file's extension in lowercase, leading dot included.

    Args:
        file_path: Path to the file, given as a string or ``Path``.

    Returns:
        The final suffix of the path (e.g., '.txt', '.png'), or an empty
        string when the path has no extension.
    """
    suffix = Path(file_path).suffix
    return suffix.lower()
|
|
|
|
|
|
def is_image_file(file_path: Union[str, Path]) -> bool:
    """
    Tell whether a path looks like an image, judging by extension only.

    The file contents are never inspected.

    Args:
        file_path: Path to the file, given as a string or ``Path``.

    Returns:
        True when the lowercased extension is listed in IMAGE_EXTENSIONS,
        False otherwise.
    """
    extension = get_file_extension(file_path)
    return extension in IMAGE_EXTENSIONS
|
|
|
|
|
|
def is_text_file(file_path: Union[str, Path]) -> bool:
    """
    Tell whether a path looks like a text file, judging by extension only.

    The file contents are never inspected.

    Args:
        file_path: Path to the file, given as a string or ``Path``.

    Returns:
        True when the lowercased extension is listed in TEXT_EXTENSIONS,
        False otherwise.
    """
    extension = get_file_extension(file_path)
    return extension in TEXT_EXTENSIONS
|
|
|
|
|
|
# --- Text Analysis (Simple Heuristics) ---
|
|
|
|
def extract_story_title_from_text(text: str) -> str:
    """
    Attempts to extract a title from story text using simple heuristics.

    Looks for patterns (in order):
    1. The first markdown header (#, ##, etc.) found at the start of any
       line; the header text must be on the same line as the '#' marks.
    2. The first line if it's short (< 100 chars) and followed by
       a blank line or is the only line.
    3. The first line if it's short (< 100 chars), contains at least one
       letter, and has no lowercase letters (e.g. "THE GREAT ADVENTURE").

    Args:
        text: The story text content.

    Returns:
        An extracted title string, or "Untitled Story" if no pattern matches
        or the input is empty / not a string.
    """
    default_title = "Untitled Story"
    if not isinstance(text, str) or not text.strip():
        return default_title

    stripped = text.strip()

    # 1. Markdown header. Only spaces/tabs are allowed around the '#' marks so
    #    that an empty header line cannot swallow the following line as title.
    header_match = re.search(r'^[ \t]*#+[ \t]+(.+)$', stripped, re.MULTILINE)
    if header_match:
        title = header_match.group(1).strip()
        if title:
            return title

    # The text was stripped and is non-empty, so the first line always
    # contains at least one non-whitespace character.
    lines = stripped.split('\n')
    first_line = lines[0].strip()

    is_short = len(first_line) < 100
    if not is_short:
        return default_title

    # 2. Short first line standing on its own.
    is_only_line = len(lines) == 1
    is_followed_by_blank = len(lines) > 1 and not lines[1].strip()
    if is_only_line or is_followed_by_blank:
        return first_line

    # 3. Short first line written in capitals. Spaces, digits and punctuation
    #    are allowed as long as there is at least one letter in the line.
    has_letter = any(char.isalpha() for char in first_line)
    if has_letter and first_line == first_line.upper():
        return first_line

    return default_title
|
|
|
|
|
|
def estimate_reading_time(text: str, words_per_minute: int = 200) -> float:
    """
    Estimate how many minutes it takes to read ``text``.

    Words are counted by splitting the text on whitespace.

    Args:
        text: The text content.
        words_per_minute: Assumed reading speed.

    Returns:
        Word count divided by ``words_per_minute``. Empty, whitespace-only
        or non-string input yields 0.0.

    Raises:
        ValueError: If ``words_per_minute`` is zero or negative. The check
            is only reached for non-empty text.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if not has_content:
        return 0.0
    if words_per_minute <= 0:
        raise ValueError("words_per_minute must be positive.")

    return len(text.split()) / words_per_minute
|
|
|
|
|
|
def count_sentences(text: str) -> int:
    """
    Estimate the number of sentences in a text with a very simple heuristic.

    Note: Every run of sentence-ending punctuation (. ! ?) counts as one
    sentence, so "?!" or "..." count once. The result is inaccurate for
    abbreviations (Mr., Mrs., etc.), ellipses, and complex sentence
    structures.

    Args:
        text: The text content.

    Returns:
        The number of punctuation runs found, or 1 when the text has
        content but no terminator at all. Empty, whitespace-only or
        non-string input yields 0.
    """
    if not (isinstance(text, str) and text.strip()):
        return 0

    terminator_runs = sum(1 for _ in re.finditer(r'[.!?]+', text))

    # Non-empty text without any terminator still counts as one sentence.
    return terminator_runs or 1
|
|
|
|
|
|
def extract_character_names(text: str, min_occurrences: int = 2) -> List[str]:
    """
    Attempts to extract potential character names from story text.

    Note: This is a simple heuristic based on finding capitalized words
    (excluding the entries of COMMON_START_WORDS) that appear multiple
    times. It has limitations and may produce false positives or miss
    actual names; only words made of ASCII letters are considered.

    Args:
        text: The story text content.
        min_occurrences: The minimum number of times a capitalized word must
            appear to be considered a potential name. Values below 1 are
            treated as 1.

    Returns:
        An alphabetically sorted list of potential character name strings.
        Empty, whitespace-only or non-string input yields an empty list.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    min_occurrences = max(min_occurrences, 1)

    # Words starting with an uppercase ASCII letter; single-letter words such
    # as 'X' are included.
    capitalized_words = re.findall(r'\b[A-Z][a-zA-Z]*\b', text)

    # Tally every candidate that is not a common sentence opener.
    word_counts = Counter(
        word for word in capitalized_words if word not in COMMON_START_WORDS
    )

    # Keep the frequent ones; sorting makes the output deterministic.
    return sorted(
        word for word, count in word_counts.items() if count >= min_occurrences
    )
|
|
|
|
|
|
# Pre-compiled heuristics used by extract_setting_details. Each pattern has
# exactly one capture group holding the candidate setting phrase.
_SETTING_PATTERNS = (
    # Preposition + article + up to three words, e.g., "in the old house".
    re.compile(
        r'\b(?:in|on|at|near|beside|inside|outside|under|over|through)'
        r'\s+(?:the|a|an)\s+(\w+(?:\s+\w+){0,2})\b',
        re.IGNORECASE,
    ),
    # Preposition + run of capitalized words, e.g., "in New York City".
    # Only the preposition is matched case-insensitively: ignoring case for
    # the whole pattern would turn [A-Z] into "any letter" and make the
    # pattern capture arbitrary lowercase phrases.
    re.compile(r'\b(?i:in|on|at)\s+([A-Z]\w+(?:\s+[A-Z]\w+)*)\b'),
    # Temporal preposition + article + up to three words, e.g., "during the storm".
    re.compile(
        r'\b(?:during|before|after)\s+(?:the|a|an)\s+(\w+(?:\s+\w+){0,2})\b',
        re.IGNORECASE,
    ),
)


def extract_setting_details(text: str) -> List[str]:
    """
    Attempts to extract potential setting details using simple regex patterns.

    Note: This is a very basic heuristic looking for common prepositional
    phrases (e.g., "in the forest", "at the castle", "in New York City").
    It is highly limited and likely to miss many setting details or extract
    irrelevant phrases.

    Args:
        text: The story text content.

    Returns:
        A sorted list of the distinct phrases found (preposition and article
        excluded). Phrases longer than five words are discarded. Empty,
        whitespace-only or non-string input yields an empty list.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    settings_found = set()  # A set drops duplicate phrases
    for pattern in _SETTING_PATTERNS:
        # With a single capture group, findall yields plain strings.
        for match in pattern.findall(text):
            phrase = match.strip()
            if phrase and len(phrase.split()) <= 5:  # Limit phrase length
                settings_found.add(phrase)

    # Sorted for a deterministic result
    return sorted(settings_found)
|
|
|
|
|
|
# --- Image Operations ---
|
|
|
|
def get_image_dimensions(image_path: Union[str, Path]) -> Optional[Tuple[int, int]]:
    """
    Gets the (width, height) dimensions of an image file using Pillow.

    Args:
        image_path: The path to the image file (string or Path object).

    Returns:
        A tuple (width, height) if successful, or None if the file is not
        a valid image, Pillow is not installed, or an error occurs. Every
        failure is logged; none is raised to the caller.
    """
    if not _PIL_AVAILABLE:
        logger.warning("Pillow (PIL) library not installed. Cannot get image dimensions.")
        return None

    # The module-level import only brings in ``Image``, so the exception class
    # is resolved here; without it the ``except`` clause below would raise a
    # NameError for unreadable images.
    try:
        from PIL import UnidentifiedImageError
    except ImportError:
        # NOTE(review): assumes Pillow releases lacking this class signal an
        # unidentifiable image with a plain OSError -- confirm if such old
        # versions need to be supported.
        UnidentifiedImageError = OSError

    img_path = Path(image_path)
    if not img_path.is_file():
        logger.error(f"Image file not found or is not a file: {image_path}")
        return None

    try:
        with Image.open(img_path) as img:
            width, height = img.size
            logger.debug(f"Dimensions for {image_path}: {width}x{height}")
            return width, height
    except FileNotFoundError:
        # The file can still vanish between the is_file() check and open().
        logger.error(f"Image file not found at path: {image_path}")
        return None
    except UnidentifiedImageError:  # Specific Pillow error for invalid images
        logger.error(f"Could not identify image file (invalid format or corrupted): {image_path}")
        return None
    except Exception as e:
        logger.error(f"Error getting dimensions for image {image_path}: {e}", exc_info=True)
        return None