Auto-sync from website-creator
This commit is contained in:
57
skills/image-analyze/SKILL.md
Normal file
57
skills/image-analyze/SKILL.md
Normal file
@@ -0,0 +1,57 @@
|
||||
---
|
||||
name: image-analyze
|
||||
description: Analyze images using vision AI when the current model doesn't support image input. Use this skill when you need to understand, describe, or extract information from images.
|
||||
---
|
||||
|
||||
# Image Analyze
|
||||
|
||||
Analyze images with vision AI via `python3 scripts/analyze_image.py <image_path> [prompt]`.
|
||||
|
||||
## Commands
|
||||
|
||||
| Command | Args | Description |
|
||||
|---------|------|-------------|
|
||||
| `analyze` | `<image_path> [prompt]` | Analyze image with optional custom prompt |
|
||||
|
||||
## Options
|
||||
|
||||
| Option | Default | Description |
|
||||
|--------|---------|-------------|
|
||||
| `--max-tokens` | 1024 | Maximum tokens in response |
|
||||
| `--temperature` | 0.7 | Response creativity (0-2) |
|
||||
| `--model` | moonshotai/Kimi-K2.5-TEE | Vision model to use |
|
||||
|
||||
## Examples
|
||||
|
||||
```bash
|
||||
# Basic analysis
|
||||
python3 scripts/analyze_image.py photo.jpg
|
||||
|
||||
# With custom prompt
|
||||
python3 scripts/analyze_image.py diagram.png "Extract all text and explain the workflow"
|
||||
|
||||
# Detailed analysis
|
||||
python3 scripts/analyze_image.py screenshot.png "Describe all UI elements and their positions"
|
||||
|
||||
# OCR-like extraction
|
||||
python3 scripts/analyze_image.py document.jpg "Transcribe all text exactly as shown"
|
||||
```
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Provide image path (PNG, JPG, JPEG, GIF, WEBP, BMP)
|
||||
2. Optionally provide custom analysis prompt
|
||||
3. Script converts image to base64 and sends to vision API
|
||||
4. Returns detailed analysis text
|
||||
|
||||
## Output Format
|
||||
|
||||
- Success: Analysis text directly
|
||||
- Error: `Error: message` (to stderr)
|
||||
|
||||
## Notes
|
||||
|
||||
- Requires `CHUTES_API_TOKEN` in environment
|
||||
- Uses Kimi-K2.5-TEE vision model via Chutes AI
|
||||
- Supports common image formats
|
||||
- Best for: image description, OCR, UI analysis, diagram interpretation
|
||||
7
skills/image-analyze/scripts/.env.example
Normal file
7
skills/image-analyze/scripts/.env.example
Normal file
@@ -0,0 +1,7 @@
|
||||
# Chutes AI API Token
|
||||
# Same token as image-generation and image-edit skills
|
||||
# Get your token from your Chutes AI account
|
||||
#
|
||||
# WARNING: Never commit actual credentials!
|
||||
|
||||
CHUTES_API_TOKEN=your_chutes_api_token_here
|
||||
146
skills/image-analyze/scripts/analyze_image.py
Executable file
146
skills/image-analyze/scripts/analyze_image.py
Executable file
@@ -0,0 +1,146 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import base64
|
||||
from pathlib import Path
|
||||
import requests
|
||||
|
||||
|
||||
def load_env():
    """Populate os.environ from a .env file located beside this script.

    Each non-blank, non-comment line of the form KEY=VALUE is applied via
    setdefault, so variables already present in the environment are never
    overwritten. Surrounding single or double quotes on values are stripped.
    Silently does nothing when no .env file exists.
    """
    dotenv = Path(__file__).parent / ".env"
    if not dotenv.exists():
        return
    for raw in dotenv.read_text().splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        key, _, value = entry.partition("=")
        os.environ.setdefault(key.strip(), value.strip().strip("\"'"))


load_env()

# API configuration. The token must come from the environment (or the .env
# file loaded above); the model can be overridden per call via --model.
API_TOKEN = os.environ.get("CHUTES_API_TOKEN")
API_URL = "https://llm.chutes.ai/v1/chat/completions"
DEFAULT_MODEL = "moonshotai/Kimi-K2.5-TEE"
|
||||
|
||||
|
||||
def image_to_base64_url(image_path):
    """Return the image at *image_path* encoded as a base64 data: URL.

    The MIME type is inferred from the file extension (png/jpg/jpeg/gif/
    webp/bmp); any unrecognized extension falls back to image/jpeg.

    Raises:
        FileNotFoundError: if *image_path* does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    extension_to_mime = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }
    mime_type = extension_to_mime.get(
        Path(image_path).suffix.lower(), "image/jpeg"
    )

    payload = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")
    return f"data:{mime_type};base64,{payload}"
|
||||
|
||||
|
||||
def analyze_image(
    image_path,
    prompt="Analyze this image in detail. Describe what you see, including objects, people, text, colors, composition, and any relevant context.",
    max_tokens=1024,
    temperature=0.7,
    model=None,
):
    """Send an image plus prompt to the Chutes vision API and print the reply.

    Args:
        image_path: Path to a local image file (see image_to_base64_url for
            the supported formats).
        prompt: Text instruction sent alongside the image.
        max_tokens: Upper bound on tokens in the model response.
        temperature: Sampling temperature forwarded to the API.
        model: Vision model identifier; falls back to DEFAULT_MODEL when None.

    On success the model's text answer is printed to stdout. On any failure
    (missing token, missing file, HTTP failure, malformed response) an error
    message goes to stderr and the process exits with status 1.
    """
    if not API_TOKEN:
        print("Error: CHUTES_API_TOKEN not set in environment", file=sys.stderr)
        sys.exit(1)

    if not os.path.exists(image_path):
        print(f"Error: Image file not found: {image_path}", file=sys.stderr)
        sys.exit(1)

    # Build the OpenAI-compatible chat payload: one user message carrying
    # both the text prompt and the base64-encoded image.
    request_body = {
        "model": model or DEFAULT_MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_to_base64_url(image_path)},
                    },
                ],
            }
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False,
    }

    try:
        response = requests.post(
            API_URL,
            headers={
                "Authorization": f"Bearer {API_TOKEN}",
                "Content-Type": "application/json",
            },
            json=request_body,
            timeout=120,
        )
        response.raise_for_status()
        result = response.json()

        if "choices" in result and len(result["choices"]) > 0:
            content = result["choices"][0].get("message", {}).get("content", "")
            if not content:
                print("Error: No content in response", file=sys.stderr)
                sys.exit(1)
            print(content)
        else:
            print("Error: Invalid response format", file=sys.stderr)
            sys.exit(1)

    except requests.exceptions.RequestException as e:
        print(f"Error: API request failed - {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Catch-all for unexpected failures (e.g. non-JSON body); SystemExit
        # from the branches above is a BaseException and passes through.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse command-line arguments and run the analysis."""
    parser = argparse.ArgumentParser(description="Analyze images with vision AI")
    parser.add_argument("image_path", help="Path to image file")
    parser.add_argument("prompt", nargs="?", default="", help="Custom analysis prompt")
    parser.add_argument(
        "--max-tokens", type=int, default=1024, help="Max tokens in response"
    )
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Response creativity (0-2)"
    )
    parser.add_argument("--model", type=str, default=None, help="Vision model to use")
    args = parser.parse_args()

    # An empty positional prompt means "use the generic descriptive prompt".
    chosen_prompt = (
        args.prompt
        or "Analyze this image in detail. Describe what you see, including objects, people, text, colors, composition, and any relevant context."
    )

    analyze_image(
        image_path=args.image_path,
        prompt=chosen_prompt,
        max_tokens=args.max_tokens,
        temperature=args.temperature,
        model=args.model,
    )


if __name__ == "__main__":
    main()
|
||||
1
skills/image-analyze/scripts/requirements.txt
Normal file
1
skills/image-analyze/scripts/requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
requests>=2.28.0
|
||||
Reference in New Issue
Block a user